From be4afa0bb4384f201c8fe68af536faffefbae661 Mon Sep 17 00:00:00 2001 From: Mark Van Aken <1727697+vanakema@users.noreply.github.com> Date: Fri, 10 May 2024 12:53:31 -0700 Subject: [PATCH] #7535 Update FloatTensor type hints to Tensor (#7883) * find & replace all FloatTensors to Tensor * apply formatting * Update torch.FloatTensor to torch.Tensor in the remaining files * formatting * Fix the rest of the places where FloatTensor is used as well as in documentation * formatting * Update new file from FloatTensor to Tensor --- docs/README.md | 6 +- docs/source/en/optimization/memory.md | 2 +- docs/source/ko/optimization/fp16.md | 2 +- examples/community/bit_diffusion.py | 16 +- ...p_guided_images_mixing_stable_diffusion.py | 4 +- .../community/clip_guided_stable_diffusion.py | 2 +- .../clip_guided_stable_diffusion_img2img.py | 4 +- .../community/composable_stable_diffusion.py | 8 +- .../ddim_noise_comparative_analysis.py | 4 +- examples/community/gluegen.py | 28 +-- examples/community/hd_painter.py | 24 +- examples/community/iadb.py | 20 +- examples/community/imagic_stable_diffusion.py | 4 +- examples/community/img2img_inpainting.py | 14 +- examples/community/instaflow_one_step.py | 28 +-- .../community/interpolate_stable_diffusion.py | 12 +- examples/community/ip_adapter_face_id.py | 32 +-- .../community/latent_consistency_img2img.py | 48 ++-- .../latent_consistency_interpolate.py | 32 +-- .../community/latent_consistency_txt2img.py | 48 ++-- examples/community/llm_grounded_diffusion.py | 30 +-- examples/community/lpw_stable_diffusion.py | 82 +++---- examples/community/lpw_stable_diffusion_xl.py | 72 +++--- .../masked_stable_diffusion_img2img.py | 24 +- examples/community/mixture_canvas.py | 2 +- .../multilingual_stable_diffusion.py | 8 +- .../pipeline_animatediff_controlnet.py | 24 +- .../pipeline_animatediff_img2video.py | 28 +-- .../community/pipeline_demofusion_sdxl.py | 40 ++-- examples/community/pipeline_fabric.py | 10 +- examples/community/pipeline_prompt2prompt.py | 24 +- .../community/pipeline_sdxl_style_aligned.py | 40 ++-- .../pipeline_stable_diffusion_pag.py | 50 ++-- ...pipeline_stable_diffusion_upscale_ldm3d.py | 30 +-- ..._stable_diffusion_xl_controlnet_adapter.py | 50 ++-- ...diffusion_xl_controlnet_adapter_inpaint.py | 50 ++-- ...table_diffusion_xl_differential_img2img.py | 58 ++--- ...e_stable_diffusion_xl_instandid_img2img.py | 30 +-- .../pipeline_stable_diffusion_xl_instantid.py | 30 +-- .../pipeline_stable_diffusion_xl_ipex.py | 48 ++-- examples/community/pipeline_zero1to3.py | 30 +-- .../regional_prompting_stable_diffusion.py | 12 +- examples/community/rerender_a_video.py | 28 +-- examples/community/run_onnx_controlnet.py | 34 +-- examples/community/run_tensorrt_controlnet.py | 34 +-- examples/community/scheduling_ufogen.py | 38 ++- examples/community/sd_text2img_k_diffusion.py | 8 +- .../community/seed_resize_stable_diffusion.py | 10 +- .../community/speech_to_image_diffusion.py | 4 +- .../community/stable_diffusion_comparison.py | 22 +- .../stable_diffusion_controlnet_img2img.py | 30 +-- .../stable_diffusion_controlnet_inpaint.py | 30 +-- ...le_diffusion_controlnet_inpaint_img2img.py | 30 +-- .../stable_diffusion_controlnet_reference.py | 62 ++--- examples/community/stable_diffusion_ipex.py | 24 +- examples/community/stable_diffusion_mega.py | 14 +- .../community/stable_diffusion_reference.py | 102 ++++---- .../community/stable_diffusion_repaint.py | 28 +-- .../stable_diffusion_tensorrt_img2img.py | 2 +- .../stable_diffusion_tensorrt_inpaint.py | 4 +- 
.../stable_diffusion_xl_reference.py | 44 ++-- examples/community/stable_unclip.py | 2 +- examples/community/text_inpainting.py | 10 +- examples/community/tiled_upscaling.py | 8 +- .../community/unclip_image_interpolation.py | 20 +- .../community/wildcard_stable_diffusion.py | 8 +- .../train_lcm_distill_lora_sd_wds.py | 2 +- .../train_lcm_distill_sd_wds.py | 2 +- .../train_lcm_distill_sdxl_wds.py | 2 +- .../train_cm_ct_unconditional.py | 6 +- .../geodiff_molecule_conformation.ipynb | 8 +- .../pipeline_prompt_diffusion.py | 34 +-- .../promptdiffusioncontrolnet.py | 14 +- .../research_projects/rdm/pipeline_rdm.py | 12 +- examples/research_projects/rdm/retriever.py | 4 +- .../convert_music_spectrogram_to_diffusers.py | 102 ++++---- src/diffusers/models/attention.py | 16 +- src/diffusers/models/attention_processor.py | 142 +++++------ .../autoencoders/autoencoder_asym_kl.py | 30 ++- .../models/autoencoders/autoencoder_kl.py | 26 +-- .../autoencoder_kl_temporal_decoder.py | 22 +- .../models/autoencoders/autoencoder_tiny.py | 30 ++- .../autoencoders/consistency_decoder_vae.py | 24 +- src/diffusers/models/autoencoders/vae.py | 40 ++-- src/diffusers/models/controlnet.py | 10 +- src/diffusers/models/controlnet_xs.py | 56 ++--- src/diffusers/models/downsampling.py | 32 +-- src/diffusers/models/embeddings.py | 16 +- src/diffusers/models/resnet.py | 16 +- .../transformers/dual_transformer_2d.py | 5 +- .../models/transformers/prior_transformer.py | 14 +- .../transformers/t5_film_transformer.py | 36 ++- .../models/transformers/transformer_2d.py | 8 +- .../transformers/transformer_temporal.py | 10 +- src/diffusers/models/unets/unet_1d.py | 10 +- src/diffusers/models/unets/unet_1d_blocks.py | 58 ++--- src/diffusers/models/unets/unet_2d.py | 12 +- src/diffusers/models/unets/unet_2d_blocks.py | 220 +++++++++--------- .../models/unets/unet_2d_condition.py | 12 +- src/diffusers/models/unets/unet_3d_blocks.py | 148 ++++++------ .../models/unets/unet_3d_condition.py | 14 +- src/diffusers/models/unets/unet_i2vgen_xl.py | 18 +- src/diffusers/models/unets/unet_kandinsky3.py | 2 +- .../models/unets/unet_motion_model.py | 8 +- .../unets/unet_spatio_temporal_condition.py | 14 +- .../models/unets/unet_stable_cascade.py | 2 +- src/diffusers/models/upsampling.py | 34 ++- src/diffusers/models/vq_model.py | 16 +- .../pipelines/amused/pipeline_amused.py | 12 +- .../amused/pipeline_amused_img2img.py | 14 +- .../amused/pipeline_amused_inpaint.py | 16 +- .../animatediff/pipeline_animatediff.py | 27 ++- .../animatediff/pipeline_animatediff_sdxl.py | 44 ++-- .../pipeline_animatediff_video2video.py | 27 ++- .../pipelines/audioldm/pipeline_audioldm.py | 24 +- .../pipelines/audioldm2/modeling_audioldm2.py | 64 ++--- .../pipelines/audioldm2/pipeline_audioldm2.py | 48 ++-- .../blip_diffusion/blip_image_processing.py | 2 +- .../blip_diffusion/modeling_blip2.py | 10 +- .../blip_diffusion/modeling_ctx_clip.py | 2 +- .../blip_diffusion/pipeline_blip_diffusion.py | 4 +- .../pipeline_consistency_models.py | 10 +- .../pipelines/controlnet/multicontrolnet.py | 2 +- .../controlnet/pipeline_controlnet.py | 48 ++-- .../pipeline_controlnet_blip_diffusion.py | 4 +- .../controlnet/pipeline_controlnet_img2img.py | 46 ++-- .../controlnet/pipeline_controlnet_inpaint.py | 46 ++-- .../pipeline_controlnet_inpaint_sd_xl.py | 40 ++-- .../controlnet/pipeline_controlnet_sd_xl.py | 58 ++--- .../pipeline_controlnet_sd_xl_img2img.py | 58 ++--- .../controlnet_xs/pipeline_controlnet_xs.py | 38 +-- .../pipeline_controlnet_xs_sd_xl.py | 50 ++-- 
.../pipelines/deepfloyd_if/pipeline_if.py | 20 +- .../deepfloyd_if/pipeline_if_img2img.py | 24 +- .../pipeline_if_img2img_superresolution.py | 30 +-- .../deepfloyd_if/pipeline_if_inpainting.py | 26 +-- .../pipeline_if_inpainting_superresolution.py | 32 +-- .../pipeline_if_superresolution.py | 26 +-- .../alt_diffusion/modeling_roberta_series.py | 22 +- .../alt_diffusion/pipeline_alt_diffusion.py | 26 +-- .../pipeline_alt_diffusion_img2img.py | 24 +- .../deprecated/repaint/pipeline_repaint.py | 4 +- .../pipeline_spectrogram_diffusion.py | 4 +- .../pipeline_cycle_diffusion.py | 24 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 36 +-- ...pipeline_stable_diffusion_model_editing.py | 28 +-- .../pipeline_stable_diffusion_paradigms.py | 28 +-- .../pipeline_stable_diffusion_pix2pix_zero.py | 48 ++-- .../versatile_diffusion/modeling_text_unet.py | 78 +++---- .../pipeline_versatile_diffusion.py | 26 +-- ...ipeline_versatile_diffusion_dual_guided.py | 8 +- ...ine_versatile_diffusion_image_variation.py | 10 +- ...eline_versatile_diffusion_text_to_image.py | 8 +- .../vq_diffusion/pipeline_vq_diffusion.py | 10 +- .../pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 24 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 16 +- .../kandinsky/pipeline_kandinsky_combined.py | 34 +-- .../kandinsky/pipeline_kandinsky_img2img.py | 22 +- .../kandinsky/pipeline_kandinsky_inpaint.py | 24 +- .../kandinsky/pipeline_kandinsky_prior.py | 18 +- .../kandinsky2_2/pipeline_kandinsky2_2.py | 12 +- .../pipeline_kandinsky2_2_combined.py | 28 +-- .../pipeline_kandinsky2_2_controlnet.py | 20 +- ...ipeline_kandinsky2_2_controlnet_img2img.py | 20 +- .../pipeline_kandinsky2_2_img2img.py | 12 +- .../pipeline_kandinsky2_2_inpainting.py | 16 +- .../pipeline_kandinsky2_2_prior.py | 12 +- .../pipeline_kandinsky2_2_prior_emb2emb.py | 10 +- .../kandinsky3/pipeline_kandinsky3.py | 34 +-- .../kandinsky3/pipeline_kandinsky3_img2img.py | 36 +-- .../pipeline_latent_consistency_img2img.py | 26 +-- .../pipeline_latent_consistency_text2img.py | 26 +-- .../pipeline_latent_diffusion.py | 22 +- .../pipeline_leditspp_stable_diffusion.py | 16 +- .../pipeline_leditspp_stable_diffusion_xl.py | 32 +-- .../pipelines/musicldm/pipeline_musicldm.py | 24 +- .../pipeline_paint_by_example.py | 22 +- src/diffusers/pipelines/pia/pipeline_pia.py | 27 ++- .../pixart_alpha/pipeline_pixart_alpha.py | 36 +-- .../pixart_alpha/pipeline_pixart_sigma.py | 36 +-- .../pipeline_semantic_stable_diffusion.py | 8 +- .../pipelines/shap_e/pipeline_shap_e.py | 6 +- .../shap_e/pipeline_shap_e_img2img.py | 8 +- .../stable_cascade/pipeline_stable_cascade.py | 32 +-- .../pipeline_stable_cascade_combined.py | 20 +- .../pipeline_stable_cascade_prior.py | 48 ++-- .../pipeline_onnx_stable_diffusion.py | 4 +- .../pipeline_onnx_stable_diffusion_upscale.py | 2 +- .../pipeline_stable_diffusion.py | 32 +-- .../pipeline_stable_diffusion_depth2img.py | 26 +-- ...peline_stable_diffusion_image_variation.py | 14 +- .../pipeline_stable_diffusion_img2img.py | 30 +-- .../pipeline_stable_diffusion_inpaint.py | 38 +-- ...eline_stable_diffusion_instruct_pix2pix.py | 24 +- ...ipeline_stable_diffusion_latent_upscale.py | 10 +- .../pipeline_stable_diffusion_upscale.py | 30 +-- .../pipeline_stable_unclip.py | 34 +-- .../pipeline_stable_unclip_img2img.py | 40 ++-- .../stable_diffusion/safety_checker.py | 2 +- ...line_stable_diffusion_attend_and_excite.py | 28 +-- .../pipeline_stable_diffusion_diffedit.py | 70 +++--- .../pipeline_stable_diffusion_gligen.py | 28 +-- ...line_stable_diffusion_gligen_text_image.py 
| 24 +- .../pipeline_stable_diffusion_k_diffusion.py | 28 +-- ...ipeline_stable_diffusion_xl_k_diffusion.py | 36 +-- .../pipeline_stable_diffusion_ldm3d.py | 32 +-- .../pipeline_stable_diffusion_panorama.py | 32 +-- .../pipeline_stable_diffusion_safe.py | 8 +- .../stable_diffusion_safe/safety_checker.py | 2 +- .../pipeline_stable_diffusion_sag.py | 32 +-- .../pipeline_stable_diffusion_xl.py | 44 ++-- .../pipeline_stable_diffusion_xl_img2img.py | 46 ++-- .../pipeline_stable_diffusion_xl_inpaint.py | 46 ++-- ...ne_stable_diffusion_xl_instruct_pix2pix.py | 42 ++-- .../stable_diffusion_xl/watermark.py | 2 +- .../pipeline_stable_video_diffusion.py | 24 +- .../pipeline_stable_diffusion_adapter.py | 36 +-- .../pipeline_stable_diffusion_xl_adapter.py | 52 ++--- .../pipeline_text_to_video_synth.py | 30 +-- .../pipeline_text_to_video_synth_img2img.py | 34 +-- .../pipeline_text_to_video_zero.py | 18 +- .../pipeline_text_to_video_zero_sdxl.py | 42 ++-- .../pipelines/unclip/pipeline_unclip.py | 12 +- .../unclip/pipeline_unclip_image_variation.py | 12 +- .../unidiffuser/modeling_text_decoder.py | 2 +- .../pipelines/unidiffuser/modeling_uvit.py | 15 +- .../unidiffuser/pipeline_unidiffuser.py | 46 ++-- .../wuerstchen/modeling_paella_vq_model.py | 10 +- .../wuerstchen/pipeline_wuerstchen.py | 8 +- .../pipeline_wuerstchen_combined.py | 12 +- .../wuerstchen/pipeline_wuerstchen_prior.py | 20 +- .../deprecated/scheduling_karras_ve.py | 50 ++-- src/diffusers/schedulers/scheduling_amused.py | 10 +- .../scheduling_consistency_decoder.py | 20 +- .../scheduling_consistency_models.py | 34 ++- src/diffusers/schedulers/scheduling_ddim.py | 42 ++-- .../schedulers/scheduling_ddim_inverse.py | 28 +-- .../schedulers/scheduling_ddim_parallel.py | 54 +++-- src/diffusers/schedulers/scheduling_ddpm.py | 38 ++- .../schedulers/scheduling_ddpm_parallel.py | 50 ++-- .../schedulers/scheduling_ddpm_wuerstchen.py | 26 +-- .../schedulers/scheduling_deis_multistep.py | 72 +++--- .../scheduling_dpmsolver_multistep.py | 86 +++---- .../scheduling_dpmsolver_multistep_inverse.py | 80 +++---- .../schedulers/scheduling_dpmsolver_sde.py | 36 +-- .../scheduling_dpmsolver_singlestep.py | 84 +++---- .../scheduling_edm_dpmsolver_multistep.py | 82 ++++--- .../schedulers/scheduling_edm_euler.py | 38 ++- .../scheduling_euler_ancestral_discrete.py | 38 ++- .../schedulers/scheduling_euler_discrete.py | 44 ++-- .../schedulers/scheduling_heun_discrete.py | 30 +-- src/diffusers/schedulers/scheduling_ipndm.py | 14 +- .../scheduling_k_dpm_2_ancestral_discrete.py | 30 +-- .../schedulers/scheduling_k_dpm_2_discrete.py | 30 +-- .../schedulers/scheduling_karras_ve_flax.py | 12 +- src/diffusers/schedulers/scheduling_lcm.py | 40 ++-- .../schedulers/scheduling_lms_discrete.py | 40 ++-- src/diffusers/schedulers/scheduling_pndm.py | 36 +-- .../schedulers/scheduling_repaint.py | 36 +-- .../schedulers/scheduling_sasolver.py | 72 +++--- src/diffusers/schedulers/scheduling_sde_ve.py | 38 +-- src/diffusers/schedulers/scheduling_tcd.py | 40 ++-- src/diffusers/schedulers/scheduling_unclip.py | 28 +-- .../schedulers/scheduling_unipc_multistep.py | 68 +++--- src/diffusers/schedulers/scheduling_utils.py | 4 +- .../schedulers/scheduling_vq_diffusion.py | 20 +- tests/others/test_check_copies.py | 8 +- .../stable_diffusion/test_stable_diffusion.py | 2 +- .../test_stable_diffusion_img2img.py | 2 +- ...st_stable_diffusion_instruction_pix2pix.py | 2 +- .../test_stable_diffusion.py | 2 +- .../test_stable_diffusion_depth.py | 2 +- .../test_stable_diffusion_v_pred.py | 2 +- 
.../test_stable_diffusion_image_variation.py | 2 +- .../test_stable_diffusion_panorama.py | 2 +- 275 files changed, 3765 insertions(+), 3824 deletions(-) diff --git a/docs/README.md b/docs/README.md index e7aa8c4f68..f36b76fb07 100644 --- a/docs/README.md +++ b/docs/README.md @@ -242,10 +242,10 @@ Here's an example of a tuple return, comprising several objects: ``` Returns: - `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: - - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` -- + `tuple(torch.Tensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: + - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.Tensor` of shape `(1,)` -- Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- + - **prediction_scores** (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). ``` diff --git a/docs/source/en/optimization/memory.md b/docs/source/en/optimization/memory.md index 6b2a22b315..e3f4d2652d 100644 --- a/docs/source/en/optimization/memory.md +++ b/docs/source/en/optimization/memory.md @@ -261,7 +261,7 @@ from dataclasses import dataclass @dataclass class UNet2DConditionOutput: - sample: torch.FloatTensor + sample: torch.Tensor pipe = StableDiffusionPipeline.from_pretrained( diff --git a/docs/source/ko/optimization/fp16.md b/docs/source/ko/optimization/fp16.md index 2e58421c35..f7b2cf8095 100644 --- a/docs/source/ko/optimization/fp16.md +++ b/docs/source/ko/optimization/fp16.md @@ -339,7 +339,7 @@ from dataclasses import dataclass @dataclass class UNet2DConditionOutput: - sample: torch.FloatTensor + sample: torch.Tensor pipe = StableDiffusionPipeline.from_pretrained( diff --git a/examples/community/bit_diffusion.py b/examples/community/bit_diffusion.py index 18d5fca561..71d8f31163 100644 --- a/examples/community/bit_diffusion.py +++ b/examples/community/bit_diffusion.py @@ -44,9 +44,9 @@ def bits_to_decimal(x, bits=BITS): # modified scheduler step functions for clamping the predicted x_0 between -bit_scale and +bit_scale def ddim_bit_scheduler_step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = True, generator=None, @@ -56,9 +56,9 @@ def ddim_bit_scheduler_step( Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. 
use_clipped_model_output (`bool`): TODO @@ -134,9 +134,9 @@ def ddim_bit_scheduler_step( def ddpm_bit_scheduler_step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, prediction_type="epsilon", generator=None, return_dict: bool = True, @@ -145,9 +145,9 @@ def ddpm_bit_scheduler_step( Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. prediction_type (`str`, default `epsilon`): indicates whether the model predicts the noise (epsilon), or the samples (`sample`). diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py index 16dcecd7b2..75b7df84dc 100644 --- a/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -233,8 +233,8 @@ class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMi @torch.no_grad() def __call__( self, - style_image: Union[torch.FloatTensor, PIL.Image.Image], - content_image: Union[torch.FloatTensor, PIL.Image.Image], + style_image: Union[torch.Tensor, PIL.Image.Image], + content_image: Union[torch.Tensor, PIL.Image.Image], style_prompt: Optional[str] = None, content_prompt: Optional[str] = None, height: Optional[int] = 512, diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py index 4205718802..1350650113 100644 --- a/examples/community/clip_guided_stable_diffusion.py +++ b/examples/community/clip_guided_stable_diffusion.py @@ -180,7 +180,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin): num_cutouts: Optional[int] = 4, use_cutouts: Optional[bool] = True, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py index c8e0a9094f..9a77458be5 100644 --- a/examples/community/clip_guided_stable_diffusion_img2img.py +++ b/examples/community/clip_guided_stable_diffusion_img2img.py @@ -306,7 +306,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin): prompt: Union[str, List[str]], height: Optional[int] = 512, width: Optional[int] = 512, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -317,7 +317,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin): num_cutouts: Optional[int] = 4, use_cutouts: Optional[bool] = True, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): diff --git 
a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py
index 2ad37df5ab..46d12ba1f2 100644
--- a/examples/community/composable_stable_diffusion.py
+++ b/examples/community/composable_stable_diffusion.py
@@ -354,10 +354,10 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[torch.Generator] = None,
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: int = 1,
weights: Optional[str] = "",
):
@@ -391,7 +391,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
generator (`torch.Generator`, *optional*):
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a
latents tensor will be generated by sampling using the supplied random `generator`.
@@ -403,7 +403,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
plain tuple.
callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
diff --git a/examples/community/ddim_noise_comparative_analysis.py b/examples/community/ddim_noise_comparative_analysis.py
index 482c0a5826..829106c47f 100644
--- a/examples/community/ddim_noise_comparative_analysis.py
+++ b/examples/community/ddim_noise_comparative_analysis.py
@@ -103,7 +103,7 @@ class DDIMNoiseComparativeAnalysisPipeline(DiffusionPipeline):
@torch.no_grad()
def __call__(
self,
- image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ image: Union[torch.Tensor, PIL.Image.Image] = None,
strength: float = 0.8,
batch_size: int = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -115,7 +115,7 @@ class DDIMNoiseComparativeAnalysisPipeline(DiffusionPipeline):
) -> Union[ImagePipelineOutput, Tuple]:
r"""
Args:
- image (`torch.FloatTensor` or `PIL.Image.Image`):
+ image (`torch.Tensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
strength (`float`, *optional*, defaults to 0.8): diff --git a/examples/community/gluegen.py b/examples/community/gluegen.py index c656dce55a..1ad6911905 100644 --- a/examples/community/gluegen.py +++ b/examples/community/gluegen.py @@ -205,7 +205,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, language_adapter: TranslatorNoLN = None, - tensor_norm: torch.FloatTensor = None, + tensor_norm: torch.Tensor = None, requires_safety_checker: bool = True, ): super().__init__() @@ -231,7 +231,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo num_token: int, dim: int, dim_out: int, - tensor_norm: torch.FloatTensor, + tensor_norm: torch.Tensor, mult: int = 2, depth: int = 5, ): @@ -242,7 +242,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo ) self.language_adapter.load_state_dict(torch.load(model_path)) - def _adapt_language(self, prompt_embeds: torch.FloatTensor): + def _adapt_language(self, prompt_embeds: torch.Tensor): prompt_embeds = prompt_embeds / 3 prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2) return prompt_embeds @@ -254,8 +254,8 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -275,10 +275,10 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -535,7 +535,7 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -594,9 +594,9 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -635,14 +635,14 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. 
diff --git a/examples/community/hd_painter.py b/examples/community/hd_painter.py
index c157e16889..df41be9ef7 100644
--- a/examples/community/hd_painter.py
+++ b/examples/community/hd_painter.py
@@ -28,10 +28,10 @@ class RASGAttnProcessor:
def __call__(
self,
attn: Attention,
- hidden_states: torch.FloatTensor,
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
- attention_mask: Optional[torch.FloatTensor] = None,
- temb: Optional[torch.FloatTensor] = None,
+ hidden_states: torch.Tensor,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ temb: Optional[torch.Tensor] = None,
scale: float = 1.0,
) -> torch.Tensor:
# Same as the default AttnProcessor up until the part where similarity matrix gets saved
@@ -111,10 +111,10 @@ class PAIntAAttnProcessor:
def __call__(
self,
attn: Attention,
- hidden_states: torch.FloatTensor,
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
- attention_mask: Optional[torch.FloatTensor] = None,
- temb: Optional[torch.FloatTensor] = None,
+ hidden_states: torch.Tensor,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ temb: Optional[torch.Tensor] = None,
scale: float = 1.0,
) -> torch.Tensor:
# Automatically recognize the resolution of the current attention layer and resize the masks accordingly
@@ -454,7 +454,7 @@ class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
prompt: Union[str, List[str]] = None,
image: PipelineImageInput = None,
mask_image: PipelineImageInput = None,
- masked_image_latents: torch.FloatTensor = None,
+ masked_image_latents: torch.Tensor = None,
height: Optional[int] = None,
width: Optional[int] = None,
padding_mask_crop: Optional[int] = None,
@@ -467,9 +467,9 @@ class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.01,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
diff --git a/examples/community/iadb.py b/examples/community/iadb.py
index 6089e49fc6..81e9e8d89d 100644
--- a/examples/community/iadb.py
+++ b/examples/community/iadb.py
@@ -17,21 +17,21 @@ class IADBScheduler(SchedulerMixin, ConfigMixin):
def step(
self,
- model_output: torch.FloatTensor,
+ model_output: torch.Tensor,
timestep: int,
- x_alpha: torch.FloatTensor,
- ) -> torch.FloatTensor:
+ x_alpha: torch.Tensor,
+ ) -> torch.Tensor:
"""
Predict the sample at the previous timestep by reversing the ODE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
Args:
- model_output (`torch.FloatTensor`): direct output from learned diffusion model. It is the direction from x0 to x1.
+ model_output (`torch.Tensor`): direct output from learned diffusion model. It is the direction from x0 to x1.
timestep (`float`): current timestep in the diffusion chain.
- x_alpha (`torch.FloatTensor`): x_alpha sample for the current timestep
+ x_alpha (`torch.Tensor`): x_alpha sample for the current timestep
Returns:
- `torch.FloatTensor`: the sample at the previous timestep
+ `torch.Tensor`: the sample at the previous timestep
"""
if self.num_inference_steps is None:
@@ -53,10 +53,10 @@ class IADBScheduler(SchedulerMixin, ConfigMixin):
def add_noise(
self,
- original_samples: torch.FloatTensor,
- noise: torch.FloatTensor,
- alpha: torch.FloatTensor,
- ) -> torch.FloatTensor:
+ original_samples: torch.Tensor,
+ noise: torch.Tensor,
+ alpha: torch.Tensor,
+ ) -> torch.Tensor:
return original_samples * alpha + noise * (1 - alpha)
def __len__(self):
diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
index 719e8f12bf..cea55dd383 100644
--- a/examples/community/imagic_stable_diffusion.py
+++ b/examples/community/imagic_stable_diffusion.py
@@ -110,7 +110,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
def train(
self,
prompt: Union[str, List[str]],
- image: Union[torch.FloatTensor, PIL.Image.Image],
+ image: Union[torch.Tensor, PIL.Image.Image],
height: Optional[int] = 512,
width: Optional[int] = 512,
generator: Optional[torch.Generator] = None,
@@ -144,7 +144,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
generator (`torch.Generator`, *optional*):
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a
latents tensor will be generated by sampling using the supplied random `generator`.
diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py
index 71dc3cf712..4dfb7a3915 100644
--- a/examples/community/img2img_inpainting.py
+++ b/examples/community/img2img_inpainting.py
@@ -133,9 +133,9 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
- image: Union[torch.FloatTensor, PIL.Image.Image],
- inner_image: Union[torch.FloatTensor, PIL.Image.Image],
- mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+ image: Union[torch.Tensor, PIL.Image.Image],
+ inner_image: Union[torch.Tensor, PIL.Image.Image],
+ mask_image: Union[torch.Tensor, PIL.Image.Image],
height: int = 512,
width: int = 512,
num_inference_steps: int = 50,
@@ -144,10 +144,10 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[torch.Generator] = None,
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: int = 1,
**kwargs,
):
@@ -194,7 +194,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
generator (`torch.Generator`, *optional*):
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a
latents tensor will be generated by sampling using the supplied random `generator`.
@@ -206,7 +206,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
plain tuple.
callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
diff --git a/examples/community/instaflow_one_step.py b/examples/community/instaflow_one_step.py
index b0476d3afe..ab0393c8f7 100644
--- a/examples/community/instaflow_one_step.py
+++ b/examples/community/instaflow_one_step.py
@@ -189,8 +189,8 @@ class InstaFlowPipeline(
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
@@ -219,8 +219,8 @@ class InstaFlowPipeline(
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
):
r"""
@@ -239,10 +239,10 @@ class InstaFlowPipeline(
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
@@ -501,12 +501,12 @@ class InstaFlowPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -538,14 +538,14 @@ class InstaFlowPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -555,7 +555,7 @@ class InstaFlowPipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py
index 1b859c35f1..52b2707f33 100644
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -132,12 +132,12 @@ class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[torch.Generator] = None,
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: int = 1,
- text_embeddings: Optional[torch.FloatTensor] = None,
+ text_embeddings: Optional[torch.Tensor] = None,
**kwargs,
):
r"""
@@ -170,7 +170,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
generator (`torch.Generator`, *optional*):
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a
latents tensor will be generated by sampling using the supplied random `generator`.
@@ -182,11 +182,11 @@ class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
plain tuple.
callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
- text_embeddings (`torch.FloatTensor`, *optional*, defaults to `None`):
+ text_embeddings (`torch.Tensor`, *optional*, defaults to `None`):
Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of
`prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from
the supplied `prompt`.
diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py index bb5a2a4fe5..befb48c739 100644 --- a/examples/community/ip_adapter_face_id.py +++ b/examples/community/ip_adapter_face_id.py @@ -62,7 +62,7 @@ class IPAdapterFullImageProjection(nn.Module): self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu") self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): x = self.ff(image_embeds) x = x.reshape(-1, self.num_tokens, self.cross_attention_dim) return self.norm(x) @@ -452,8 +452,8 @@ class IPAdapterFaceIDStableDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -484,8 +484,8 @@ class IPAdapterFaceIDStableDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -505,10 +505,10 @@ class IPAdapterFaceIDStableDiffusionPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -788,7 +788,7 @@ class IPAdapterFaceIDStableDiffusionPipeline( data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -847,10 +847,10 @@ class IPAdapterFaceIDStableDiffusionPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -891,17 +891,17 @@ class IPAdapterFaceIDStableDiffusionPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. diff --git a/examples/community/latent_consistency_img2img.py b/examples/community/latent_consistency_img2img.py index 3c5ffa8456..97089e7d19 100644 --- a/examples/community/latent_consistency_img2img.py +++ b/examples/community/latent_consistency_img2img.py @@ -88,7 +88,7 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline): torch device num_images_per_prompt (`int`): number of images that should be generated per prompt - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
""" @@ -282,10 +282,10 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline): width: Optional[int] = 768, guidance_scale: float = 7.5, num_images_per_prompt: Optional[int] = 1, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, num_inference_steps: int = 4, lcm_origin_steps: int = 50, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -395,16 +395,16 @@ class LCMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -452,10 +452,10 @@ def rescale_zero_terminal_snr(betas): """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -587,17 +587,17 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin): self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample @@ -613,7 +613,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -685,25 +685,25 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timeindex: int, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: """ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -714,7 +714,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin): `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -777,10 +777,10 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -799,9 +799,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin): return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/latent_consistency_interpolate.py b/examples/community/latent_consistency_interpolate.py index 3d2413c991..8db70d3b95 100644 --- a/examples/community/latent_consistency_interpolate.py +++ b/examples/community/latent_consistency_interpolate.py @@ -281,8 +281,8 @@ class LatentConsistencyModelWalkPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -302,10 +302,10 @@ class LatentConsistencyModelWalkPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -506,7 +506,7 @@ class LatentConsistencyModelWalkPipeline( data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -546,7 +546,7 @@ class LatentConsistencyModelWalkPipeline( height: int, width: int, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -580,11 +580,11 @@ class LatentConsistencyModelWalkPipeline( @torch.no_grad() def interpolate_embedding( self, - start_embedding: torch.FloatTensor, - end_embedding: torch.FloatTensor, + start_embedding: torch.Tensor, + end_embedding: torch.Tensor, num_interpolation_steps: Union[int, List[int]], interpolation_type: str, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if interpolation_type == "lerp": interpolation_fn = lerp elif interpolation_type == "slerp": @@ -611,11 +611,11 @@ class LatentConsistencyModelWalkPipeline( @torch.no_grad() def interpolate_latent( self, - start_latent: torch.FloatTensor, - end_latent: torch.FloatTensor, + start_latent: torch.Tensor, + end_latent: torch.Tensor, num_interpolation_steps: Union[int, List[int]], interpolation_type: str, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if interpolation_type == "lerp": interpolation_fn = lerp elif interpolation_type == "slerp": @@ -663,8 +663,8 @@ class LatentConsistencyModelWalkPipeline( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -705,11 +705,11 @@ class LatentConsistencyModelWalkPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/examples/community/latent_consistency_txt2img.py b/examples/community/latent_consistency_txt2img.py index c31d6abae3..279b56d1e1 100755 --- a/examples/community/latent_consistency_txt2img.py +++ b/examples/community/latent_consistency_txt2img.py @@ -86,7 +86,7 @@ class LatentConsistencyModelPipeline(DiffusionPipeline): torch device num_images_per_prompt (`int`): number of images that should be generated per prompt - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. """ @@ -208,10 +208,10 @@ class LatentConsistencyModelPipeline(DiffusionPipeline): width: Optional[int] = 768, guidance_scale: float = 7.5, num_images_per_prompt: Optional[int] = 1, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, num_inference_steps: int = 4, lcm_origin_steps: int = 50, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -310,16 +310,16 @@ class LCMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -367,10 +367,10 @@ def rescale_zero_terminal_snr(betas): """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -499,17 +499,17 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample @@ -525,7 +525,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -593,25 +593,25 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timeindex: int, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: """ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -622,7 +622,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -685,10 +685,10 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -707,9 +707,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/llm_grounded_diffusion.py b/examples/community/llm_grounded_diffusion.py index c3b07ade7b..24b0a9d0e2 100644 --- a/examples/community/llm_grounded_diffusion.py +++ b/examples/community/llm_grounded_diffusion.py @@ -756,13 +756,13 @@ class LLMGroundedDiffusionPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -807,14 +807,14 @@ class LLMGroundedDiffusionPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. @@ -825,7 +825,7 @@ class LLMGroundedDiffusionPipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -1194,8 +1194,8 @@ class LLMGroundedDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -1227,8 +1227,8 @@ class LLMGroundedDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -1248,10 +1248,10 @@ class LLMGroundedDiffusionPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -1509,7 +1509,7 @@ class LLMGroundedDiffusionPipeline( data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 184ba30c4b..9f496330a0 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -378,7 +378,7 @@ def preprocess_image(image, batch_size): def preprocess_mask(mask, batch_size, scale_factor=8): - if not isinstance(mask, torch.FloatTensor): + if not isinstance(mask, torch.Tensor): mask = mask.convert("L") w, h = mask.size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 @@ -543,8 +543,8 @@ class StableDiffusionLongPromptWeightingPipeline( do_classifier_free_guidance, negative_prompt=None, max_embeddings_multiples=3, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -767,8 +767,8 @@ class StableDiffusionLongPromptWeightingPipeline( self, prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, height: int = 512, width: int = 512, num_inference_steps: int = 50, @@ -778,13 +778,13 @@ class StableDiffusionLongPromptWeightingPipeline( add_predicted_noise: Optional[bool] = False, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -798,10 +798,10 @@ class StableDiffusionLongPromptWeightingPipeline( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. 
If it's a tensor, it should @@ -836,14 +836,14 @@ class StableDiffusionLongPromptWeightingPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -857,7 +857,7 @@ class StableDiffusionLongPromptWeightingPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1032,13 +1032,13 @@ class StableDiffusionLongPromptWeightingPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1072,14 +1072,14 @@ class StableDiffusionLongPromptWeightingPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
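[Editor's note] One semantic nuance of the `preprocess_mask` change in this file's diff above is worth spelling out: `torch.FloatTensor` only matches float32 CPU tensors, while `torch.Tensor` matches every tensor regardless of dtype or device, so the `not isinstance(...)` branch now correctly skips PIL-style conversion for fp16 and CUDA inputs as well. A quick check:

    import torch

    x = torch.zeros(2, 2)                            # float32 on CPU
    print(isinstance(x, torch.FloatTensor))          # True
    print(isinstance(x.half(), torch.FloatTensor))   # False: it is a torch.HalfTensor
    print(isinstance(x.half(), torch.Tensor))        # True: any dtype, any device
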
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1093,7 +1093,7 @@ class StableDiffusionLongPromptWeightingPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1137,7 +1137,7 @@ class StableDiffusionLongPromptWeightingPipeline( def img2img( self, - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, strength: float = 0.8, @@ -1146,12 +1146,12 @@ class StableDiffusionLongPromptWeightingPipeline( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1159,7 +1159,7 @@ class StableDiffusionLongPromptWeightingPipeline( r""" Function for image-to-image generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. prompt (`str` or `List[str]`): @@ -1190,10 +1190,10 @@ class StableDiffusionLongPromptWeightingPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1207,7 +1207,7 @@ class StableDiffusionLongPromptWeightingPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1249,8 +1249,8 @@ class StableDiffusionLongPromptWeightingPipeline( def inpaint( self, - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, strength: float = 0.8, @@ -1260,12 +1260,12 @@ class StableDiffusionLongPromptWeightingPipeline( add_predicted_noise: Optional[bool] = False, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1273,10 +1273,10 @@ class StableDiffusionLongPromptWeightingPipeline( r""" Function for inpaint. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should @@ -1311,10 +1311,10 @@ class StableDiffusionLongPromptWeightingPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1328,7 +1328,7 @@ class StableDiffusionLongPromptWeightingPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index 64b7973e89..4ea98b1306 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -694,10 +694,10 @@ class SDXLLongPromptWeightingPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -722,17 +722,17 @@ class SDXLLongPromptWeightingPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -1320,7 +1320,7 @@ class SDXLLongPromptWeightingPipeline( data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1378,7 +1378,7 @@ class SDXLLongPromptWeightingPipeline( prompt_2: Optional[str] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, @@ -1392,12 +1392,12 @@ class SDXLLongPromptWeightingPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1481,23 +1481,23 @@ class SDXLLongPromptWeightingPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1926,12 +1926,12 @@ class SDXLLongPromptWeightingPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -2001,12 +2001,12 @@ class SDXLLongPromptWeightingPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -2066,7 +2066,7 @@ class SDXLLongPromptWeightingPipeline( prompt_2: Optional[str] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, @@ -2080,12 +2080,12 @@ class SDXLLongPromptWeightingPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, diff --git a/examples/community/masked_stable_diffusion_img2img.py b/examples/community/masked_stable_diffusion_img2img.py index 0b08086c7d..a210c167a2 100644 --- a/examples/community/masked_stable_diffusion_img2img.py +++ b/examples/community/masked_stable_diffusion_img2img.py 
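[Editor's note] The `get_guidance_scale_embedding` hunks above only show the docstring tail (`w = w * 1000.0` and the returned `(len(timesteps), embedding_dim)` shape). For context, a sketch of the sinusoidal embedding these pipelines build, following the LCM-style helper in diffusers (the base of 10000 is the standard transformer choice):

    import torch

    def get_guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype=torch.float32) -> torch.Tensor:
        assert len(w.shape) == 1
        w = w * 1000.0
        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)  # (len(w), embedding_dim) for even dims
        if embedding_dim % 2 == 1:
            emb = torch.nn.functional.pad(emb, (0, 1))
        return emb
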
@@ -16,10 +16,10 @@ class MaskedStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline): self, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -30,18 +30,18 @@ class MaskedStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline): num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, mask: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -52,7 +52,7 @@ class MaskedStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline): Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): @@ -78,10 +78,10 @@ class MaskedStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -91,14 +91,14 @@ class MaskedStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - mask (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*): + mask (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*): A mask with non-zero elements for the area to be inpainted. If not specified, no mask is applied. Examples: diff --git a/examples/community/mixture_canvas.py b/examples/community/mixture_canvas.py index 2083c7acad..7196ee9587 100644 --- a/examples/community/mixture_canvas.py +++ b/examples/community/mixture_canvas.py @@ -154,7 +154,7 @@ class Text2ImageRegion(DiffusionRegion): class Image2ImageRegion(DiffusionRegion): """Class defining a region where an image guided diffusion process is acting""" - reference_image: torch.FloatTensor = None + reference_image: torch.Tensor = None strength: float = 0.8 # Strength of the image def __post_init__(self): diff --git a/examples/community/multilingual_stable_diffusion.py b/examples/community/multilingual_stable_diffusion.py index f3b0540cf4..dc335e0b58 100644 --- a/examples/community/multilingual_stable_diffusion.py +++ b/examples/community/multilingual_stable_diffusion.py @@ -147,10 +147,10 @@ class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -184,7 +184,7 @@ class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -196,7 +196,7 @@ class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
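[Editor's note] Since the `callback(step: int, timestep: int, latents: torch.Tensor)` signature is restated in nearly every pipeline touched by this patch, here is a minimal usage sketch; the `pipe` variable in the commented call is hypothetical:

    import torch

    def log_progress(step: int, timestep: int, latents: torch.Tensor) -> None:
        # Matches the documented legacy callback signature; `latents` is the
        # in-progress noisy latent batch, not decoded images.
        print(f"step={step} t={timestep} latents={tuple(latents.shape)} dtype={latents.dtype}")

    # images = pipe(prompt="a photo", callback=log_progress, callback_steps=5).images  # hypothetical call
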
diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py index 4b1a641427..ac0aa38254 100644 --- a/examples/community/pipeline_animatediff_controlnet.py +++ b/examples/community/pipeline_animatediff_controlnet.py @@ -198,8 +198,8 @@ class AnimateDiffControlNetPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -219,10 +219,10 @@ class AnimateDiffControlNetPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -752,9 +752,9 @@ class AnimateDiffControlNetPipeline( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, ip_adapter_image_embeds: Optional[PipelineImageInput] = None, conditioning_frames: Optional[List[PipelineImageInput]] = None, @@ -798,20 +798,20 @@ class AnimateDiffControlNetPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. @@ -821,7 +821,7 @@ class AnimateDiffControlNetPipeline( are specified, images must be passed as a list such that each element of the list can be correctly batched for input to a single ControlNet. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py index d920912226..7546fbd9bc 100644 --- a/examples/community/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -315,8 +315,8 @@ class AnimateDiffImgToVideoPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -336,10 +336,10 @@ class AnimateDiffImgToVideoPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
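[Editor's note] The `ip_adapter_image_embeds` documentation above pins down the expected layout; a hedged construction example follows. Concatenating the negative embedding ahead of the positive one along the batch dimension matches how diffusers commonly stacks classifier-free-guidance inputs, but verify against the specific pipeline you target:

    import torch

    num_adapters, batch_size, num_images, emb_dim = 1, 1, 1, 1024  # illustrative sizes
    positive = torch.randn(batch_size, num_images, emb_dim)
    negative = torch.zeros_like(positive)  # e.g. an "empty image" embedding

    # One tensor per IP-Adapter; with do_classifier_free_guidance the negative
    # embedding is included as well.
    ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0) for _ in range(num_adapters)]
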
@@ -746,14 +746,14 @@ class AnimateDiffImgToVideoPipeline( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, ip_adapter_image_embeds: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -791,33 +791,33 @@ class AnimateDiffImgToVideoPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AnimateDiffImgToVideoPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py index f46d635dae..b4d47f1856 100644 --- a/examples/community/pipeline_demofusion_sdxl.py +++ b/examples/community/pipeline_demofusion_sdxl.py @@ -187,10 +187,10 @@ class DemoFusionSDXLPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -215,17 +215,17 @@ class DemoFusionSDXLPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -642,14 +642,14 @@ class DemoFusionSDXLPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = False, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -720,21 +720,21 @@ class DemoFusionSDXLPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -746,7 +746,7 @@ class DemoFusionSDXLPipeline( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
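A hedged sketch of the (deprecated) callback contract re-hinted above: under the new annotation the `latents` argument may arrive as fp16 on GPU, so a callback should not assume fp32. The logging body and the commented pipeline call are illustrative assumptions, not part of this patch:

    import torch

    def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
        # `.float()` upcasts fp16/bf16 latents before the reduction, so this
        # works for whatever dtype/device the pipeline happens to run in.
        print(f"step={step} t={timestep} latent_std={latents.float().std().item():.4f}")

    # images = pipe(prompt="...", callback=log_latents, callback_steps=5).images  # assumed usage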
diff --git a/examples/community/pipeline_fabric.py b/examples/community/pipeline_fabric.py index 46692a9684..f17c8e52f5 100644 --- a/examples/community/pipeline_fabric.py +++ b/examples/community/pipeline_fabric.py @@ -190,8 +190,8 @@ class FabricPipeline(DiffusionPipeline): num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -210,10 +210,10 @@ class FabricPipeline(DiffusionPipeline): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -512,7 +512,7 @@ class FabricPipeline(DiffusionPipeline): neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ): r""" The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The diff --git a/examples/community/pipeline_prompt2prompt.py b/examples/community/pipeline_prompt2prompt.py index db9f215b6b..8e9bcddfef 100644 --- a/examples/community/pipeline_prompt2prompt.py +++ b/examples/community/pipeline_prompt2prompt.py @@ -217,8 +217,8 @@ class Prompt2PromptPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -250,8 +250,8 @@ class Prompt2PromptPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -271,10 +271,10 @@ class Prompt2PromptPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -564,12 +564,12 @@ class Prompt2PromptPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -604,7 +604,7 @@ class Prompt2PromptPipeline( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -616,7 +616,7 @@ class Prompt2PromptPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_sdxl_style_aligned.py b/examples/community/pipeline_sdxl_style_aligned.py index 88edeeb7ee..5ad85dc90a 100644 --- a/examples/community/pipeline_sdxl_style_aligned.py +++ b/examples/community/pipeline_sdxl_style_aligned.py @@ -514,10 +514,10 @@ class StyleAlignedSDXLPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -543,17 +543,17 @@ class StyleAlignedSDXLPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1325,7 +1325,7 @@ class StyleAlignedSDXLPipeline( data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1387,7 +1387,7 @@ class StyleAlignedSDXLPipeline( prompt_2: Optional[Union[str, List[str]]] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, strength: float = 0.3, height: Optional[int] = None, width: Optional[int] = None, @@ -1401,11 +1401,11 @@ class StyleAlignedSDXLPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -1474,21 +1474,21 @@ class StyleAlignedSDXLPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/examples/community/pipeline_stable_diffusion_pag.py b/examples/community/pipeline_stable_diffusion_pag.py index cdb7bd99cb..5c588adc4f 100644 --- a/examples/community/pipeline_stable_diffusion_pag.py +++ b/examples/community/pipeline_stable_diffusion_pag.py @@ -57,13 +57,13 @@ class PAGIdentitySelfAttnProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -171,13 +171,13 @@ class PAGCFGIdentitySelfAttnProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -493,8 +493,8 @@ class StableDiffusionPAGPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -525,8 +525,8 @@ class StableDiffusionPAGPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -545,10 +545,10 @@ class StableDiffusionPAGPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -966,7 +966,7 @@ class StableDiffusionPAGPipeline( dtype: data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1078,11 +1078,11 @@ class StableDiffusionPAGPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1122,18 +1122,18 @@ class StableDiffusionPAGPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py index 61c4fbc13e..0622db005d 100644 --- a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py +++ b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py @@ -164,8 +164,8 @@ class StableDiffusionUpscaleLDM3DPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -197,8 +197,8 @@ class StableDiffusionUpscaleLDM3DPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -218,10 +218,10 @@ class StableDiffusionUpscaleLDM3DPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
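The `get_guidance_scale_embedding` helpers whose return hints change in these hunks compute the sinusoidal guidance-scale embedding from the VDM reference linked in the docstrings. Only the two leading assert lines appear in the diff; the rest of the body below is reproduced from the equivalent diffusers helper, so treat it as a reference sketch rather than part of this patch:

    import torch

    def get_guidance_scale_embedding(
        w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
    ) -> torch.Tensor:
        assert len(w.shape) == 1
        w = w * 1000.0
        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]        # (len(w), half_dim)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:                       # zero-pad odd dimensions
            emb = torch.nn.functional.pad(emb, (0, 1))
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

    # emb = get_guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=256)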
@@ -535,12 +535,12 @@ class StableDiffusionUpscaleLDM3DPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, target_res: Optional[List[int]] = [1024, 1024], @@ -551,7 +551,7 @@ class StableDiffusionUpscaleLDM3DPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -570,14 +570,14 @@ class StableDiffusionUpscaleLDM3DPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -587,7 +587,7 @@ class StableDiffusionUpscaleLDM3DPipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
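Because `latents` is now hinted as `torch.Tensor`, pre-generated latents can be handed over in the pipeline's working dtype directly. A sketch under the assumptions of an SD-style 4-channel latent space and a CUDA device (shape, device, and the commented call are illustrative):

    import torch

    generator = torch.Generator(device="cuda").manual_seed(0)
    latents = torch.randn(
        (1, 4, 64, 64),       # (batch, latent channels, height // 8, width // 8)
        generator=generator,
        device="cuda",
        dtype=torch.float16,  # an fp16 latent was never a `torch.FloatTensor`
    )
    # image = pipe(prompt="...", latents=latents).images[0]  # assumed pipeline call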
diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py index 82c522b448..6cb4928f76 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py @@ -248,10 +248,10 @@ class StableDiffusionXLControlNetAdapterPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -277,17 +277,17 @@ class StableDiffusionXLControlNetAdapterPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
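The `encode_prompt` hunks above all follow the same SDXL pattern: four embedding tensors that can be computed once and fed back through the identically re-hinted `__call__` parameters. A hedged usage sketch against the stock SDXL pipeline (the model id is the usual base checkpoint; keyword names can differ slightly in the community variants patched here):

    import torch
    from diffusers import StableDiffusionXLPipeline

    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
    ).to("cuda")

    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipe.encode_prompt(
        prompt="a photo of an astronaut riding a horse",
        negative_prompt="low quality",
        do_classifier_free_guidance=True,
    )
    # All four are plain `torch.Tensor` values (fp16 here), which is exactly
    # what the widened hints now promise:
    # image = pipe(prompt_embeds=prompt_embeds, ...).images[0]  # assumed reuse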
@@ -887,14 +887,14 @@ class StableDiffusionXLControlNetAdapterPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -922,14 +922,14 @@ class StableDiffusionXLControlNetAdapterPipeline( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - adapter_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + adapter_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. `PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -973,21 +973,21 @@ class StableDiffusionXLControlNetAdapterPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation.
Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -999,7 +999,7 @@ class StableDiffusionXLControlNetAdapterPipeline( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py index a85f1c3da6..c5495f756c 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py @@ -396,10 +396,10 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -425,17 +425,17 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1229,14 +1229,14 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[Union[torch.FloatTensor]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -1270,14 +1270,14 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - adapter_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + adapter_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. `PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image.
- control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -1330,21 +1330,21 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1356,7 +1356,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. 
If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py index 49fed61254..ae9708a6fa 100644 --- a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py +++ b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py @@ -280,10 +280,10 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -309,17 +309,17 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -868,7 +868,7 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -881,7 +881,7 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -942,10 +942,10 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -960,13 +960,13 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -982,12 +982,12 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( clip_skip: Optional[int] = None, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], - map: torch.FloatTensor = None, + map: torch.Tensor = None, original_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -1003,7 +1003,7 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. strength (`float`, *optional*, defaults to 0.3): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -1051,26 +1051,26 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. @@ -1083,7 +1083,7 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
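A sketch of the `ip_adapter_image_embeds` contract restated in this docstring: one tensor per IP-Adapter, with the negative image embedding stacked ahead of the positive one when classifier-free guidance is enabled. All shapes here are illustrative assumptions (`emb_dim` depends on the image encoder in use):

    import torch

    batch_size, num_images, emb_dim = 1, 1, 1024
    positive = torch.randn(batch_size, num_images, emb_dim)
    negative = torch.zeros_like(positive)  # required when do_classifier_free_guidance=True
    # One list entry per IP-Adapter; under CFG the negative half is concatenated
    # before the positive half along the batch dimension.
    ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0)]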
diff --git a/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py b/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py index f1b8dfa96f..fb46ff3f38 100644 --- a/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py +++ b/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py @@ -561,12 +561,12 @@ class StableDiffusionXLInstantIDImg2ImgPipeline(StableDiffusionXLControlNetImg2I num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -596,10 +596,10 @@ class StableDiffusionXLInstantIDImg2ImgPipeline(StableDiffusionXLControlNetImg2I prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -632,24 +632,24 @@ class StableDiffusionXLInstantIDImg2ImgPipeline(StableDiffusionXLControlNetImg2I generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. diff --git a/examples/community/pipeline_stable_diffusion_xl_instantid.py b/examples/community/pipeline_stable_diffusion_xl_instantid.py index 147ba46a07..6e77261f51 100644 --- a/examples/community/pipeline_stable_diffusion_xl_instantid.py +++ b/examples/community/pipeline_stable_diffusion_xl_instantid.py @@ -559,12 +559,12 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -592,10 +592,10 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline): prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. 
`PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -628,24 +628,24 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
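Worth spelling out why the widened hints above are a correctness fix rather than a cosmetic one: `torch.FloatTensor` denotes the legacy float32 CPU tensor type, so the old annotations excluded the fp16 (and CUDA) tensors these pipelines routinely produce and accept. A minimal sketch of the distinction, using plain PyTorch only:

```python
import torch

latents_fp32 = torch.randn(1, 4, 64, 64)
latents_fp16 = latents_fp32.half()

# Any tensor satisfies the new `torch.Tensor` hint...
assert isinstance(latents_fp32, torch.Tensor)
assert isinstance(latents_fp16, torch.Tensor)

# ...but only a float32 CPU tensor matches the legacy type the old hints named.
assert isinstance(latents_fp32, torch.FloatTensor)
assert not isinstance(latents_fp16, torch.FloatTensor)
```

Under the old hints, a type checker would flag `pipe(..., latents=latents_fp16)` even though the InstantID pipelines are typically run in half precision.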
diff --git a/examples/community/pipeline_stable_diffusion_xl_ipex.py b/examples/community/pipeline_stable_diffusion_xl_ipex.py index a44ccf89ea..791b077a84 100644 --- a/examples/community/pipeline_stable_diffusion_xl_ipex.py +++ b/examples/community/pipeline_stable_diffusion_xl_ipex.py @@ -276,10 +276,10 @@ class StableDiffusionXLPipelineIpex( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -305,17 +305,17 @@ class StableDiffusionXLPipelineIpex( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -687,7 +687,7 @@ class StableDiffusionXLPipelineIpex( data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -750,11 +750,11 @@ class StableDiffusionXLPipelineIpex( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -826,21 +826,21 @@ class StableDiffusionXLPipelineIpex( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
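The hunk above only retypes the documented return value of the guidance-embedding helper; for readers unfamiliar with it, the function maps a 1-D batch of guidance scales `w` to a sinusoidal embedding of shape `(len(w), embedding_dim)`. A self-contained sketch reconstructed from the two body lines visible in the hunk (`assert len(w.shape) == 1`, `w = w * 1000.0`) plus the usual sinusoidal recipe — illustrative, not the patched source:

```python
import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512) -> torch.Tensor:
    # w: 1-D tensor of guidance scales, one per sample in the batch.
    assert len(w.shape) == 1
    w = w * 1000.0  # scale up so nearby guidance values get distinct embeddings
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim) * -emb)
    emb = w[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    return emb  # shape: (len(w), embedding_dim)

print(guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=8).shape)  # torch.Size([1, 8])
```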
@@ -1190,11 +1190,11 @@ class StableDiffusionXLPipelineIpex( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, diff --git a/examples/community/pipeline_zero1to3.py b/examples/community/pipeline_zero1to3.py index 5e02ba2866..af1c82dfad 100644 --- a/examples/community/pipeline_zero1to3.py +++ b/examples/community/pipeline_zero1to3.py @@ -192,8 +192,8 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -211,10 +211,10 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -481,7 +481,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -595,8 +595,8 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - input_imgs: Union[torch.FloatTensor, PIL.Image.Image] = None, - prompt_imgs: Union[torch.FloatTensor, PIL.Image.Image] = None, + input_imgs: Union[torch.Tensor, PIL.Image.Image] = None, + prompt_imgs: Union[torch.Tensor, PIL.Image.Image] = None, poses: Union[List[float], List[List[float]]] = None, torch_dtype=torch.float32, height: Optional[int] = None, @@ -607,12 +607,12 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: float = 1.0, @@ -650,14 +650,14 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -669,7 +669,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/regional_prompting_stable_diffusion.py b/examples/community/regional_prompting_stable_diffusion.py index 71f24a81bd..19715a4fb6 100644 --- a/examples/community/regional_prompting_stable_diffusion.py +++ b/examples/community/regional_prompting_stable_diffusion.py @@ -104,7 +104,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, rp_args: Dict[str, str] = None, @@ -168,7 +168,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline): orig_hw = (height, width) revers = True - def pcallback(s_self, step: int, timestep: int, latents: torch.FloatTensor, selfs=None): + def pcallback(s_self, step: int, timestep: int, latents: torch.Tensor, selfs=None): if "PRO" in mode: # in Prompt mode, make masks from sum of attension maps self.step = step @@ -198,10 +198,10 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline): def hook_forward(module): # diffusers==0.23.2 def forward( - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, ) -> torch.Tensor: attn = module diff --git a/examples/community/rerender_a_video.py b/examples/community/rerender_a_video.py index ad414390c9..6e25b92603 100644 --- a/examples/community/rerender_a_video.py +++ b/examples/community/rerender_a_video.py @@ -142,12 +142,12 @@ class TextToVideoSDPipelineOutput(BaseOutput): Output class for text-to-video pipelines. Args: - frames (`List[np.ndarray]` or `torch.FloatTensor`) + frames (`List[np.ndarray]` or `torch.Tensor`) List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as a `torch` tensor. The length of the list denotes the video length (the number of frames). 
""" - frames: Union[List[np.ndarray], torch.FloatTensor] + frames: Union[List[np.ndarray], torch.Tensor] @torch.no_grad() @@ -589,20 +589,20 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline): def __call__( self, prompt: Union[str, List[str]] = None, - frames: Union[List[np.ndarray], torch.FloatTensor] = None, - control_frames: Union[List[np.ndarray], torch.FloatTensor] = None, + frames: Union[List[np.ndarray], torch.Tensor] = None, + control_frames: Union[List[np.ndarray], torch.Tensor] = None, strength: float = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -624,8 +624,8 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - frames (`List[np.ndarray]` or `torch.FloatTensor`): The input images to be used as the starting point for the image generation process. - control_frames (`List[np.ndarray]` or `torch.FloatTensor`): The ControlNet input images condition to provide guidance to the `unet` for generation. + frames (`List[np.ndarray]` or `torch.Tensor`): The input images to be used as the starting point for the image generation process. + control_frames (`List[np.ndarray]` or `torch.Tensor`): The ControlNet input images condition to provide guidance to the `unet` for generation. strength ('float'): SDEdit strength. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -646,14 +646,14 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -665,7 +665,7 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/run_onnx_controlnet.py b/examples/community/run_onnx_controlnet.py index ed9b233184..af2672c17e 100644 --- a/examples/community/run_onnx_controlnet.py +++ b/examples/community/run_onnx_controlnet.py @@ -507,18 +507,18 @@ class OnnxStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): fp16: bool = True, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, control_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -531,12 +531,12 @@ class OnnxStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -551,14 +551,14 @@ class OnnxStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. 
- control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly @@ -588,14 +588,14 @@ class OnnxStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -607,7 +607,7 @@ class OnnxStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
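The recurring `callback(step: int, timestep: int, latents: torch.Tensor)` change is the user-facing side of the same fix: the latents handed to a step callback carry whatever dtype the pipeline runs in (this ONNX ControlNet example defaults to fp16), which `torch.FloatTensor` mis-described. A sketch of a callback written against the corrected signature; the pipeline invocation is a hypothetical usage, not part of the patch:

```python
import torch

def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Valid for fp16 and fp32 alike now that the hint is `torch.Tensor`.
    print(f"step={step:3d}  t={int(timestep):4d}  dtype={latents.dtype}  "
          f"latent mean={latents.float().mean().item():+.4f}")

# Hypothetical call against one of the pipelines above:
# pipe(prompt="a photo of a cat", image=init_image, control_image=canny_image,
#      callback=log_latents, callback_steps=5)
```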
diff --git a/examples/community/run_tensorrt_controlnet.py b/examples/community/run_tensorrt_controlnet.py index aece5484e3..873195fa31 100644 --- a/examples/community/run_tensorrt_controlnet.py +++ b/examples/community/run_tensorrt_controlnet.py @@ -611,18 +611,18 @@ class TensorRTStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): fp16: bool = True, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, control_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -635,12 +635,12 @@ class TensorRTStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -655,14 +655,14 @@ class TensorRTStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. 
If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly @@ -692,14 +692,14 @@ class TensorRTStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -711,7 +711,7 @@ class TensorRTStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/scheduling_ufogen.py b/examples/community/scheduling_ufogen.py index 3c03b3e8df..5213f13ffb 100644 --- a/examples/community/scheduling_ufogen.py +++ b/examples/community/scheduling_ufogen.py @@ -34,16 +34,16 @@ class UFOGenSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -240,19 +240,19 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin): self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -340,7 +340,7 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin): self.timesteps = torch.from_numpy(timesteps).to(device) # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -375,9 +375,9 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[UFOGenSchedulerOutput, Tuple]: @@ -386,11 +386,11 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. 
@@ -461,10 +461,10 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -483,9 +483,7 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin): return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/sd_text2img_k_diffusion.py b/examples/community/sd_text2img_k_diffusion.py index 3299a76052..9f83973aba 100755 --- a/examples/community/sd_text2img_k_diffusion.py +++ b/examples/community/sd_text2img_k_diffusion.py @@ -286,10 +286,10 @@ class StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -323,7 +323,7 @@ class StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -335,7 +335,7 @@ class StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
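In the `scheduling_ufogen.py` hunks above, only the signatures change; the arithmetic behind `add_noise` and `get_velocity` is the standard DDPM forward process, `x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps`, and the v-prediction target `v_t = sqrt(alpha_bar_t) * eps - sqrt(1 - alpha_bar_t) * x_0`. A toy numeric sketch (the beta schedule is made up, not UFOGen's) with plain `torch.Tensor` values end to end:

```python
import torch

betas = torch.linspace(1e-4, 0.02, 1000)        # toy linear schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

x0 = torch.randn(2, 4, 8, 8)                    # clean latents
noise = torch.randn_like(x0)
t = torch.tensor([10, 500])                     # one timestep per batch element

sqrt_ab = alphas_cumprod[t].sqrt().view(-1, 1, 1, 1)
sqrt_one_minus_ab = (1.0 - alphas_cumprod[t]).sqrt().view(-1, 1, 1, 1)

noisy = sqrt_ab * x0 + sqrt_one_minus_ab * noise       # what add_noise computes
velocity = sqrt_ab * noise - sqrt_one_minus_ab * x0    # what get_velocity computes
```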
diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py index e9eba994b5..ae2d8a53b2 100644 --- a/examples/community/seed_resize_stable_diffusion.py +++ b/examples/community/seed_resize_stable_diffusion.py @@ -81,12 +81,12 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin) num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, - text_embeddings: Optional[torch.FloatTensor] = None, + text_embeddings: Optional[torch.Tensor] = None, **kwargs, ): r""" @@ -119,7 +119,7 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin) generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -131,7 +131,7 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin) plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
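One practical reading of the `latents (torch.Tensor, *optional*)` entries repeated through these docstrings: the initial noise can be pinned explicitly rather than regenerated from a seed inside the pipeline. A sketch, where the `pipe` call and the SD 1.x latent shape `(1, 4, 64, 64)` are illustrative assumptions:

```python
import torch

generator = torch.Generator(device="cpu").manual_seed(0)
# Assumed SD 1.x shape for a 512x512 image: (batch, 4, height/8, width/8).
latents = torch.randn(1, 4, 64, 64, generator=generator)

# Reusing the same tensor reproduces the generation while varying the prompt:
# image_a = pipe("a castle", latents=latents.clone()).images[0]
# image_b = pipe("a castle at night", latents=latents.clone()).images[0]
```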
diff --git a/examples/community/speech_to_image_diffusion.py b/examples/community/speech_to_image_diffusion.py index 3537ef89e1..9cb5a2a8c7 100644 --- a/examples/community/speech_to_image_diffusion.py +++ b/examples/community/speech_to_image_diffusion.py @@ -76,10 +76,10 @@ class SpeechToImagePipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): diff --git a/examples/community/stable_diffusion_comparison.py b/examples/community/stable_diffusion_comparison.py index dab5705b33..2b510a64f8 100644 --- a/examples/community/stable_diffusion_comparison.py +++ b/examples/community/stable_diffusion_comparison.py @@ -96,10 +96,10 @@ class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin) num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -133,10 +133,10 @@ class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin) num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -170,10 +170,10 @@ class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin) num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -207,10 +207,10 @@ class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin) num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -244,10 +244,10 @@ class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin) num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = 
"pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -276,7 +276,7 @@ class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin) generator (`torch.Generator`, optional): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, optional): + latents (`torch.Tensor`, optional): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py index 74674e65f0..c7c88d6fdc 100644 --- a/examples/community/stable_diffusion_controlnet_img2img.py +++ b/examples/community/stable_diffusion_controlnet_img2img.py @@ -189,8 +189,8 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusio num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -207,10 +207,10 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusio negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -600,7 +600,7 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusio prompt: Union[str, List[str]] = None, image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, strength: float = 0.8, height: Optional[int] = None, @@ -611,12 +611,12 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusio num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -633,9 +633,9 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusio image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will be masked out with `mask_image` and repainted according to `prompt`. - controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. strength (`float`, *optional*): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -667,14 +667,14 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusio generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument.
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -686,7 +686,7 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusio plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_inpaint.py b/examples/community/stable_diffusion_controlnet_inpaint.py index a13497dddc..b473ffe799 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint.py +++ b/examples/community/stable_diffusion_controlnet_inpaint.py @@ -288,8 +288,8 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -306,10 +306,10 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -744,7 +744,7 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio image: Union[torch.Tensor, PIL.Image.Image] = None, mask_image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -754,12 +754,12 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -779,9 +779,9 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -807,14 +807,14 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -826,7 +826,7 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py index 14c4e4aa6d..8928f34239 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py +++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py @@ -273,8 +273,8 @@ class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableD num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -291,10 +291,10 @@ class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableD negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -731,7 +731,7 @@ class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableD image: Union[torch.Tensor, PIL.Image.Image] = None, mask_image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, strength: float = 0.8, height: Optional[int] = None, @@ -742,12 +742,12 @@ class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableD num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: float = 1.0, @@ -767,9 +767,9 @@ class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableD repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. strength (`float`, *optional*): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -801,14 +801,14 @@ class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableD generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -820,7 +820,7 @@ class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableD plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index 16f7f589b7..e4d8e12f85 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -100,14 +100,14 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli self, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -116,12 +116,12 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -139,17 +139,17 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. 
If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly batched for input to a single controlnet. - ref_image (`torch.FloatTensor`, `PIL.Image.Image`): + ref_image (`torch.Tensor`, `PIL.Image.Image`): The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can also be accepted as an image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -176,14 +176,14 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -195,7 +195,7 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
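The `callback` retyping in the hunks above is user-facing, so a conforming callback is worth spelling out. A sketch with a hypothetical `log_latents` function and an already-constructed `pipe` (both names are illustrative, not part of the patch):

```python
import torch

def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # In fp16/CUDA pipelines `latents` arrives as a half-precision CUDA
    # tensor: a torch.Tensor, but never a torch.FloatTensor.
    print(f"step {step:3d} | t={timestep} | latent std {latents.float().std().item():.4f}")

# image = pipe(prompt, callback=log_latents, callback_steps=5).images[0]
```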
@@ -374,10 +374,10 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -492,12 +492,12 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -588,14 +588,14 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 # TODO(Patrick, William) - attention mask is not used diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py index dd648fd8c7..92588ba8a2 100644 --- a/examples/community/stable_diffusion_ipex.py +++ b/examples/community/stable_diffusion_ipex.py @@ -311,8 +311,8 @@ class StableDiffusionIPEXPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -330,10 +330,10 @@ class StableDiffusionIPEXPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -567,12 +567,12 @@ class StableDiffusionIPEXPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): @@ -608,14 +608,14 @@ class StableDiffusionIPEXPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -627,7 +627,7 @@ class StableDiffusionIPEXPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
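Because the `latents` hunks recur in nearly every pipeline here, one usage sketch for the retyped parameter; the 4 latent channels and the 8x VAE downsampling factor are assumptions that hold for Stable Diffusion v1-style UNets, and `pipe` is again a hypothetical, already-constructed pipeline:

```python
import torch

generator = torch.Generator("cpu").manual_seed(42)
height = width = 512

# Pre-generated noise, reusable across prompts for comparable generations.
latents = torch.randn((1, 4, height // 8, width // 8), generator=generator)
# Cast to fp16: still a valid `latents` argument under the new torch.Tensor
# hint, but it was never a torch.FloatTensor.
latents = latents.to(dtype=torch.float16)

# image = pipe("a snowy cabin", latents=latents, height=height, width=width).images[0]
```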
diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py index e53afb703e..95b4b03e4d 100644 --- a/examples/community/stable_diffusion_mega.py +++ b/examples/community/stable_diffusion_mega.py @@ -99,8 +99,8 @@ class StableDiffusionMegaPipeline(DiffusionPipeline, StableDiffusionMixin): def inpaint( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -110,7 +110,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline, StableDiffusionMixin): generator: Optional[torch.Generator] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline @@ -134,7 +134,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline, StableDiffusionMixin): def img2img( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -144,7 +144,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline, StableDiffusionMixin): generator: Optional[torch.Generator] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -177,10 +177,10 @@ class StableDiffusionMegaPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): # For more information on how this function https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index c2dd184c2f..2352b92c30 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -268,10 +268,10 @@ class StableDiffusionReferencePipeline( width: int, callback_steps: Optional[int], negative_prompt: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[torch.Tensor] = None, - ip_adapter_image_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image_embeds: Optional[torch.Tensor] = None, callback_on_step_end_tensor_inputs: Optional[List[str]] = None, ) -> None: """ @@ -283,10 +283,10 @@ 
class StableDiffusionReferencePipeline( width (int): The width of the input image. callback_steps (Optional[int]): The number of steps to perform the callback on. negative_prompt (Optional[str]): The negative prompt text. - prompt_embeds (Optional[torch.FloatTensor]): The prompt embeddings. - negative_prompt_embeds (Optional[torch.FloatTensor]): The negative prompt embeddings. + prompt_embeds (Optional[torch.Tensor]): The prompt embeddings. + negative_prompt_embeds (Optional[torch.Tensor]): The negative prompt embeddings. ip_adapter_image (Optional[torch.Tensor]): The input adapter image. - ip_adapter_image_embeds (Optional[torch.FloatTensor]): The input adapter image embeddings. + ip_adapter_image_embeds (Optional[torch.Tensor]): The input adapter image embeddings. callback_on_step_end_tensor_inputs (Optional[List[str]]): The list of tensor inputs to perform the callback on. Raises: @@ -357,11 +357,11 @@ class StableDiffusionReferencePipeline( num_images_per_prompt: int, do_classifier_free_guidance: bool, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Encodes the prompt into embeddings. @@ -371,13 +371,13 @@ class StableDiffusionReferencePipeline( num_images_per_prompt (int): The number of images per prompt. do_classifier_free_guidance (bool): Whether to use classifier-free guidance. negative_prompt (Optional[Union[str, List[str]]], optional): The negative prompt text or a list of negative prompt texts. Defaults to None. - prompt_embeds (Optional[torch.FloatTensor], optional): The prompt embeddings. Defaults to None. - negative_prompt_embeds (Optional[torch.FloatTensor], optional): The negative prompt embeddings. Defaults to None. + prompt_embeds (Optional[torch.Tensor], optional): The prompt embeddings. Defaults to None. + negative_prompt_embeds (Optional[torch.Tensor], optional): The negative prompt embeddings. Defaults to None. lora_scale (Optional[float], optional): The LoRA scale. Defaults to None. **kwargs: Additional keyword arguments. Returns: - torch.FloatTensor: The encoded prompt embeddings. + torch.Tensor: The encoded prompt embeddings. """ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) @@ -407,11 +407,11 @@ class StableDiffusionReferencePipeline( num_images_per_prompt: int, do_classifier_free_guidance: bool, negative_prompt: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Encodes the prompt into text encoder hidden states. @@ -428,10 +428,10 @@ class StableDiffusionReferencePipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -813,7 +813,7 @@ class StableDiffusionReferencePipeline( def __call__( self, prompt: Union[str, List[str]] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -822,12 +822,12 @@ class StableDiffusionReferencePipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -844,9 +844,9 @@ class StableDiffusionReferencePipeline( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - ref_image (`torch.FloatTensor`, `PIL.Image.Image`): + ref_image (`torch.Tensor`, `PIL.Image.Image`): The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can also be accepted as an image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -873,14 +873,14 @@ class StableDiffusionReferencePipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -892,7 +892,7 @@ class StableDiffusionReferencePipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -1017,10 +1017,10 @@ class StableDiffusionReferencePipeline( def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -1135,12 +1135,12 @@ class StableDiffusionReferencePipeline( def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -1191,10 +1191,10 @@ class StableDiffusionReferencePipeline( def hacked_DownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, **kwargs: Any, - ) -> Tuple[torch.FloatTensor, ...]: + ) -> Tuple[torch.Tensor, ...]: eps = 1e-6 output_states = () @@ -1236,15 +1236,15 @@ class StableDiffusionReferencePipeline( def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: eps = 1e-6 # TODO(Patrick, William) - attention mask is not used for i, (resnet, attn) in 
enumerate(zip(self.resnets, self.attentions)): @@ -1292,12 +1292,12 @@ class StableDiffusionReferencePipeline( def hacked_UpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, **kwargs: Any, - ) -> torch.FloatTensor: + ) -> torch.Tensor: eps = 1e-6 for i, resnet in enumerate(self.resnets): # pop res hidden states diff --git a/examples/community/stable_diffusion_repaint.py b/examples/community/stable_diffusion_repaint.py index 02bef293bb..2addc5a62d 100644 --- a/examples/community/stable_diffusion_repaint.py +++ b/examples/community/stable_diffusion_repaint.py @@ -285,8 +285,8 @@ class StableDiffusionRepaintPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -303,10 +303,10 @@ class StableDiffusionRepaintPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -606,8 +606,8 @@ class StableDiffusionRepaintPipeline( def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -618,12 +618,12 @@ class StableDiffusionRepaintPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -671,14 +671,14 @@ class StableDiffusionRepaintPipeline( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -690,7 +690,7 @@ class StableDiffusionRepaintPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_tensorrt_img2img.py b/examples/community/stable_diffusion_tensorrt_img2img.py index 90ec138398..7264a60506 100755 --- a/examples/community/stable_diffusion_tensorrt_img2img.py +++ b/examples/community/stable_diffusion_tensorrt_img2img.py @@ -962,7 +962,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, diff --git a/examples/community/stable_diffusion_tensorrt_inpaint.py b/examples/community/stable_diffusion_tensorrt_inpaint.py index 9ace697cc7..b2d61a3dab 100755 --- a/examples/community/stable_diffusion_tensorrt_inpaint.py +++ b/examples/community/stable_diffusion_tensorrt_inpaint.py @@ -962,8 +962,8 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 1.0, num_inference_steps: int = 50, guidance_scale: float = 7.5, diff --git a/examples/community/stable_diffusion_xl_reference.py b/examples/community/stable_diffusion_xl_reference.py index 7282582baf..107afc1f8b 100644 --- a/examples/community/stable_diffusion_xl_reference.py +++ b/examples/community/stable_diffusion_xl_reference.py @@ -190,7 +190,7 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline): self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: 
Optional[int] = None, num_inference_steps: int = 50, @@ -201,14 +201,14 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -335,10 +335,10 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline): def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -453,12 +453,12 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline): def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -549,14 +549,14 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline): def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 # TODO(Patrick, William) - attention mask is not used diff --git a/examples/community/stable_unclip.py b/examples/community/stable_unclip.py index 
6acca20d6a..f13c4e0a49 100644 --- a/examples/community/stable_unclip.py +++ b/examples/community/stable_unclip.py @@ -191,7 +191,7 @@ class StableUnCLIPPipeline(DiffusionPipeline): num_images_per_prompt: int = 1, prior_num_inference_steps: int = 25, generator: Optional[torch.Generator] = None, - prior_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, text_attention_mask: Optional[torch.Tensor] = None, prior_guidance_scale: float = 4.0, diff --git a/examples/community/text_inpainting.py b/examples/community/text_inpainting.py index ea4da966bb..c4378ab96f 100644 --- a/examples/community/text_inpainting.py +++ b/examples/community/text_inpainting.py @@ -125,7 +125,7 @@ class TextInpainting(DiffusionPipeline, StableDiffusionMixin): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], text: str, height: int = 512, width: int = 512, @@ -135,10 +135,10 @@ class TextInpainting(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -177,7 +177,7 @@ class TextInpainting(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -189,7 +189,7 @@ class TextInpainting(DiffusionPipeline, StableDiffusionMixin): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
diff --git a/examples/community/tiled_upscaling.py b/examples/community/tiled_upscaling.py index bf290042e8..313b5fc6f7 100644 --- a/examples/community/tiled_upscaling.py +++ b/examples/community/tiled_upscaling.py @@ -193,8 +193,8 @@ class StableDiffusionTiledUpscalePipeline(StableDiffusionUpscalePipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + latents: Optional[torch.Tensor] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, tile_size: int = 128, tile_border: int = 32, @@ -206,7 +206,7 @@ class StableDiffusionTiledUpscalePipeline(StableDiffusionUpscalePipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`): `Image`, or tensor representing an image batch which will be upscaled. * num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -228,7 +228,7 @@ class StableDiffusionTiledUpscalePipeline(StableDiffusionUpscalePipeline): generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/examples/community/unclip_image_interpolation.py b/examples/community/unclip_image_interpolation.py index e3bb44e503..210bd61ecd 100644 --- a/examples/community/unclip_image_interpolation.py +++ b/examples/community/unclip_image_interpolation.py @@ -207,14 +207,14 @@ class UnCLIPImageInterpolationPipeline(DiffusionPipeline): @torch.no_grad() def __call__( self, - image: Optional[Union[List[PIL.Image.Image], torch.FloatTensor]] = None, + image: Optional[Union[List[PIL.Image.Image], torch.Tensor]] = None, steps: int = 5, decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, image_embeddings: Optional[torch.Tensor] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, decoder_guidance_scale: float = 8.0, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -223,7 +223,7 @@ class UnCLIPImageInterpolationPipeline(DiffusionPipeline): Function invoked when calling the pipeline for generation. Args: - image (`List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`List[PIL.Image.Image]` or `torch.Tensor`): The images to use for the image interpolation. 
Only accepts a list of two PIL Images or If you provide a tensor, it needs to comply with the configuration of [this](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json) @@ -242,9 +242,9 @@ class UnCLIPImageInterpolationPipeline(DiffusionPipeline): image_embeddings (`torch.Tensor`, *optional*): Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings can be passed for tasks like image interpolations. `image` can the be left to `None`. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. decoder_guidance_scale (`float`, *optional*, defaults to 4.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). @@ -272,19 +272,19 @@ class UnCLIPImageInterpolationPipeline(DiffusionPipeline): raise AssertionError( f"Expected 'image' List to contain PIL.Image.Image, but passed 'image' contents are {type(image[0])} and {type(image[1])}" ) - elif isinstance(image, torch.FloatTensor): + elif isinstance(image, torch.Tensor): if image.shape[0] != 2: raise AssertionError( - f"Expected 'image' to be torch.FloatTensor of shape 2 in 0th dimension, but passed 'image' size is {image.shape[0]}" + f"Expected 'image' to be torch.Tensor of shape 2 in 0th dimension, but passed 'image' size is {image.shape[0]}" ) elif isinstance(image_embeddings, torch.Tensor): if image_embeddings.shape[0] != 2: raise AssertionError( - f"Expected 'image_embeddings' to be torch.FloatTensor of shape 2 in 0th dimension, but passed 'image_embeddings' shape is {image_embeddings.shape[0]}" + f"Expected 'image_embeddings' to be torch.Tensor of shape 2 in 0th dimension, but passed 'image_embeddings' shape is {image_embeddings.shape[0]}" ) else: raise AssertionError( - f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or Torch.FloatTensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively" + f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. 
Received {type(image)} and {type(image_embeddings)} respectively" ) original_image_embeddings = self._encode_image( diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py index 241e661536..c866ce2ae9 100644 --- a/examples/community/wildcard_stable_diffusion.py +++ b/examples/community/wildcard_stable_diffusion.py @@ -166,10 +166,10 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = [], @@ -206,7 +206,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -218,7 +218,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step.
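The `isinstance` rewrites in the UnCLIP interpolation hunk above change behavior, not just documentation: fp16 or CUDA inputs used to fail the check. A quick plain-PyTorch illustration:

```python
import torch

cpu_fp32 = torch.randn(2, 3)
cpu_fp16 = torch.randn(2, 3).half()

print(isinstance(cpu_fp32, torch.FloatTensor))  # True: CPU float32 only
print(isinstance(cpu_fp16, torch.FloatTensor))  # False: half precision is rejected
print(isinstance(cpu_fp16, torch.Tensor))       # True: any dtype or device
```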
diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py index 1e88cb67ee..e7d934dd07 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py @@ -336,7 +336,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/consistency_distillation/train_lcm_distill_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_sd_wds.py index 5dcad9f6cc..7fbcb5d6fb 100644 --- a/examples/consistency_distillation/train_lcm_distill_sd_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_sd_wds.py @@ -314,7 +314,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py index a7deca72a8..c8b91e7abd 100644 --- a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py @@ -406,7 +406,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/research_projects/consistency_training/train_cm_ct_unconditional.py b/examples/research_projects/consistency_training/train_cm_ct_unconditional.py index 947de230b6..b7a1e2a545 100644 --- a/examples/research_projects/consistency_training/train_cm_ct_unconditional.py +++ b/examples/research_projects/consistency_training/train_cm_ct_unconditional.py @@ -126,7 +126,7 @@ def get_karras_sigmas( return sigmas -def get_discretized_lognormal_weights(noise_levels: torch.FloatTensor, p_mean: float = -1.1, p_std: float = 2.0): +def get_discretized_lognormal_weights(noise_levels: torch.Tensor, p_mean: float = -1.1, p_std: float = 2.0): """ Calculates the unnormalized weights for a 1D array of noise level sigma_i based on the discretized lognormal" " distribution used in the iCT paper (given in Equation 10). @@ -137,14 +137,14 @@ def get_discretized_lognormal_weights(noise_levels: torch.FloatTensor, p_mean: f return weights -def get_loss_weighting_schedule(noise_levels: torch.FloatTensor): +def get_loss_weighting_schedule(noise_levels: torch.Tensor): """ Calculates the loss weighting schedule lambda given a set of noise levels. 
""" return 1.0 / (noise_levels[1:] - noise_levels[:-1]) -def add_noise(original_samples: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor): +def add_noise(original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor): # Make sure timesteps (Karras sigmas) have the same device and dtype as original_samples sigmas = timesteps.to(device=original_samples.device, dtype=original_samples.dtype) while len(sigmas.shape) < len(original_samples.shape): diff --git a/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb b/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb index 6997c74bca..bde093802a 100644 --- a/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb +++ b/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb @@ -737,11 +737,11 @@ "class MoleculeGNNOutput(BaseOutput):\n", " \"\"\"\n", " Args:\n", - " sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):\n", + " sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):\n", " Hidden states output. Output of last layer of model.\n", " \"\"\"\n", "\n", - " sample: torch.FloatTensor\n", + " sample: torch.Tensor\n", "\n", "\n", "class MultiLayerPerceptron(nn.Module):\n", @@ -1354,7 +1354,7 @@ " r\"\"\"\n", " Args:\n", " sample: packed torch geometric object\n", - " timestep (`torch.FloatTensor` or `float` or `int): TODO verify type and shape (batch) timesteps\n", + " timestep (`torch.Tensor` or `float` or `int): TODO verify type and shape (batch) timesteps\n", " return_dict (`bool`, *optional*, defaults to `True`):\n", " Whether or not to return a [`~models.molecule_gnn.MoleculeGNNOutput`] instead of a plain tuple.\n", " Returns:\n", @@ -1404,7 +1404,7 @@ " if not return_dict:\n", " return (-eps_pos,)\n", "\n", - " return MoleculeGNNOutput(sample=torch.FloatTensor(-eps_pos).to(pos.device))" + " return MoleculeGNNOutput(sample=torch.Tensor(-eps_pos).to(pos.device))" ], "metadata": { "id": "MCeZA1qQXzoK" diff --git a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py index dcbc2704b8..61b1cbef19 100644 --- a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py +++ b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py @@ -279,8 +279,8 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -312,8 +312,8 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -333,10 +333,10 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -852,7 +852,7 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -906,9 +906,9 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -928,10 +928,10 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -963,14 +963,14 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. @@ -981,7 +981,7 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py b/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py index 4e90159601..46cabd863d 100644 --- a/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py +++ b/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py @@ -181,11 +181,11 @@ class PromptDiffusionControlNetModel(ControlNetModel): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, - controlnet_cond: torch.FloatTensor, - controlnet_query_cond: torch.FloatTensor, + controlnet_cond: torch.Tensor, + controlnet_query_cond: torch.Tensor, conditioning_scale: float = 1.0, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, @@ -194,20 +194,20 @@ class PromptDiffusionControlNetModel(ControlNetModel): cross_attention_kwargs: Optional[Dict[str, Any]] = None, guess_mode: bool = False, return_dict: bool = True, - ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]: """ The [`~PromptDiffusionControlNetModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`torch.FloatTensor`): + controlnet_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. - controlnet_query_cond (`torch.FloatTensor`): + controlnet_query_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): The scale factor for ControlNet outputs. 
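The consistency-distillation training scripts and `PromptDiffusionPipeline` above all duplicate the same `guidance_scale_embedding` helper, and the hunks show only its docstring and first two statements. As a reference point for the new `torch.Tensor` return annotation, the following is a minimal sketch of the full helper in the LCM style these files share; the body is reconstructed for illustration rather than quoted from this patch:

    import torch
    import torch.nn.functional as F

    def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32) -> torch.Tensor:
        # Sinusoidal embedding of the guidance scale: (batch,) -> (batch, embedding_dim).
        assert len(w.shape) == 1
        w = w * 1000.0
        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:
            emb = F.pad(emb, (0, 1))  # zero-pad the last column when embedding_dim is odd
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

Because `dtype` may be `torch.float16` under mixed-precision training, the result is not necessarily a `FloatTensor`, which is precisely why the broader `torch.Tensor` annotation is the accurate one.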
diff --git a/examples/research_projects/rdm/pipeline_rdm.py b/examples/research_projects/rdm/pipeline_rdm.py index e0c4847c7e..201acb95aa 100644 --- a/examples/research_projects/rdm/pipeline_rdm.py +++ b/examples/research_projects/rdm/pipeline_rdm.py @@ -163,11 +163,11 @@ class RDMPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, knn: Optional[int] = 10, **kwargs, @@ -199,11 +199,11 @@ class RDMPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -213,7 +213,7 @@ class RDMPipeline(DiffusionPipeline, StableDiffusionMixin): Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/research_projects/rdm/retriever.py b/examples/research_projects/rdm/retriever.py index 16518ed1bc..6be9785a21 100644 --- a/examples/research_projects/rdm/retriever.py +++ b/examples/research_projects/rdm/retriever.py @@ -20,7 +20,7 @@ def normalize_images(images: List[Image.Image]): return images -def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtractor) -> torch.FloatTensor: +def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtractor) -> torch.Tensor: """ Preprocesses a list of images into a batch of tensors. @@ -29,7 +29,7 @@ def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtr A list of images to preprocess. Returns: - :obj:`torch.FloatTensor`: A batch of tensors. + :obj:`torch.Tensor`: A batch of tensors.
""" images = [np.array(image) for image in images] images = [(image + 1.0) / 2.0 for image in images] diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 41ee8b9147..1ac9ceddb5 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -17,109 +17,99 @@ MODEL = "base_with_context" def load_notes_encoder(weights, model): - model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - ) + model.token_embedder.weight = nn.Parameter(torch.Tensor(weights["token_embedder"]["embedding"])) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) for lyr_num, lyr in enumerate(model.encoders): ly_weight = weights[f"layers_{lyr_num}"] - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) + lyr.layer[0].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_attention_layer_norm"]["scale"])) attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.Tensor(weights["encoder_norm"]["scale"])) return model def load_continuous_encoder(weights, model): - model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) + model.input_proj.weight = nn.Parameter(torch.Tensor(weights["input_proj"]["kernel"].T)) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - 
) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) for lyr_num, lyr in enumerate(model.encoders): ly_weight = weights[f"layers_{lyr_num}"] attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_attention_layer_norm"]["scale"])) - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.Tensor(weights["encoder_norm"]["scale"])) return model def load_decoder(weights, model): - model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) - model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) + model.conditioning_emb[0].weight = nn.Parameter(torch.Tensor(weights["time_emb_dense0"]["kernel"].T)) + model.conditioning_emb[2].weight = nn.Parameter(torch.Tensor(weights["time_emb_dense1"]["kernel"].T)) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - ) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) model.continuous_inputs_projection.weight = nn.Parameter( - torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) + torch.Tensor(weights["continuous_inputs_projection"]["kernel"].T) ) for lyr_num, lyr in enumerate(model.decoders): ly_weight = weights[f"layers_{lyr_num}"] lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + 
torch.Tensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) ) lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) + torch.Tensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) ) attention_weights = ly_weight["self_attention"] - lyr.layer[0].attention.to_q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].attention.to_k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].attention.to_v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].attention.to_out[0].weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].attention.to_q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].attention.to_k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].attention.to_v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].attention.to_out[0].weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].attention.to_q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[1].attention.to_k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[1].attention.to_v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[1].attention.to_out[0].weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[1].attention.to_q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[1].attention.to_k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[1].attention.to_v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[1].attention.to_out[0].weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + torch.Tensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) ) - lyr.layer[2].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[2].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) lyr.layer[2].film.scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) + torch.Tensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) ) - lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + model.decoder_norm.weight = 
nn.Parameter(torch.Tensor(weights["decoder_norm"]["scale"])) - model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + model.spec_out.weight = nn.Parameter(torch.Tensor(weights["spec_out_dense"]["kernel"].T)) return model diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 50866e3a7a..3d4fccb207 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -282,15 +282,15 @@ class BasicTransformerBlock(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -477,10 +477,10 @@ class TemporalBasicTransformerBlock(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, + hidden_states: torch.Tensor, num_frames: int, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_hidden_states: Optional[torch.Tensor] = None, + ) -> torch.Tensor: # Notice that normalization is always applied before the real computation in the following blocks. # 0. 
Self-Attention batch_size = hidden_states.shape[0] diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index ea1c987e95..cbb07eafa3 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -503,9 +503,9 @@ class Attention(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, **cross_attention_kwargs, ) -> torch.Tensor: r""" @@ -751,10 +751,10 @@ class AttnProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -863,9 +863,9 @@ class CustomDiffusionAttnProcessor(nn.Module): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: batch_size, sequence_length, _ = hidden_states.shape attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) @@ -928,9 +928,9 @@ class AttnAddedKVProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -1001,9 +1001,9 @@ class AttnAddedKVProcessor2_0: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -1080,9 +1080,9 @@ class XFormersAttnAddedKVProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: residual = hidden_states hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) @@ -1151,13 +1151,13 @@ class XFormersAttnProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, 
**kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1243,13 +1243,13 @@ class AttnProcessorNPU: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1349,13 +1349,13 @@ class AttnProcessor2_0: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1448,13 +1448,13 @@ class FusedAttnProcessor2_0: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -1581,10 +1581,10 @@ class CustomDiffusionXFormersAttnProcessor(nn.Module): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: batch_size, sequence_length, _ = ( hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape ) @@ -1692,10 +1692,10 @@ class CustomDiffusionAttnProcessor2_0(nn.Module): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: batch_size, sequence_length, _ = hidden_states.shape attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) if self.train_q_out: @@ -1773,10 +1773,10 @@ class SlicedAttnProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: residual = hidden_states input_ndim = hidden_states.ndim @@ -1860,11 +1860,11 @@ class SlicedAttnAddedKVProcessor: def __call__( self, attn: "Attention", - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: residual = hidden_states if attn.spatial_norm is not None: @@ -1957,7 +1957,7 @@ class SpatialNorm(nn.Module): self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) - def forward(self, f: torch.FloatTensor, zq: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor: f_size = f.shape[-2:] zq = F.interpolate(zq, size=f_size, mode="nearest") norm_f = self.norm_layer(f) @@ -2003,7 +2003,7 @@ class LoRAAttnProcessor(nn.Module): self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2064,7 +2064,7 @@ class LoRAAttnProcessor2_0(nn.Module): self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) 
- def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2143,7 +2143,7 @@ class LoRAXFormersAttnProcessor(nn.Module): self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2202,7 +2202,7 @@ class LoRAAttnAddedKVProcessor(nn.Module): self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2264,12 +2264,12 @@ class IPAdapterAttnProcessor(nn.Module): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, - ip_adapter_masks: Optional[torch.FloatTensor] = None, + ip_adapter_masks: Optional[torch.Tensor] = None, ): residual = hidden_states @@ -2467,12 +2467,12 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, - ip_adapter_masks: Optional[torch.FloatTensor] = None, + ip_adapter_masks: Optional[torch.Tensor] = None, ): residual = hidden_states diff --git a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py index fc2041d2e9..85b4195077 100644 --- a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py @@ -112,9 +112,7 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin): self.register_to_config(force_upcast=False) @apply_forward_hook - def encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[AutoencoderKLOutput, Tuple[torch.FloatTensor]]: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[AutoencoderKLOutput, Tuple[torch.Tensor]]: h = self.encoder(x) moments = self.quant_conv(h) posterior = DiagonalGaussianDistribution(moments) @@ -126,11 +124,11 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin): def _decode( self, - z: torch.FloatTensor, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, + z: torch.Tensor, 
+ image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: z = self.post_quant_conv(z) dec = self.decoder(z, image, mask) @@ -142,12 +140,12 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin): @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, generator: Optional[torch.Generator] = None, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: decoded = self._decode(z, image, mask).sample if not return_dict: @@ -157,16 +155,16 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin): def forward( self, - sample: torch.FloatTensor, - mask: Optional[torch.FloatTensor] = None, + sample: torch.Tensor, + mask: Optional[torch.Tensor] = None, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. - mask (`torch.FloatTensor`, *optional*, defaults to `None`): Optional inpainting mask. + sample (`torch.Tensor`): Input sample. + mask (`torch.Tensor`, *optional*, defaults to `None`): Optional inpainting mask. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index 6755cd7146..567d78d1fe 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -237,13 +237,13 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. @@ -268,7 +268,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): return AutoencoderKLOutput(latent_dist=posterior) - def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): return self.tiled_decode(z, return_dict=return_dict) @@ -281,14 +281,12 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): return DecoderOutput(sample=dec) @apply_forward_hook - def decode( - self, z: torch.FloatTensor, return_dict: bool = True, generator=None - ) -> Union[DecoderOutput, torch.FloatTensor]: + def decode(self, z: torch.Tensor, return_dict: bool = True, generator=None) -> Union[DecoderOutput, torch.Tensor]: """ Decode a batch of images. 
Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -321,7 +319,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b - def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: + def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -331,7 +329,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): output, but they should be much less noticeable. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. @@ -375,12 +373,12 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): return AutoencoderKLOutput(latent_dist=posterior) - def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: r""" Decode a batch of images using a tiled decoder. Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -425,14 +423,14 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py index b12226fa4b..67540cb7dc 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -86,10 +86,10 @@ class TemporalDecoder(nn.Module): def forward( self, - sample: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, + sample: torch.Tensor, + image_only_indicator: torch.Tensor, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r"""The forward method of the `Decoder` class.""" sample = self.conv_in(sample) @@ -315,13 +315,13 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. 
Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. @@ -341,15 +341,15 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin): @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, num_frames: int, return_dict: bool = True, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: """ Decode a batch of images. Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -370,15 +370,15 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, num_frames: int = 1, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py index a7047acdfd..39b885b452 100644 --- a/src/diffusers/models/autoencoders/autoencoder_tiny.py +++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py @@ -157,11 +157,11 @@ class AutoencoderTiny(ModelMixin, ConfigMixin): if isinstance(module, (EncoderTiny, DecoderTiny)): module.gradient_checkpointing = value - def scale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor: + def scale_latents(self, x: torch.Tensor) -> torch.Tensor: """raw latents -> [0, 1]""" return x.div(2 * self.latent_magnitude).add(self.latent_shift).clamp(0, 1) - def unscale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor: + def unscale_latents(self, x: torch.Tensor) -> torch.Tensor: """[0, 1] -> raw latents""" return x.sub(self.latent_shift).mul(2 * self.latent_magnitude) @@ -194,7 +194,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin): """ self.enable_tiling(False) - def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor: + def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -202,10 +202,10 @@ class AutoencoderTiny(ModelMixin, ConfigMixin): tiles overlap and are blended together to form a smooth output. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. Returns: - `torch.FloatTensor`: Encoded batch of images. + `torch.Tensor`: Encoded batch of images. """ # scale of encoder output relative to input sf = self.spatial_scale_factor @@ -242,7 +242,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin): tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out) return out - def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor: + def _tiled_decode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. 
When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -250,10 +250,10 @@ class AutoencoderTiny(ModelMixin, ConfigMixin): tiles overlap and are blended together to form a smooth output. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. Returns: - `torch.FloatTensor`: Encoded batch of images. + `torch.Tensor`: Encoded batch of images. """ # scale of decoder output relative to input sf = self.spatial_scale_factor @@ -290,9 +290,7 @@ class AutoencoderTiny(ModelMixin, ConfigMixin): return out @apply_forward_hook - def encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[AutoencoderTinyOutput, Tuple[torch.FloatTensor]]: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[AutoencoderTinyOutput, Tuple[torch.Tensor]]: if self.use_slicing and x.shape[0] > 1: output = [ self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x_slice) for x_slice in x.split(1) @@ -308,8 +306,8 @@ class AutoencoderTiny(ModelMixin, ConfigMixin): @apply_forward_hook def decode( - self, x: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + self, x: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: if self.use_slicing and x.shape[0] > 1: output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)] output = torch.cat(output) @@ -323,12 +321,12 @@ class AutoencoderTiny(ModelMixin, ConfigMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. """ diff --git a/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py index 7287cbd43f..212c465377 100644 --- a/src/diffusers/models/autoencoders/consistency_decoder_vae.py +++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py @@ -276,13 +276,13 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain tuple. @@ -312,22 +312,22 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin): @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, num_inference_steps: int = 2, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: """ Decodes the input latent vector `z` using the consistency decoder VAE model. Args: - z (torch.FloatTensor): The input latent vector. + z (torch.Tensor): The input latent vector. 
generator (Optional[torch.Generator]): The random number generator. Default is None. return_dict (bool): Whether to return the output as a dictionary. Default is True. num_inference_steps (int): The number of inference steps. Default is 2. Returns: - Union[DecoderOutput, Tuple[torch.FloatTensor]]: The decoded output. + Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output. """ z = (z * self.config.scaling_factor - self.means) / self.stds @@ -370,9 +370,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin): b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b - def tiled_encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[ConsistencyDecoderVAEOutput, Tuple]: + def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[ConsistencyDecoderVAEOutput, Tuple]: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -382,7 +380,7 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin): output, but they should be much less noticeable. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain tuple. @@ -429,14 +427,14 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/vae.py b/src/diffusers/models/autoencoders/vae.py index 75503dc6a4..333842905b 100644 --- a/src/diffusers/models/autoencoders/vae.py +++ b/src/diffusers/models/autoencoders/vae.py @@ -36,11 +36,11 @@ class DecoderOutput(BaseOutput): Output of decoding method. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The decoded output sample from the last layer of the model. 
""" - sample: torch.FloatTensor + sample: torch.Tensor class Encoder(nn.Module): @@ -136,7 +136,7 @@ class Encoder(nn.Module): self.gradient_checkpointing = False - def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, sample: torch.Tensor) -> torch.Tensor: r"""The forward method of the `Encoder` class.""" sample = self.conv_in(sample) @@ -282,9 +282,9 @@ class Decoder(nn.Module): def forward( self, - sample: torch.FloatTensor, - latent_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + sample: torch.Tensor, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: r"""The forward method of the `Decoder` class.""" sample = self.conv_in(sample) @@ -367,7 +367,7 @@ class UpSample(nn.Module): self.out_channels = out_channels self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1) - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `UpSample` class.""" x = torch.relu(x) x = self.deconv(x) @@ -416,7 +416,7 @@ class MaskConditionEncoder(nn.Module): self.layers = nn.Sequential(*layers) - def forward(self, x: torch.FloatTensor, mask=None) -> torch.FloatTensor: + def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: r"""The forward method of the `MaskConditionEncoder` class.""" out = {} for l in range(len(self.layers)): @@ -533,11 +533,11 @@ class MaskConditionDecoder(nn.Module): def forward( self, - z: torch.FloatTensor, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, - latent_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + z: torch.Tensor, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: r"""The forward method of the `MaskConditionDecoder` class.""" sample = z sample = self.conv_in(sample) @@ -711,7 +711,7 @@ class VectorQuantizer(nn.Module): back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds) return back.reshape(ishape) - def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, Tuple]: + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, Tuple]: # reshape z -> (batch, height, width, channel) and flatten z = z.permute(0, 2, 3, 1).contiguous() z_flattened = z.view(-1, self.vq_embed_dim) @@ -730,7 +730,7 @@ class VectorQuantizer(nn.Module): loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2) # preserve gradients - z_q: torch.FloatTensor = z + (z_q - z).detach() + z_q: torch.Tensor = z + (z_q - z).detach() # reshape back to match original input shape z_q = z_q.permute(0, 3, 1, 2).contiguous() @@ -745,7 +745,7 @@ class VectorQuantizer(nn.Module): return z_q, loss, (perplexity, min_encodings, min_encoding_indices) - def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.FloatTensor: + def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.Tensor: # shape specifying (batch, height, width, channel) if self.remap is not None: indices = indices.reshape(shape[0], -1) # add batch axis @@ -753,7 +753,7 @@ class VectorQuantizer(nn.Module): indices = indices.reshape(-1) # flatten again # get quantized latent vectors - z_q: torch.FloatTensor = self.embedding(indices) + z_q: torch.Tensor = self.embedding(indices) if shape is not None: z_q = z_q.view(shape) @@ -776,7 +776,7 
@@ class DiagonalGaussianDistribution(object): self.mean, device=self.parameters.device, dtype=self.parameters.dtype ) - def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor: + def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor: # make sure sample is on the same device as the parameters and has same dtype sample = randn_tensor( self.mean.shape, @@ -873,7 +873,7 @@ class EncoderTiny(nn.Module): self.layers = nn.Sequential(*layers) self.gradient_checkpointing = False - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `EncoderTiny` class.""" if self.training and self.gradient_checkpointing: @@ -956,7 +956,7 @@ class DecoderTiny(nn.Module): self.layers = nn.Sequential(*layers) self.gradient_checkpointing = False - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `DecoderTiny` class.""" # Clamp. x = torch.tanh(x / 3) * 3 diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 9dc5292887..2618a8b15f 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -665,10 +665,10 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, - controlnet_cond: torch.FloatTensor, + controlnet_cond: torch.Tensor, conditioning_scale: float = 1.0, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, @@ -677,18 +677,18 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin): cross_attention_kwargs: Optional[Dict[str, Any]] = None, guess_mode: bool = False, return_dict: bool = True, - ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]: """ The [`ControlNetModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`torch.FloatTensor`): + controlnet_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): The scale factor for ControlNet outputs. diff --git a/src/diffusers/models/controlnet_xs.py b/src/diffusers/models/controlnet_xs.py index a4f9e61f37..7c3ade26f1 100644 --- a/src/diffusers/models/controlnet_xs.py +++ b/src/diffusers/models/controlnet_xs.py @@ -17,7 +17,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.utils.checkpoint -from torch import FloatTensor, nn +from torch import Tensor, nn from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, is_torch_version, logging @@ -54,12 +54,12 @@ class ControlNetXSOutput(BaseOutput): The output of [`UNetControlNetXSModel`]. Args: - sample (`FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`Tensor` of shape `(batch_size, num_channels, height, width)`): The output of the `UNetControlNetXSModel`. 
Unlike `ControlNetOutput` this is NOT to be added to the base model output, but is already the final output. """ - sample: FloatTensor = None + sample: Tensor = None class DownBlockControlNetXSAdapter(nn.Module): @@ -1001,7 +1001,7 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin): def forward( self, - sample: FloatTensor, + sample: Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, controlnet_cond: Optional[torch.Tensor] = None, @@ -1018,13 +1018,13 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin): The [`ControlNetXSModel`] forward method. Args: - sample (`FloatTensor`): + sample (`Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`FloatTensor`): + controlnet_cond (`Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): How much the control model affects the base model outputs. @@ -1402,16 +1402,16 @@ class ControlNetXSCrossAttnDownBlock2D(nn.Module): def forward( self, - hidden_states_base: FloatTensor, - temb: FloatTensor, - encoder_hidden_states: Optional[FloatTensor] = None, - hidden_states_ctrl: Optional[FloatTensor] = None, + hidden_states_base: Tensor, + temb: Tensor, + encoder_hidden_states: Optional[Tensor] = None, + hidden_states_ctrl: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, - attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> Tuple[FloatTensor, FloatTensor, Tuple[FloatTensor, ...], Tuple[FloatTensor, ...]]: + ) -> Tuple[Tensor, Tensor, Tuple[Tensor, ...], Tuple[Tensor, ...]]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1626,16 +1626,16 @@ class ControlNetXSCrossAttnMidBlock2D(nn.Module): def forward( self, - hidden_states_base: FloatTensor, - temb: FloatTensor, - encoder_hidden_states: FloatTensor, - hidden_states_ctrl: Optional[FloatTensor] = None, + hidden_states_base: Tensor, + temb: Tensor, + encoder_hidden_states: Tensor, + hidden_states_ctrl: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - attention_mask: Optional[FloatTensor] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> Tuple[FloatTensor, FloatTensor]: + ) -> Tuple[Tensor, Tensor]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. 
`scale` will be ignored.") @@ -1807,18 +1807,18 @@ class ControlNetXSCrossAttnUpBlock2D(nn.Module): def forward( self, - hidden_states: FloatTensor, - res_hidden_states_tuple_base: Tuple[FloatTensor, ...], - res_hidden_states_tuple_ctrl: Tuple[FloatTensor, ...], - temb: FloatTensor, - encoder_hidden_states: Optional[FloatTensor] = None, + hidden_states: Tensor, + res_hidden_states_tuple_base: Tuple[Tensor, ...], + res_hidden_states_tuple_ctrl: Tuple[Tensor, ...], + temb: Tensor, + encoder_hidden_states: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, upsample_size: Optional[int] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> FloatTensor: + ) -> Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") diff --git a/src/diffusers/models/downsampling.py b/src/diffusers/models/downsampling.py index 6d556e1e67..4e384e731c 100644 --- a/src/diffusers/models/downsampling.py +++ b/src/diffusers/models/downsampling.py @@ -129,7 +129,7 @@ class Downsample2D(nn.Module): else: self.conv = conv - def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -180,24 +180,24 @@ class FirDownsample2D(nn.Module): def _downsample_2d( self, - hidden_states: torch.FloatTensor, - weight: Optional[torch.FloatTensor] = None, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + weight: Optional[torch.Tensor] = None, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """Fused `Conv2d()` followed by `downsample_2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary order. Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - weight (`torch.FloatTensor`, *optional*): + weight (`torch.Tensor`, *optional*): Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to average pooling. factor (`int`, *optional*, default to `2`): @@ -206,7 +206,7 @@ class FirDownsample2D(nn.Module): Scaling factor for signal magnitude. 
 
         Returns:
-            output (`torch.FloatTensor`):
+            output (`torch.Tensor`):
                 Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same
                 datatype as `x`.
         """
@@ -244,7 +244,7 @@ class FirDownsample2D(nn.Module):
 
         return output
 
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if self.use_conv:
             downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel)
             hidden_states = downsample_input + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
@@ -286,11 +286,11 @@ class KDownsample2D(nn.Module):
 
 def downsample_2d(
-    hidden_states: torch.FloatTensor,
-    kernel: Optional[torch.FloatTensor] = None,
+    hidden_states: torch.Tensor,
+    kernel: Optional[torch.Tensor] = None,
     factor: int = 2,
     gain: float = 1,
-) -> torch.FloatTensor:
+) -> torch.Tensor:
     r"""Downsample2D a batch of 2D images with the given filter.
 
     Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the
     given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the
@@ -298,9 +298,9 @@ def downsample_2d(
     shape is a multiple of the downsampling factor.
 
     Args:
-        hidden_states (`torch.FloatTensor`)
+        hidden_states (`torch.Tensor`)
             Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
-        kernel (`torch.FloatTensor`, *optional*):
+        kernel (`torch.Tensor`, *optional*):
             FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
             corresponds to average pooling.
         factor (`int`, *optional*, default to `2`):
@@ -309,7 +309,7 @@ def downsample_2d(
             Scaling factor for signal magnitude.
 
     Returns:
-        output (`torch.FloatTensor`):
+        output (`torch.Tensor`):
             Tensor of the shape `[N, C, H // factor, W // factor]`
     """
 
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index ced520bb82..d13f8a06cf 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -424,7 +424,7 @@ class TextImageProjection(nn.Module):
         self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim)
         self.text_proj = nn.Linear(text_embed_dim, cross_attention_dim)
 
-    def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor):
+    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
         batch_size = text_embeds.shape[0]
 
         # image
@@ -450,7 +450,7 @@ class ImageProjection(nn.Module):
         self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim)
         self.norm = nn.LayerNorm(cross_attention_dim)
 
-    def forward(self, image_embeds: torch.FloatTensor):
+    def forward(self, image_embeds: torch.Tensor):
         batch_size = image_embeds.shape[0]
 
         # image
@@ -468,7 +468,7 @@ class IPAdapterFullImageProjection(nn.Module):
         self.ff = FeedForward(image_embed_dim, cross_attention_dim, mult=1, activation_fn="gelu")
         self.norm = nn.LayerNorm(cross_attention_dim)
 
-    def forward(self, image_embeds: torch.FloatTensor):
+    def forward(self, image_embeds: torch.Tensor):
         return self.norm(self.ff(image_embeds))
 
 
@@ -482,7 +482,7 @@ class IPAdapterFaceIDImageProjection(nn.Module):
         self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu")
         self.norm = nn.LayerNorm(cross_attention_dim)
 
-    def forward(self, image_embeds: torch.FloatTensor):
+    def forward(self, image_embeds: torch.Tensor):
         x = self.ff(image_embeds)
         x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
         return self.norm(x)
@@ -530,7 +530,7 @@ class TextImageTimeEmbedding(nn.Module):
         self.text_norm = nn.LayerNorm(time_embed_dim)
         self.image_proj = nn.Linear(image_embed_dim, time_embed_dim)
 
-    def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor):
+    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
         # text
         time_text_embeds = self.text_proj(text_embeds)
         time_text_embeds = self.text_norm(time_text_embeds)
@@ -547,7 +547,7 @@ class ImageTimeEmbedding(nn.Module):
         self.image_proj = nn.Linear(image_embed_dim, time_embed_dim)
         self.image_norm = nn.LayerNorm(time_embed_dim)
 
-    def forward(self, image_embeds: torch.FloatTensor):
+    def forward(self, image_embeds: torch.Tensor):
         # image
         time_image_embeds = self.image_proj(image_embeds)
         time_image_embeds = self.image_norm(time_image_embeds)
@@ -577,7 +577,7 @@ class ImageHintTimeEmbedding(nn.Module):
             nn.Conv2d(256, 4, 3, padding=1),
         )
 
-    def forward(self, image_embeds: torch.FloatTensor, hint: torch.FloatTensor):
+    def forward(self, image_embeds: torch.Tensor, hint: torch.Tensor):
         # image
         time_image_embeds = self.image_proj(image_embeds)
         time_image_embeds = self.image_norm(time_image_embeds)
@@ -1007,7 +1007,7 @@ class MultiIPAdapterImageProjection(nn.Module):
         super().__init__()
         self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers)
 
-    def forward(self, image_embeds: List[torch.FloatTensor]):
+    def forward(self, image_embeds: List[torch.Tensor]):
         projected_image_embeds = []
 
         # currently, we accept `image_embeds` as
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index adda53a114..00b55cd9c9 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -58,7 +58,7 @@ class ResnetBlockCondNorm2D(nn.Module):
         non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
         time_embedding_norm (`str`, *optional*, default to `"ada_group"` ):
             The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial".
-        kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see
+        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
             [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
         output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
         use_in_shortcut (`bool`, *optional*, default to `True`):
@@ -146,7 +146,7 @@ class ResnetBlockCondNorm2D(nn.Module):
             bias=conv_shortcut_bias,
         )
 
-    def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+    def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -204,7 +204,7 @@ class ResnetBlock2D(nn.Module):
         time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config. By default, apply
             timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a stronger
            conditioning with scale and shift.
-        kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see
+        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
             [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
         output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
         use_in_shortcut (`bool`, *optional*, default to `True`):
@@ -232,7 +232,7 @@ class ResnetBlock2D(nn.Module):
         non_linearity: str = "swish",
         skip_time_act: bool = False,
         time_embedding_norm: str = "default",  # default, scale_shift
-        kernel: Optional[torch.FloatTensor] = None,
+        kernel: Optional[torch.Tensor] = None,
         output_scale_factor: float = 1.0,
         use_in_shortcut: Optional[bool] = None,
         up: bool = False,
@@ -317,7 +317,7 @@ class ResnetBlock2D(nn.Module):
             bias=conv_shortcut_bias,
         )
 
-    def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+    def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -605,7 +605,7 @@ class TemporalResnetBlock(nn.Module):
             padding=0,
         )
 
-    def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
         hidden_states = input_tensor
 
         hidden_states = self.norm1(hidden_states)
@@ -685,8 +685,8 @@ class SpatioTemporalResBlock(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
         image_only_indicator: Optional[torch.Tensor] = None,
     ):
         num_frames = image_only_indicator.shape[-1]
diff --git a/src/diffusers/models/transformers/dual_transformer_2d.py b/src/diffusers/models/transformers/dual_transformer_2d.py
index e2f1b8538c..edc8cbf783 100644
--- a/src/diffusers/models/transformers/dual_transformer_2d.py
+++ b/src/diffusers/models/transformers/dual_transformer_2d.py
@@ -106,14 +106,13 @@ class DualTransformer2DModel(nn.Module):
         """
         Args:
             hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
-                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
-                hidden_states.
+                When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states.
             encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                 Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                 self-attention.
             timestep ( `torch.long`, *optional*):
                 Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
-            attention_mask (`torch.FloatTensor`, *optional*):
+            attention_mask (`torch.Tensor`, *optional*):
                 Optional attention mask to be applied in Attention.
             cross_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
diff --git a/src/diffusers/models/transformers/prior_transformer.py b/src/diffusers/models/transformers/prior_transformer.py
index 990eabe2c3..8dbcfc64e0 100644
--- a/src/diffusers/models/transformers/prior_transformer.py
+++ b/src/diffusers/models/transformers/prior_transformer.py
@@ -26,11 +26,11 @@ class PriorTransformerOutput(BaseOutput):
     The output of [`PriorTransformer`].
 
     Args:
-        predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
+        predicted_image_embedding (`torch.Tensor` of shape `(batch_size, embedding_dim)`):
             The predicted CLIP image embedding conditioned on the CLIP text embedding input.
     """
 
-    predicted_image_embedding: torch.FloatTensor
+    predicted_image_embedding: torch.Tensor
 
 
 class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
@@ -246,8 +246,8 @@ class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, Pef
         self,
         hidden_states,
         timestep: Union[torch.Tensor, float, int],
-        proj_embedding: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        proj_embedding: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.BoolTensor] = None,
         return_dict: bool = True,
     ):
@@ -255,13 +255,13 @@ class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, Pef
         The [`PriorTransformer`] forward method.
 
         Args:
-            hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
+            hidden_states (`torch.Tensor` of shape `(batch_size, embedding_dim)`):
                 The currently predicted image embeddings.
             timestep (`torch.LongTensor`):
                 Current denoising step.
-            proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
+            proj_embedding (`torch.Tensor` of shape `(batch_size, embedding_dim)`):
                 Projected embedding vector the denoising process is conditioned on.
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_embeddings, embedding_dim)`):
+            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, num_embeddings, embedding_dim)`):
                 Hidden states of the text embeddings the denoising process is conditioned on.
             attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`):
                 Text mask for the text embeddings.
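Note on the substitution itself: every hunk in this patch changes annotations only, so no runtime behavior is affected. A minimal sketch (illustrative only, not part of the patch; `scale_sample` is a hypothetical helper) of why `torch.Tensor` is the more accurate hint: `torch.FloatTensor` names only the float32 CPU tensor type, so the fp16/bf16 and CUDA tensors these modules routinely receive never match it, while every tensor is an instance of `torch.Tensor`.

    import torch

    def scale_sample(sample: torch.Tensor, factor: float = 0.5) -> torch.Tensor:
        # Accepts any dtype/device; a `torch.FloatTensor` hint would misdescribe fp16 or CUDA inputs.
        return sample * factor

    x = torch.randn(1, 4, 8, 8, dtype=torch.float16)
    print(isinstance(x, torch.Tensor))       # True
    print(isinstance(x, torch.FloatTensor))  # False: `FloatTensor` means float32 on CPU only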
diff --git a/src/diffusers/models/transformers/t5_film_transformer.py b/src/diffusers/models/transformers/t5_film_transformer.py
index bff98db021..1dea37a259 100644
--- a/src/diffusers/models/transformers/t5_film_transformer.py
+++ b/src/diffusers/models/transformers/t5_film_transformer.py
@@ -86,7 +86,7 @@ class T5FilmDecoder(ModelMixin, ConfigMixin):
         self.post_dropout = nn.Dropout(p=dropout_rate)
         self.spec_out = nn.Linear(d_model, input_dims, bias=False)
 
-    def encoder_decoder_mask(self, query_input: torch.FloatTensor, key_input: torch.FloatTensor) -> torch.FloatTensor:
+    def encoder_decoder_mask(self, query_input: torch.Tensor, key_input: torch.Tensor) -> torch.Tensor:
         mask = torch.mul(query_input.unsqueeze(-1), key_input.unsqueeze(-2))
         return mask.unsqueeze(-3)
 
@@ -195,13 +195,13 @@ class DecoderLayer(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        conditioning_emb: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        conditioning_emb: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         encoder_decoder_position_bias=None,
-    ) -> Tuple[torch.FloatTensor]:
+    ) -> Tuple[torch.Tensor]:
         hidden_states = self.layer[0](
             hidden_states,
             conditioning_emb=conditioning_emb,
@@ -249,10 +249,10 @@ class T5LayerSelfAttentionCond(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        conditioning_emb: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        hidden_states: torch.Tensor,
+        conditioning_emb: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         # pre_self_attention_layer_norm
         normed_hidden_states = self.layer_norm(hidden_states)
 
@@ -292,10 +292,10 @@ class T5LayerCrossAttention(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        key_value_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         normed_hidden_states = self.layer_norm(hidden_states)
         attention_output = self.attention(
             normed_hidden_states,
@@ -328,9 +328,7 @@ class T5LayerFFCond(nn.Module):
         self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)
         self.dropout = nn.Dropout(dropout_rate)
 
-    def forward(
-        self, hidden_states: torch.FloatTensor, conditioning_emb: Optional[torch.FloatTensor] = None
-    ) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, conditioning_emb: Optional[torch.Tensor] = None) -> torch.Tensor:
         forwarded_states = self.layer_norm(hidden_states)
         if conditioning_emb is not None:
             forwarded_states = self.film(forwarded_states, conditioning_emb)
@@ -361,7 +359,7 @@ class T5DenseGatedActDense(nn.Module):
         self.dropout = nn.Dropout(dropout_rate)
         self.act = NewGELUActivation()
 
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_gelu = self.act(self.wi_0(hidden_states))
         hidden_linear = self.wi_1(hidden_states)
         hidden_states = hidden_gelu * hidden_linear
@@ -390,7 +388,7 @@ class T5LayerNorm(nn.Module):
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
 
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
         # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
         # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
@@ -431,7 +429,7 @@ class T5FiLMLayer(nn.Module):
         super().__init__()
         self.scale_bias = nn.Linear(in_features, out_features * 2, bias=False)
 
-    def forward(self, x: torch.FloatTensor, conditioning_emb: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, x: torch.Tensor, conditioning_emb: torch.Tensor) -> torch.Tensor:
         emb = self.scale_bias(conditioning_emb)
         scale, shift = torch.chunk(emb, 2, -1)
         x = x * (1 + scale) + shift
diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py
index 6a2695b9e4..ef9e0de0b6 100644
--- a/src/diffusers/models/transformers/transformer_2d.py
+++ b/src/diffusers/models/transformers/transformer_2d.py
@@ -35,12 +35,12 @@ class Transformer2DModelOutput(BaseOutput):
     The output of [`Transformer2DModel`].
 
     Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
             The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
             distributions for the unnoised latent pixels.
     """
 
-    sample: torch.FloatTensor
+    sample: torch.Tensor
 
 
 class Transformer2DModel(ModelMixin, ConfigMixin):
@@ -346,9 +346,9 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
         The [`Transformer2DModel`] forward method.
 
         Args:
-            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
                 Input `hidden_states`.
-            encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+            encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
                 Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                 self-attention.
             timestep ( `torch.LongTensor`, *optional*):
diff --git a/src/diffusers/models/transformers/transformer_temporal.py b/src/diffusers/models/transformers/transformer_temporal.py
index c2d490f3d0..2e1bb041a2 100644
--- a/src/diffusers/models/transformers/transformer_temporal.py
+++ b/src/diffusers/models/transformers/transformer_temporal.py
@@ -31,11 +31,11 @@ class TransformerTemporalModelOutput(BaseOutput):
     The output of [`TransformerTemporalModel`].
 
     Args:
-        sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`):
+        sample (`torch.Tensor` of shape `(batch_size x num_frames, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input.
""" - sample: torch.FloatTensor + sample: torch.Tensor class TransformerTemporalModel(ModelMixin, ConfigMixin): @@ -120,7 +120,7 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin): def forward( self, - hidden_states: torch.FloatTensor, + hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.LongTensor] = None, timestep: Optional[torch.LongTensor] = None, class_labels: torch.LongTensor = None, @@ -132,7 +132,7 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin): The [`TransformerTemporal`] forward method. Args: - hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous): Input hidden_states. encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to @@ -283,7 +283,7 @@ class TransformerSpatioTemporalModel(nn.Module): ): """ Args: - hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): + hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states. num_frames (`int`): The number of frames to be processed per batch. This is used to reshape the hidden states. diff --git a/src/diffusers/models/unets/unet_1d.py b/src/diffusers/models/unets/unet_1d.py index 59d70f67c9..d1538cdc61 100644 --- a/src/diffusers/models/unets/unet_1d.py +++ b/src/diffusers/models/unets/unet_1d.py @@ -31,11 +31,11 @@ class UNet1DOutput(BaseOutput): The output of [`UNet1DModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, sample_size)`): The hidden states output from the last layer of the model. """ - sample: torch.FloatTensor + sample: torch.Tensor class UNet1DModel(ModelMixin, ConfigMixin): @@ -194,7 +194,7 @@ class UNet1DModel(ModelMixin, ConfigMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], return_dict: bool = True, ) -> Union[UNet1DOutput, Tuple]: @@ -202,9 +202,9 @@ class UNet1DModel(ModelMixin, ConfigMixin): The [`UNet1DModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple. 
diff --git a/src/diffusers/models/unets/unet_1d_blocks.py b/src/diffusers/models/unets/unet_1d_blocks.py
index e3163cd1d5..3c7c1cbece 100644
--- a/src/diffusers/models/unets/unet_1d_blocks.py
+++ b/src/diffusers/models/unets/unet_1d_blocks.py
@@ -66,7 +66,7 @@ class DownResnetBlock1D(nn.Module):
         if add_downsample:
             self.downsample = Downsample1D(out_channels, use_conv=True, padding=1)
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         output_states = ()
 
         hidden_states = self.resnets[0](hidden_states, temb)
@@ -128,10 +128,10 @@ class UpResnetBlock1D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Optional[Tuple[torch.FloatTensor, ...]] = None,
-        temb: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Optional[Tuple[torch.Tensor, ...]] = None,
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         if res_hidden_states_tuple is not None:
             res_hidden_states = res_hidden_states_tuple[-1]
             hidden_states = torch.cat((hidden_states, res_hidden_states), dim=1)
@@ -161,7 +161,7 @@ class ValueFunctionMidBlock1D(nn.Module):
         self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim)
         self.down2 = Downsample1D(out_channels // 4, use_conv=True)
 
-    def forward(self, x: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, x: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         x = self.res1(x, temb)
         x = self.down1(x)
         x = self.res2(x, temb)
@@ -209,7 +209,7 @@ class MidResTemporalBlock1D(nn.Module):
         if self.upsample and self.downsample:
             raise ValueError("Block cannot downsample and upsample")
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
         hidden_states = self.resnets[0](hidden_states, temb)
         for resnet in self.resnets[1:]:
             hidden_states = resnet(hidden_states, temb)
@@ -230,7 +230,7 @@ class OutConv1DBlock(nn.Module):
         self.final_conv1d_act = get_activation(act_fn)
         self.final_conv1d_2 = nn.Conv1d(embed_dim, out_channels, 1)
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         hidden_states = self.final_conv1d_1(hidden_states)
         hidden_states = rearrange_dims(hidden_states)
         hidden_states = self.final_conv1d_gn(hidden_states)
@@ -251,7 +251,7 @@ class OutValueFunctionBlock(nn.Module):
             ]
         )
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
         hidden_states = hidden_states.view(hidden_states.shape[0], -1)
         hidden_states = torch.cat((hidden_states, temb), dim=-1)
         for layer in self.final_block:
@@ -288,7 +288,7 @@ class Downsample1d(nn.Module):
         self.pad = kernel_1d.shape[0] // 2 - 1
         self.register_buffer("kernel", kernel_1d)
 
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode)
         weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
         indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
@@ -305,7 +305,7 @@ class Upsample1d(nn.Module):
         self.pad = kernel_1d.shape[0] // 2 - 1
         self.register_buffer("kernel", kernel_1d)
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode)
         weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
         indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
@@ -335,7 +335,7 @@ class SelfAttention1d(nn.Module):
         new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
         return new_projection
 
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         residual = hidden_states
         batch, channel_dim, seq = hidden_states.shape
 
@@ -390,7 +390,7 @@ class ResConvBlock(nn.Module):
         self.group_norm_2 = nn.GroupNorm(1, out_channels)
         self.gelu_2 = nn.GELU()
 
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states
 
         hidden_states = self.conv_1(hidden_states)
@@ -435,7 +435,7 @@ class UNetMidBlock1D(nn.Module):
         self.attentions = nn.ModuleList(attentions)
         self.resnets = nn.ModuleList(resnets)
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         hidden_states = self.down(hidden_states)
         for attn, resnet in zip(self.attentions, self.resnets):
             hidden_states = resnet(hidden_states)
@@ -466,7 +466,7 @@ class AttnDownBlock1D(nn.Module):
         self.attentions = nn.ModuleList(attentions)
         self.resnets = nn.ModuleList(resnets)
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         hidden_states = self.down(hidden_states)
 
         for resnet, attn in zip(self.resnets, self.attentions):
@@ -490,7 +490,7 @@ class DownBlock1D(nn.Module):
 
         self.resnets = nn.ModuleList(resnets)
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         hidden_states = self.down(hidden_states)
 
         for resnet in self.resnets:
@@ -512,7 +512,7 @@ class DownBlock1DNoSkip(nn.Module):
 
         self.resnets = nn.ModuleList(resnets)
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         hidden_states = torch.cat([hidden_states, temb], dim=1)
         for resnet in self.resnets:
             hidden_states = resnet(hidden_states)
@@ -542,10 +542,10 @@ class AttnUpBlock1D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         res_hidden_states = res_hidden_states_tuple[-1]
         hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
 
@@ -574,10 +574,10 @@ class UpBlock1D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         res_hidden_states = res_hidden_states_tuple[-1]
         hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
 
@@ -604,10 +604,10 @@ class UpBlock1DNoSkip(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         res_hidden_states = res_hidden_states_tuple[-1]
         hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
 
diff --git a/src/diffusers/models/unets/unet_2d.py b/src/diffusers/models/unets/unet_2d.py
index 5efb638222..0f36afe3f9 100644
--- a/src/diffusers/models/unets/unet_2d.py
+++ b/src/diffusers/models/unets/unet_2d.py
@@ -30,11 +30,11 @@ class UNet2DOutput(BaseOutput):
     The output of [`UNet2DModel`].
 
     Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
             The hidden states output from the last layer of the model.
     """
 
-    sample: torch.FloatTensor
+    sample: torch.Tensor
 
 
 class UNet2DModel(ModelMixin, ConfigMixin):
@@ -242,7 +242,7 @@ class UNet2DModel(ModelMixin, ConfigMixin):
 
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         class_labels: Optional[torch.Tensor] = None,
         return_dict: bool = True,
@@ -251,10 +251,10 @@ class UNet2DModel(ModelMixin, ConfigMixin):
         The [`UNet2DModel`] forward method.
 
         Args:
-            sample (`torch.FloatTensor`):
+            sample (`torch.Tensor`):
                 The noisy input tensor with the following shape `(batch, channel, height, width)`.
-            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
-            class_labels (`torch.FloatTensor`, *optional*, defaults to `None`):
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                 Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.
diff --git a/src/diffusers/models/unets/unet_2d_blocks.py b/src/diffusers/models/unets/unet_2d_blocks.py
index ef75fad25e..93a0a82cdc 100644
--- a/src/diffusers/models/unets/unet_2d_blocks.py
+++ b/src/diffusers/models/unets/unet_2d_blocks.py
@@ -561,7 +561,7 @@ class AutoencoderTinyBlock(nn.Module):
             ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`.
 
     Returns:
-        `torch.FloatTensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to
+        `torch.Tensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to
        `out_channels`.
""" @@ -582,7 +582,7 @@ class AutoencoderTinyBlock(nn.Module): ) self.fuse = nn.ReLU() - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: return self.fuse(self.conv(x) + self.skip(x)) @@ -612,8 +612,8 @@ class UNetMidBlock2D(nn.Module): output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. Returns: - `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, - in_channels, height, width)`. + `torch.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size, in_channels, + height, width)`. """ @@ -731,7 +731,7 @@ class UNetMidBlock2D(nn.Module): self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: @@ -846,13 +846,13 @@ class UNetMidBlock2DCrossAttn(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -986,13 +986,13 @@ class UNetMidBlock2DSimpleCrossAttn(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. 
`scale` will be ignored.") @@ -1118,11 +1118,11 @@ class AttnDownBlock2D(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1240,14 +1240,14 @@ class CrossAttnDownBlock2D(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - additional_residuals: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + additional_residuals: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1362,8 +1362,8 @@ class DownBlock2D(nn.Module): self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1465,7 +1465,7 @@ class DownEncoderBlock2D(nn.Module): else: self.downsamplers = None - def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
             deprecate("scale", "1.0.0", deprecation_message)
@@ -1567,7 +1567,7 @@ class AttnDownEncoderBlock2D(nn.Module):
         else:
             self.downsamplers = None
 
-    def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -1666,12 +1666,12 @@ class AttnSkipDownBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        temb: Optional[torch.FloatTensor] = None,
-        skip_sample: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        skip_sample: Optional[torch.Tensor] = None,
         *args,
         **kwargs,
-    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]:
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...], torch.Tensor]:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -1757,12 +1757,12 @@ class SkipDownBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        temb: Optional[torch.FloatTensor] = None,
-        skip_sample: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        skip_sample: Optional[torch.Tensor] = None,
         *args,
         **kwargs,
-    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]:
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...], torch.Tensor]:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -1850,8 +1850,8 @@ class ResnetDownsampleBlock2D(nn.Module):
         self.gradient_checkpointing = False
 
     def forward(
-        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs
-    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -1986,13 +1986,13 @@ class SimpleCrossAttnDownBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        temb: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
         cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
         if cross_attention_kwargs.get("scale", None) is not None:
             logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
@@ -2097,8 +2097,8 @@ class KDownBlock2D(nn.Module):
         self.gradient_checkpointing = False
 
     def forward(
-        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs
-    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -2201,13 +2201,13 @@ class KCrossAttnDownBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        temb: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
         cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
         if cross_attention_kwargs.get("scale", None) is not None:
             logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
@@ -2358,13 +2358,13 @@ class AttnUpBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
         upsample_size: Optional[int] = None,
         *args,
         **kwargs,
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -2481,15 +2481,15 @@ class CrossAttnUpBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         upsample_size: Optional[int] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         if cross_attention_kwargs is not None:
             if cross_attention_kwargs.get("scale", None) is not None:
                 logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
@@ -2616,13 +2616,13 @@ class UpBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
         upsample_size: Optional[int] = None,
         *args,
         **kwargs,
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -2741,7 +2741,7 @@ class UpDecoderBlock2D(nn.Module):
 
         self.resolution_idx = resolution_idx
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         for resnet in self.resnets:
             hidden_states = resnet(hidden_states, temb=temb)
 
@@ -2839,7 +2839,7 @@ class AttnUpDecoderBlock2D(nn.Module):
 
         self.resolution_idx = resolution_idx
 
-    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         for resnet, attn in zip(self.resnets, self.attentions):
             hidden_states = resnet(hidden_states, temb=temb)
             hidden_states = attn(hidden_states, temb=temb)
@@ -2947,13 +2947,13 @@ class AttnSkipUpBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
         skip_sample=None,
         *args,
         **kwargs,
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -3059,13 +3059,13 @@ class SkipUpBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
         skip_sample=None,
         *args,
         **kwargs,
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -3166,13 +3166,13 @@ class ResnetUpsampleBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
         upsample_size: Optional[int] = None,
         *args,
         **kwargs,
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -3310,15 +3310,15 @@ class SimpleCrossAttnUpBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
         upsample_size: Optional[int] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
         if cross_attention_kwargs.get("scale", None) is not None:
             logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
@@ -3428,13 +3428,13 @@ class KUpBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
         upsample_size: Optional[int] = None,
         *args,
         **kwargs,
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -3558,15 +3558,15 @@ class KCrossAttnUpBlock2D(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-        temb: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+        temb: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         upsample_size: Optional[int] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         res_hidden_states_tuple = res_hidden_states_tuple[-1]
         if res_hidden_states_tuple is not None:
             hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1)
@@ -3684,23 +3684,23 @@ class KAttentionBlock(nn.Module):
             cross_attention_norm=cross_attention_norm,
         )
 
-    def _to_3d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor:
+    def _to_3d(self, hidden_states: torch.Tensor, height: int, weight: int) -> torch.Tensor:
         return hidden_states.permute(0, 2, 3, 1).reshape(hidden_states.shape[0], height * weight, -1)
 
-    def _to_4d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor:
+    def _to_4d(self, hidden_states: torch.Tensor, height: int, weight: int) -> torch.Tensor:
         return hidden_states.permute(0, 2, 1).reshape(hidden_states.shape[0], -1, height, weight)
 
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
         # TODO: mark emb as non-optional (self.norm2 requires it).
         #       requires assessing impact of change to positional param interface.
-        emb: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
+        emb: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
         if cross_attention_kwargs.get("scale", None) is not None:
             logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index 16928f67fc..d07100b10e 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -60,11 +60,11 @@ class UNet2DConditionOutput(BaseOutput):
     The output of [`UNet2DConditionModel`].
 
     Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
""" - sample: torch.FloatTensor = None + sample: torch.Tensor = None class UNet2DConditionModel( @@ -1042,7 +1042,7 @@ class UNet2DConditionModel( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -1060,10 +1060,10 @@ class UNet2DConditionModel( The [`UNet2DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. diff --git a/src/diffusers/models/unets/unet_3d_blocks.py b/src/diffusers/models/unets/unet_3d_blocks.py index 75827258f6..35a732bdb9 100644 --- a/src/diffusers/models/unets/unet_3d_blocks.py +++ b/src/diffusers/models/unets/unet_3d_blocks.py @@ -411,13 +411,13 @@ class UNetMidBlock3DCrossAttn(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames) for attn, temp_attn, resnet, temp_conv in zip( @@ -544,13 +544,13 @@ class CrossAttnDownBlock3D(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Dict[str, Any] = None, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: # TODO(Patrick, William) - attention mask is not used output_states = () @@ -651,10 +651,10 @@ class DownBlock3D(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet, temp_conv in zip(self.resnets, self.temp_convs): @@ -769,15 +769,15 @@ class CrossAttnUpBlock3D(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: 
torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Dict[str, Any] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: is_freeu_enabled = ( getattr(self, "s1", None) and getattr(self, "s2", None) @@ -891,12 +891,12 @@ class UpBlock3D(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: is_freeu_enabled = ( getattr(self, "s1", None) and getattr(self, "s2", None) @@ -1008,12 +1008,12 @@ class DownBlockMotion(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, num_frames: int = 1, *args, **kwargs, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1174,14 +1174,14 @@ class CrossAttnDownBlockMotion(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - additional_residuals: Optional[torch.FloatTensor] = None, + additional_residuals: Optional[torch.Tensor] = None, ): if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: @@ -1357,16 +1357,16 @@ class CrossAttnUpBlockMotion(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", 
None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1518,14 +1518,14 @@ class UpBlockMotion(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size=None, num_frames: int = 1, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1699,14 +1699,14 @@ class UNetMidBlockCrossAttnMotion(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. 
`scale` will be ignored.") @@ -1811,8 +1811,8 @@ class MidBlockTemporalDecoder(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, + hidden_states: torch.Tensor, + image_only_indicator: torch.Tensor, ): hidden_states = self.resnets[0]( hidden_states, @@ -1862,9 +1862,9 @@ class UpBlockTemporalDecoder(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + image_only_indicator: torch.Tensor, + ) -> torch.Tensor: for resnet in self.resnets: hidden_states = resnet( hidden_states, @@ -1935,11 +1935,11 @@ class UNetMidBlockSpatioTemporal(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: hidden_states = self.resnets[0]( hidden_states, temb, @@ -2031,10 +2031,10 @@ class DownBlockSpatioTemporal(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet in self.resnets: if self.training and self.gradient_checkpointing: @@ -2141,11 +2141,11 @@ class CrossAttnDownBlockSpatioTemporal(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () blocks = list(zip(self.resnets, self.attentions)) @@ -2240,11 +2240,11 @@ class UpBlockSpatioTemporal(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] @@ -2349,12 +2349,12 @@ class CrossAttnUpBlockSpatioTemporal(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] diff --git 
a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py index 6c353c4259..b4879fe963 100644 --- a/src/diffusers/models/unets/unet_3d_condition.py +++ b/src/diffusers/models/unets/unet_3d_condition.py @@ -55,11 +55,11 @@ class UNet3DConditionOutput(BaseOutput): The output of [`UNet3DConditionModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor + sample: torch.Tensor class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): @@ -560,7 +560,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -570,15 +570,15 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]: + ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]: r""" The [`UNet3DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_channels, num_frames, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 0a5f71ed00..dbfb4f8025 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -81,8 +81,8 @@ class I2VGenXLTransformerTemporalEncoder(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + ) -> torch.Tensor: norm_hidden_states = self.norm1(hidden_states) attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None) hidden_states = attn_output + hidden_states @@ -514,7 +514,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], fps: torch.Tensor, image_latents: torch.Tensor, @@ -523,19 +523,19 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): timestep_cond: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, - ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]: + ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]: r""" The [`I2VGenXLUNet`] forward method.
Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition". - image_latents (`torch.FloatTensor`): Image encodings from the VAE. - image_embeddings (`torch.FloatTensor`): + image_latents (`torch.Tensor`): Image encodings from the VAE. + image_embeddings (`torch.Tensor`): Projection embeddings of the conditioning image computed with a vision encoder. - encoder_hidden_states (`torch.FloatTensor`): + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under diff --git a/src/diffusers/models/unets/unet_kandinsky3.py b/src/diffusers/models/unets/unet_kandinsky3.py index b981c8e17e..ff8ce25fd2 100644 --- a/src/diffusers/models/unets/unet_kandinsky3.py +++ b/src/diffusers/models/unets/unet_kandinsky3.py @@ -31,7 +31,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name @dataclass class Kandinsky3UNetOutput(BaseOutput): - sample: torch.FloatTensor = None + sample: torch.Tensor = None class Kandinsky3EncoderProj(nn.Module): diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 81cc4b1f7a..a092daa662 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -786,7 +786,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, timestep_cond: Optional[torch.Tensor] = None, @@ -801,10 +801,10 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): The [`UNetMotionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py index 0f89df8c6b..5613e3618d 100644 --- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py +++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py @@ -22,11 +22,11 @@ class UNetSpatioTemporalConditionOutput(BaseOutput): The output of [`UNetSpatioTemporalConditionModel`].
Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor = None + sample: torch.Tensor = None class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): @@ -356,7 +356,7 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, added_time_ids: torch.Tensor, @@ -366,12 +366,12 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL The [`UNetSpatioTemporalConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`. - added_time_ids: (`torch.FloatTensor`): + added_time_ids: (`torch.Tensor`): The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal embeddings and added to the time embeddings. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py index 72967411f7..75a3dbc8ed 100644 --- a/src/diffusers/models/unets/unet_stable_cascade.py +++ b/src/diffusers/models/unets/unet_stable_cascade.py @@ -131,7 +131,7 @@ class UpDownBlock2d(nn.Module): @dataclass class StableCascadeUNetOutput(BaseOutput): - sample: torch.FloatTensor = None + sample: torch.Tensor = None class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalModelMixin): diff --git a/src/diffusers/models/upsampling.py b/src/diffusers/models/upsampling.py index af6e15db30..572844d2de 100644 --- a/src/diffusers/models/upsampling.py +++ b/src/diffusers/models/upsampling.py @@ -138,9 +138,7 @@ class Upsample2D(nn.Module): else: self.Conv2d_0 = conv - def forward( - self, hidden_states: torch.FloatTensor, output_size: Optional[int] = None, *args, **kwargs - ) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -217,12 +215,12 @@ class FirUpsample2D(nn.Module): def _upsample_2d( self, - hidden_states: torch.FloatTensor, - weight: Optional[torch.FloatTensor] = None, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + weight: Optional[torch.Tensor] = None, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """Fused `upsample_2d()` followed by `Conv2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more @@ -230,19 +228,19 @@ class FirUpsample2D(nn.Module): arbitrary order. Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - weight (`torch.FloatTensor`, *optional*): + weight (`torch.Tensor`, *optional*): Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. factor (`int`, *optional*): Integer upsampling factor (default: 2). gain (`float`, *optional*): Scaling factor for signal magnitude (default: 1.0). Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same datatype as `hidden_states`. """ @@ -310,7 +308,7 @@ class FirUpsample2D(nn.Module): return output - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.use_conv: height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1) @@ -401,11 +399,11 @@ def upfirdn2d_native( def upsample_2d( - hidden_states: torch.FloatTensor, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, -) -> torch.FloatTensor: +) -> torch.Tensor: r"""Upsample2D a batch of 2D images with the given filter. Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified @@ -413,9 +411,9 @@ def upsample_2d( a: multiple of the upsampling factor. Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. factor (`int`, *optional*, default to `2`): @@ -424,7 +422,7 @@ def upsample_2d( Scaling factor for signal magnitude (default: 1.0). 
Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H * factor, W * factor]` """ assert isinstance(factor, int) and factor >= 1 diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index e5184446ce..2e38d5b671 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -30,11 +30,11 @@ class VQEncoderOutput(BaseOutput): Output of VQModel encoding method. Args: - latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The encoded output sample from the last layer of the model. """ - latents: torch.FloatTensor + latents: torch.Tensor class VQModel(ModelMixin, ConfigMixin): @@ -127,7 +127,7 @@ class VQModel(ModelMixin, ConfigMixin): ) @apply_forward_hook - def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput: h = self.encoder(x) h = self.quant_conv(h) @@ -138,8 +138,8 @@ class VQModel(ModelMixin, ConfigMixin): @apply_forward_hook def decode( - self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None - ) -> Union[DecoderOutput, torch.FloatTensor]: + self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None + ) -> Union[DecoderOutput, torch.Tensor]: # also go through quantization layer if not force_not_quantize: quant, _, _ = self.quantize(h) @@ -156,13 +156,13 @@ class VQModel(ModelMixin, ConfigMixin): return DecoderOutput(sample=dec) def forward( - self, sample: torch.FloatTensor, return_dict: bool = True - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor, ...]]: + self, sample: torch.Tensor, return_dict: bool = True + ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]: r""" The [`VQModel`] forward method. Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple. diff --git a/src/diffusers/pipelines/amused/pipeline_amused.py b/src/diffusers/pipelines/amused/pipeline_amused.py index 994455ff29..a8c24b0aee 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused.py +++ b/src/diffusers/pipelines/amused/pipeline_amused.py @@ -88,7 +88,7 @@ class AmusedPipeline(DiffusionPipeline): negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -122,16 +122,16 @@ class AmusedPipeline(DiffusionPipeline): latents (`torch.IntTensor`, *optional*): Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image generation. If not provided, the starting latents will be completely masked. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states.
- encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -140,7 +140,7 @@ class AmusedPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py index 1218e7a44c..c74275b414 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py @@ -102,7 +102,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline): negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -115,7 +115,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a @@ -141,16 +141,16 @@ class AmusedImg2ImgPipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states.
- encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -159,7 +159,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py index ab0a55cdd3..24801e0ef9 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py @@ -119,7 +119,7 @@ class AmusedInpaintPipeline(DiffusionPipeline): negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -132,13 +132,13 @@ class AmusedInpaintPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`.
White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one @@ -165,16 +165,16 @@ class AmusedInpaintPipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states. - encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -183,7 +183,7 @@ class AmusedInpaintPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 94654c4a7e..175671ece5 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -148,8 +148,8 @@ class AnimateDiffPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -169,10 +169,10 @@ class AnimateDiffPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -563,11 +563,11 @@ class AnimateDiffPipeline( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -604,27 +604,26 @@ class AnimateDiffPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. 
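Why `torch.Tensor` is the accurate hint in all of the signatures above: the legacy `torch.FloatTensor` type only matches float32 tensors on CPU, while these pipelines routinely run in fp16/bf16 on CUDA, so the old hints never described the values that actually flow through. A quick sanity check, illustrative only and assuming a recent PyTorch (not part of the patch):

import torch

x32 = torch.randn(2, 2)              # default dtype: float32 on CPU
x16 = x32.to(torch.float16)          # half precision, as with torch_dtype=torch.float16

print(isinstance(x32, torch.FloatTensor))  # True:  the legacy type matches CPU float32 only
print(isinstance(x16, torch.FloatTensor))  # False: fp16 latents/embeddings never matched the old hint
print(isinstance(x16, torch.Tensor))       # True:  torch.Tensor covers every dtype and device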
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py index 11ccafdf57..50ed54001e 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py @@ -312,10 +312,10 @@ class AnimateDiffSDXLPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -341,17 +341,17 @@ class AnimateDiffSDXLPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -784,7 +784,7 @@ class AnimateDiffSDXLPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -797,7 +797,7 @@ class AnimateDiffSDXLPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -866,13 +866,13 @@ class AnimateDiffSDXLPipeline( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -949,27 +949,27 @@ class AnimateDiffSDXLPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 975dca5d48..2adc5cdf82 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -252,8 +252,8 @@ class AnimateDiffVideoToVideoPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -273,10 +273,10 @@ class AnimateDiffVideoToVideoPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -741,11 +741,11 @@ class AnimateDiffVideoToVideoPipeline( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -790,27 +790,26 @@ class AnimateDiffVideoToVideoPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple. cross_attention_kwargs (`dict`, *optional*): diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 78b730ea91..105ca40f77 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -103,8 +103,8 @@ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin): num_waveforms_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -122,10 +122,10 @@ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin): The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
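The `get_guidance_scale_embedding` hunk in the AnimateDiff SDXL diff above only shows the first lines of the body (`assert len(w.shape) == 1`, `w = w * 1000.0`). For review context, here is a standalone sketch of the sinusoidal computation the docstring describes, reconstructed from the docstring and the visible lines rather than copied from the file:

import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # Sinusoidal embedding of the guidance scale, per the VDM reference linked in the docstring.
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1))  # zero-pad odd dims to the exact size
    return emb  # shape (len(w), embedding_dim), matching the documented return type

For `w = torch.tensor([7.5])` and the default `embedding_dim=512`, this returns a `(1, 512)` tensor, consistent with the `(len(w), embedding_dim)` shape stated in the hunk's Returns section.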
@@ -360,11 +360,11 @@ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin): num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -394,21 +394,21 @@ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py index 948caf97d2..7131083435 100644 --- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py @@ -64,7 +64,7 @@ class AudioLDM2ProjectionModelOutput(BaseOutput): """ Args: Class for AudioLDM2 projection layer's outputs. - hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states obtained by linearly projecting the hidden-states for each of the text encoders and subsequently concatenating them together. 
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -75,7 +75,7 @@ class AudioLDM2ProjectionModelOutput(BaseOutput): - 0 for tokens that are **masked**. """ - hidden_states: torch.FloatTensor + hidden_states: torch.Tensor attention_mask: Optional[torch.LongTensor] = None @@ -125,8 +125,8 @@ class AudioLDM2ProjectionModel(ModelMixin, ConfigMixin): def forward( self, - hidden_states: Optional[torch.FloatTensor] = None, - hidden_states_1: Optional[torch.FloatTensor] = None, + hidden_states: Optional[torch.Tensor] = None, + hidden_states_1: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, attention_mask_1: Optional[torch.LongTensor] = None, ): @@ -680,7 +680,7 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -696,10 +696,10 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad The [`AudioLDM2UNet2DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. encoder_attention_mask (`torch.Tensor`): A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If @@ -710,7 +710,7 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad tuple. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. - encoder_hidden_states_1 (`torch.FloatTensor`, *optional*): + encoder_hidden_states_1 (`torch.Tensor`, *optional*): A second set of encoder hidden states with shape `(batch, sequence_length_2, feature_dim_2)`. Can be used to condition the model on a different set of embeddings to `encoder_hidden_states`. 
encoder_attention_mask_1 (`torch.Tensor`, *optional*): @@ -1091,14 +1091,14 @@ class CrossAttnDownBlock2D(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, ): output_states = () num_layers = len(self.resnets) @@ -1270,15 +1270,15 @@ class UNetMidBlock2DCrossAttn(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, + ) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) num_attention_per_layer = len(self.attentions) // (len(self.resnets) - 1) @@ -1437,16 +1437,16 @@ class CrossAttnUpBlock2D(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, ): num_layers = len(self.resnets) num_attention_per_layer = len(self.attentions) // num_layers diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py index a498831877..49440830d0 100644 --- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -273,7 +273,7 @@ class AudioLDM2Pipeline(DiffusionPipeline): Generates a sequence of hidden-states from the language 
model, conditioned on the embedding inputs. Parameters: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): The sequence used as a prompt for the generation. max_new_tokens (`int`): Number of new tokens to generate. @@ -282,7 +282,7 @@ class AudioLDM2Pipeline(DiffusionPipeline): function of the model. Return: - `inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): The sequence of generated hidden-states. """ max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens @@ -311,10 +311,10 @@ class AudioLDM2Pipeline(DiffusionPipeline): do_classifier_free_guidance, transcription=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - generated_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + generated_prompt_embeds: Optional[torch.Tensor] = None, + negative_generated_prompt_embeds: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, negative_attention_mask: Optional[torch.LongTensor] = None, max_new_tokens: Optional[int] = None, @@ -337,18 +337,18 @@ class AudioLDM2Pipeline(DiffusionPipeline): The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. - generated_prompt_embeds (`torch.FloatTensor`, *optional*): + generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. @@ -361,11 +361,11 @@ class AudioLDM2Pipeline(DiffusionPipeline): max_new_tokens (`int`, *optional*, defaults to None): The number of new tokens to generate with the GPT2 language model. Returns: - prompt_embeds (`torch.FloatTensor`): + prompt_embeds (`torch.Tensor`): Text embeddings from the Flan T5 model. attention_mask (`torch.LongTensor`): Attention mask to be applied to the `prompt_embeds`. 
- generated_prompt_embeds (`torch.FloatTensor`): + generated_prompt_embeds (`torch.Tensor`): Text embeddings generated from the GPT2 langauge model. Example: @@ -821,16 +821,16 @@ class AudioLDM2Pipeline(DiffusionPipeline): num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - generated_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + generated_prompt_embeds: Optional[torch.Tensor] = None, + negative_generated_prompt_embeds: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, negative_attention_mask: Optional[torch.LongTensor] = None, max_new_tokens: Optional[int] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -865,21 +865,21 @@ class AudioLDM2Pipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - generated_prompt_embeds (`torch.FloatTensor`, *optional*): + generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. @@ -897,7 +897,7 @@ class AudioLDM2Pipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. 
The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py b/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py index d71a148103..d92a076690 100644 --- a/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py +++ b/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py @@ -298,7 +298,7 @@ class BlipImageProcessor(BaseImageProcessor): return encoded_outputs # Follows diffusers.VaeImageProcessor.postprocess - def postprocess(self, sample: torch.FloatTensor, output_type: str = "pil"): + def postprocess(self, sample: torch.Tensor, output_type: str = "pil"): if output_type not in ["pt", "np", "pil"]: raise ValueError( f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']" diff --git a/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py b/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py index c8869ad9db..1be4761a99 100644 --- a/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py +++ b/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py @@ -117,7 +117,7 @@ class Blip2VisionEmbeddings(nn.Module): self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] @@ -376,7 +376,7 @@ class Blip2VisionModel(Blip2PreTrainedModel): @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig) def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -524,15 +524,15 @@ class Blip2QFormerModel(Blip2PreTrainedModel): return_dict=None, ): r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. 
- past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key diff --git a/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py index c6772fc888..d29dddf64b 100644 --- a/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +++ b/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py @@ -186,7 +186,7 @@ class ContextCLIPTextEmbeddings(nn.Module): ctx_begin_pos: list, input_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: if ctx_embeddings is None: ctx_len = 0 diff --git a/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py b/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py index ba43b2e53d..ff23247b5f 100644 --- a/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +++ b/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py @@ -191,7 +191,7 @@ class BlipDiffusionPipeline(DiffusionPipeline): reference_image: PIL.Image.Image, source_subject_category: List[str], target_subject_category: List[str], - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 7.5, height: int = 512, width: int = 512, @@ -215,7 +215,7 @@ class BlipDiffusionPipeline(DiffusionPipeline): The source subject category. target_subject_category (`List[str]`): The target subject category. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by random sampling. diff --git a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py index befac79c63..b0c11362ff 100644 --- a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py +++ b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py @@ -105,7 +105,7 @@ class ConsistencyModelPipeline(DiffusionPipeline): return latents # Follows diffusers.VaeImageProcessor.postprocess - def postprocess_image(self, sample: torch.FloatTensor, output_type: str = "pil"): + def postprocess_image(self, sample: torch.Tensor, output_type: str = "pil"): if output_type not in ["pt", "np", "pil"]: raise ValueError( f"output_type={output_type} is not supported. 
Make sure to choose one of ['pt', 'np', or 'pil']" @@ -173,10 +173,10 @@ class ConsistencyModelPipeline(DiffusionPipeline): num_inference_steps: int = 1, timesteps: List[int] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -195,7 +195,7 @@ class ConsistencyModelPipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -205,7 +205,7 @@ class ConsistencyModelPipeline(DiffusionPipeline): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
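Several pipelines above, `AudioLDMPipeline` and `ConsistencyModelPipeline` among them, retype the `callback` argument to `Callable[[int, int, torch.Tensor], None]`. A minimal sketch of a callback matching that documented signature; the commented call and the `pipe` name are illustrative only:

```py
import torch

def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # fp16 and CUDA latents satisfy the widened torch.Tensor annotation.
    print(f"step={step} timestep={timestep} dtype={latents.dtype} "
          f"std={latents.std().item():.4f}")

# pipe(prompt="...", callback=log_latents, callback_steps=5)
```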
diff --git a/src/diffusers/pipelines/controlnet/multicontrolnet.py b/src/diffusers/pipelines/controlnet/multicontrolnet.py index 7d284f2d26..98e9eec94a 100644 --- a/src/diffusers/pipelines/controlnet/multicontrolnet.py +++ b/src/diffusers/pipelines/controlnet/multicontrolnet.py @@ -31,7 +31,7 @@ class MultiControlNetModel(ModelMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, controlnet_cond: List[torch.tensor], diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 0bf5f1fc8e..e64dcdc554 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -261,8 +261,8 @@ class StableDiffusionControlNetPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -294,8 +294,8 @@ class StableDiffusionControlNetPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -315,10 +315,10 @@ class StableDiffusionControlNetPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -846,7 +846,7 @@ class StableDiffusionControlNetPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -859,7 +859,7 @@ class StableDiffusionControlNetPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -913,11 +913,11 @@ class StableDiffusionControlNetPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -936,14 +936,14 @@ class StableDiffusionControlNetPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single ControlNet, each will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets, where a list of image lists can be passed to batch for each prompt and each ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -975,18 +975,18 @@ class StableDiffusionControlNetPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -998,7 +998,7 @@ class StableDiffusionControlNetPipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py index b983a3f8d4..86e0ddef66 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py @@ -240,7 +240,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline): condtioning_image: PIL.Image.Image, source_subject_category: List[str], target_subject_category: List[str], - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 7.5, height: int = 512, width: int = 512, @@ -266,7 +266,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline): The source subject category. target_subject_category (`List[str]`): The target subject category. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by random sampling. 
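The `StableDiffusionControlNetPipeline` diff above also retypes the return of `get_guidance_scale_embedding`. For context, a sketch of that sinusoidal embedding, consistent with the visible fragment (`assert len(w.shape) == 1; w = w * 1000.0`) and the referenced vdm code; everything past the fragment is a reconstruction, not a quotation:

```py
import torch

def get_guidance_scale_embedding(
    w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    # Sinusoidal embedding of the guidance scale w, shape (len(w), embedding_dim).
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad odd embedding widths
        emb = torch.nn.functional.pad(emb, (0, 1))
    return emb
```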
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 022f30d819..2e44efa78b 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -239,8 +239,8 @@ class StableDiffusionControlNetImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -272,8 +272,8 @@ class StableDiffusionControlNetImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -293,10 +293,10 @@ class StableDiffusionControlNetImg2ImgPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -904,11 +904,11 @@ class StableDiffusionControlNetImg2ImgPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -927,18 +927,18 @@ class StableDiffusionControlNetImg2ImgPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image to be used as the starting point for the image generation process. Can also accept image latents as `image`, and if passing latents directly they are not encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -966,18 +966,18 @@ class StableDiffusionControlNetImg2ImgPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 47dbb26eb3..cdc34819d5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -364,8 +364,8 @@ class StableDiffusionControlNetInpaintPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -397,8 +397,8 @@ class StableDiffusionControlNetInpaintPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -418,10 +418,10 @@ class StableDiffusionControlNetInpaintPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
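As the `encode_prompt` docstrings above note, embeddings can be precomputed and fed back in; with the hints widened, what flows through is a plain `torch.Tensor` in whatever dtype the pipeline runs in. A sketch under the assumption of an fp16 `StableDiffusionControlNetPipeline`; the model ids and prompts are illustrative:

```py
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

# Precompute once, reuse across calls. Both returns are float16 tensors here,
# i.e. torch.Tensor but not torch.FloatTensor, which the widened hints allow.
prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    "a photo of an astronaut riding a horse",
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="low quality",
)
```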
@@ -1121,11 +1121,11 @@ class StableDiffusionControlNetInpaintPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1144,14 +1144,14 @@ class StableDiffusionControlNetInpaintPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, NumPy array or tensor representing an image batch to be used as the starting point. For both NumPy array and PyTorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a NumPy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, NumPy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a @@ -1159,14 +1159,14 @@ class StableDiffusionControlNetInpaintPipeline( color channel (L) instead of 3, so the expected shape for PyTorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for NumPy array, it would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, - `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, + `List[List[torch.Tensor]]`, or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. 
If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -1201,18 +1201,18 @@ class StableDiffusionControlNetInpaintPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index b9c4e3c003..3cfdefa9d4 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -261,10 +261,10 @@ class StableDiffusionXLControlNetInpaintPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -290,17 +290,17 @@ class StableDiffusionXLControlNetInpaintPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1157,13 +1157,13 @@ class StableDiffusionXLControlNetInpaintPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1250,23 +1250,23 @@ class StableDiffusionXLControlNetInpaintPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. 
It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1278,7 +1278,7 @@ class StableDiffusionXLControlNetInpaintPipeline( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 2307b856ad..73bb8be89e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -233,10 +233,10 @@ class StableDiffusionXLControlNetPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -262,17 +262,17 @@ class StableDiffusionXLControlNetPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -876,7 +876,7 @@ class StableDiffusionXLControlNetPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -889,7 +889,7 @@ class StableDiffusionXLControlNetPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -948,13 +948,13 @@ class StableDiffusionXLControlNetPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -982,14 +982,14 @@ class StableDiffusionXLControlNetPipeline( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. 
If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -1025,25 +1025,25 @@ class StableDiffusionXLControlNetPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. 
If not provided, embeddings are computed from the `ip_adapter_image` input argument. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index dfd3cc239b..dbd406d928 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -287,10 +287,10 @@ class StableDiffusionXLControlNetImg2ImgPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -316,17 +316,17 @@ class StableDiffusionXLControlNetImg2ImgPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument.
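The practical effect of these loosened hints: under fp16 inference, `encode_prompt` returns half-precision tensors, which are `torch.Tensor` but were never `torch.FloatTensor` (that alias names CPU float32 tensors only). A minimal sketch, assuming illustrative checkpoint ids that are not part of this patch:

import torch
from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline

# Both model ids below are assumptions for the sketch.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
)
pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

(
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
) = pipe.encode_prompt("an astronaut riding a horse", do_classifier_free_guidance=True)

# fp16 tensors satisfy the new `torch.Tensor` hints but not the old float32-only ones.
assert prompt_embeds.dtype == torch.float16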
@@ -1082,13 +1082,13 @@ class StableDiffusionXLControlNetImg2ImgPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1119,18 +1119,18 @@ class StableDiffusionXLControlNetImg2ImgPipeline( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also + be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in + init, images must be passed as a list such that each element of the list can be correctly batched for + input to a single controlnet. height (`int`, *optional*, defaults to the size of control_image): The height in pixels of the generated image. 
Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -1169,26 +1169,26 @@ class StableDiffusionXLControlNetImg2ImgPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`.
If not provided, embeddings are computed from the `ip_adapter_image` input argument. diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 622bac8c5f..50cd24e4fa 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -188,8 +188,8 @@ class StableDiffusionControlNetXSPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -221,8 +221,8 @@ class StableDiffusionControlNetXSPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -242,10 +242,10 @@ class StableDiffusionControlNetXSPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -638,9 +638,9 @@ class StableDiffusionControlNetXSPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -657,14 +657,14 @@ class StableDiffusionControlNetXSPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image.
The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -686,14 +686,14 @@ class StableDiffusionControlNetXSPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 3ab535a054..e572412f6e 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -213,10 +213,10 @@ class StableDiffusionXLControlNetXSPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -242,17 +242,17 @@ class StableDiffusionXLControlNetXSPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -721,11 +721,11 @@ class StableDiffusionXLControlNetXSPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -751,14 +751,14 @@ class StableDiffusionXLControlNetXSPipeline( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. 
The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -787,20 +787,20 @@ class StableDiffusionXLControlNetXSPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py index 5bd396b20f..1d438bcf87 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -164,8 +164,8 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -184,10 +184,10 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -549,11 +549,11 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -593,10 +593,10 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -607,7 +607,7 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
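The `callback` hint in `pipeline_if.py` changes for the same reason: the latents handed to the callback inherit the pipeline's dtype. A hedged usage sketch (the checkpoint id is an assumption, not taken from the patch):

import torch
from diffusers import IFPipeline

pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()

def on_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Matches the documented `callback(step, timestep, latents)` signature;
    # `latents.dtype` is torch.float16 here, never a CPU FloatTensor.
    print(f"step={step} t={timestep} dtype={latents.dtype} shape={tuple(latents.shape)}")

image = pipe("a photo of a kangaroo wearing an orange hoodie", callback=on_step, callback_steps=10).images[0]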
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index 50e2cda25a..c5d9eed3ca 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -188,8 +188,8 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -208,10 +208,10 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -421,7 +421,7 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -665,11 +665,11 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -681,7 +681,7 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process.
strength (`float`, *optional*, defaults to 0.7): @@ -714,10 +714,10 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -728,7 +728,7 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 89eb97a087..cb7e9ef6f3 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -340,8 +340,8 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -360,10 +360,10 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
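The `ValueError` strings rewritten in the hunks below follow from the runtime check, which is a plain `isinstance` test: `torch.Tensor` matches any dtype or device, while the legacy `torch.FloatTensor` alias matches only CPU float32 tensors. A two-line illustration in plain PyTorch, independent of any pipeline:

import torch

half = torch.rand(1, 3, 64, 64, dtype=torch.float16)
print(isinstance(half, torch.Tensor))       # True: any dtype/device qualifies
print(isinstance(half, torch.FloatTensor))  # False: alias for CPU float32 only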
@@ -576,7 +576,7 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -607,7 +607,7 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`original_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -735,7 +735,7 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], original_image: Union[ PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] ] = None, @@ -748,11 +748,11 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 250, @@ -762,10 +762,10 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): Function invoked when calling the pipeline for generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - original_image (`torch.FloatTensor` or `PIL.Image.Image`): + original_image (`torch.Tensor` or `PIL.Image.Image`): The original image that `image` was varied from. strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -800,10 +800,10 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -814,7 +814,7 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index aabe1107fc..cb592aa567 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -192,8 +192,8 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -212,10 +212,10 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -428,7 +428,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -459,7 +459,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`mask_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...]
but is" f" {type(check_image_type)}" ) @@ -760,11 +760,11 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -776,7 +776,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. mask_image (`PIL.Image.Image`): @@ -814,10 +814,10 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -828,7 +828,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
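As the inpainting docstrings above note, `prompt_embeds`/`negative_prompt_embeds` can be precomputed and reused across calls; the IF pipelines' `encode_prompt` returns exactly that pair. A sketch under the same assumed checkpoint as before:

import torch
from diffusers import IFInpaintingPipeline

pipe = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()

prompt_embeds, negative_embeds = pipe.encode_prompt("blue sunglasses", negative_prompt="blurry, low quality")
# Both are fp16 `torch.Tensor`s; pass them back via `prompt_embeds=` /
# `negative_prompt_embeds=` to skip re-encoding the text on later calls.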
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index 1798e0dec7..aa70eb7b40 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -342,8 +342,8 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -362,10 +362,10 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -579,7 +579,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -610,7 +610,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`original_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -643,7 +643,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`mask_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...]
but is" f" {type(check_image_type)}" ) @@ -823,7 +823,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], original_image: Union[ PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] ] = None, @@ -839,11 +839,11 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, @@ -853,10 +853,10 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): Function invoked when calling the pipeline for generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - original_image (`torch.FloatTensor` or `PIL.Image.Image`): + original_image (`torch.Tensor` or `PIL.Image.Image`): The original image that `image` was varied from. mask_image (`PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be @@ -896,10 +896,10 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -910,7 +910,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
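The stage-II upscaler edited next accepts `image` as a PIL image, a numpy array, or a tensor, and under the loosened hints a half-precision tensor is fine. A usage sketch where the checkpoint id, shape, and value range are assumptions rather than anything stated by the patch:

import torch
from diffusers import IFSuperResolutionPipeline

pipe = IFSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()

low_res = torch.rand(1, 3, 64, 64, dtype=torch.float16) * 2 - 1  # assumed stage-I output in [-1, 1]
image = pipe("a photo of a kangaroo", image=low_res, noise_level=250).images[0]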
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index 36ed34cba9..fd38a87243 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -298,8 +298,8 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -318,10 +318,10 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -537,7 +537,7 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -608,7 +608,7 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): prompt: Union[str, List[str]] = None, height: int = None, width: int = None, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor] = None, + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor] = None, num_inference_steps: int = 50, timesteps: List[int] = None, guidance_scale: float = 4.0, @@ -616,11 +616,11 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 250, @@ -637,7 +637,7 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): The height in pixels of the generated image. width (`int`, *optional*, defaults to None): The width in pixels of the generated image.
- image (`PIL.Image.Image`, `np.ndarray`, `torch.FloatTensor`): + image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`): The image to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -663,10 +663,10 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -677,7 +677,7 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py b/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py index f73ef15d7d..f69f905b56 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py @@ -13,27 +13,27 @@ class TransformationModelOutput(ModelOutput): Base class for text model's outputs that also contains a pooling of the last hidden states. Args: - text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + text_embeds (`torch.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The text embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + last_hidden_state (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
+ hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one + for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - projection_state: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None + projection_state: Optional[torch.Tensor] = None + last_hidden_state: torch.Tensor = None + hidden_states: Optional[Tuple[torch.Tensor]] = None + attentions: Optional[Tuple[torch.Tensor]] = None class RobertaSeriesConfig(XLMRobertaConfig): diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index d1be4bbe63..11d81b13ea 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -278,8 +278,8 @@ class AltDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -310,8 +310,8 @@ class AltDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -331,10 +331,10 @@ class AltDiffusionPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
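For output dataclasses such as `TransformationModelOutput` (the hunk a few lines up), the field annotations now match what the model can actually emit under mixed precision. A standalone sketch mirroring the patched fields (the real class derives from `ModelOutput`, omitted here for brevity):

```python
from dataclasses import dataclass
from typing import Optional, Tuple

import torch


@dataclass
class TransformationOutputSketch:
    # Same field layout as the patched TransformationModelOutput; with
    # `torch.Tensor` hints, bf16/fp16 hidden states are correctly typed too.
    projection_state: Optional[torch.Tensor] = None
    last_hidden_state: torch.Tensor = None
    hidden_states: Optional[Tuple[torch.Tensor]] = None
    attentions: Optional[Tuple[torch.Tensor]] = None


out = TransformationOutputSketch(
    last_hidden_state=torch.randn(1, 4, 8, dtype=torch.bfloat16)
)
print(out.last_hidden_state.dtype)  # torch.bfloat16
```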
@@ -637,7 +637,7 @@ class AltDiffusionPipeline( data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -694,9 +694,9 @@ class AltDiffusionPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -738,14 +738,14 @@ class AltDiffusionPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index 53d67a1b2d..145579da0c 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -318,8 +318,8 @@ class AltDiffusionImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -350,8 +350,8 @@ class AltDiffusionImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -371,10 +371,10 @@ class AltDiffusionImg2ImgPipeline( The prompt or prompts not to guide the image generation. 
If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored
 if `guidance_scale` is less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
 provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
 argument.
@@ -721,7 +721,7 @@ class AltDiffusionImg2ImgPipeline(
 data type of the generated embeddings
 Returns:
- `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+ `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
 """
 assert len(w.shape) == 1
 w = w * 1000.0
@@ -774,8 +774,8 @@ class AltDiffusionImg2ImgPipeline(
 num_images_per_prompt: Optional[int] = 1,
 eta: Optional[float] = 0.0,
 generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
 ip_adapter_image: Optional[PipelineImageInput] = None,
 output_type: Optional[str] = "pil",
 return_dict: bool = True,
@@ -791,7 +791,7 @@ class AltDiffusionImg2ImgPipeline(
 Args:
 prompt (`str` or `List[str]`, *optional*):
 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
 `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
 numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list
 of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
@@ -824,10 +824,10 @@ class AltDiffusionImg2ImgPipeline(
 generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
 generation deterministic.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
 provided, text embeddings are generated from the `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
 ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
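The `prompt_embeds`/`negative_prompt_embeds` hints change in dozens of `__call__` and `encode_prompt` signatures like the ones above. In practice those embeddings are produced in the pipeline's own dtype (often fp16), which only the widened hint describes. A usage sketch, assuming a CUDA device and the placeholder checkpoint `runwayml/stable-diffusion-v1-5` (neither is part of the patch):

```python
import torch
from diffusers import StableDiffusionImg2ImgPipeline
from diffusers.utils import load_image

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

init_image = load_image("sketch-mountains-input.jpg")  # any RGB image

# With torch_dtype=float16 these embeddings are fp16 tensors -- valid
# `torch.Tensor` inputs, but never `torch.FloatTensor` instances.
prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    "a fantasy landscape",
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="blurry",
)

image = pipe(
    image=init_image,
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    strength=0.75,
).images[0]
```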
diff --git a/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py b/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py
index c03a3d8fc3..101d315dfe 100644
--- a/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py
+++ b/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py
@@ -112,9 +112,9 @@ class RePaintPipeline(DiffusionPipeline):
 The call function to the pipeline for generation.
 Args:
- image (`torch.FloatTensor` or `PIL.Image.Image`):
+ image (`torch.Tensor` or `PIL.Image.Image`):
 The original image to inpaint on.
- mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ mask_image (`torch.Tensor` or `PIL.Image.Image`):
 The mask_image where 0.0 defines which part of the original image to inpaint.
 num_inference_steps (`int`, *optional*, defaults to 1000):
 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
diff --git a/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
index 475da0b6d1..b8ac8e1416 100644
--- a/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
+++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
@@ -134,7 +134,7 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):
 num_inference_steps: int = 100,
 return_dict: bool = True,
 output_type: str = "np",
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
 callback_steps: int = 1,
 ) -> Union[AudioPipelineOutput, Tuple]:
 if (callback_steps is None) or (
@@ -161,7 +161,7 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):
 The output format of the generated audio.
 callback (`Callable`, *optional*):
 A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
 callback_steps (`int`, *optional*, defaults to 1):
 The frequency at which the `callback` function is called. If not specified, the callback is called at
 every step.
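The `callback` annotation fixed throughout these files follows the same logic: the `latents` handed to the callback are in the pipeline's working dtype, so `torch.Tensor` is the honest type. A minimal callback matching the documented signature (a sketch, not part of the patch):

```python
import torch


def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Under fp16 inference `latents` is a half-precision tensor, which the
    # old `torch.FloatTensor` annotation could not describe.
    print(f"step={step} t={timestep} dtype={latents.dtype} "
          f"std={latents.float().std().item():.4f}")


# Usage sketch: pipe(prompt, callback=log_latents, callback_steps=5)
```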
diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index 0581effef2..edcc0fed68 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -255,8 +255,8 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -288,8 +288,8 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -309,10 +309,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -638,10 +638,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -652,7 +652,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. 
strength (`float`, *optional*, defaults to 0.8): @@ -678,10 +678,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -691,7 +691,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py index c7dff9eeef..cd29665fdb 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -48,7 +48,7 @@ def preprocess_image(image, batch_size): def preprocess_mask(mask, batch_size, scale_factor=8): - if not isinstance(mask, torch.FloatTensor): + if not isinstance(mask, torch.Tensor): mask = mask.convert("L") w, h = mask.size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 @@ -225,8 +225,8 @@ class StableDiffusionInpaintPipelineLegacy( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -258,8 +258,8 @@ class StableDiffusionInpaintPipelineLegacy( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -279,10 +279,10 @@ class StableDiffusionInpaintPipelineLegacy( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). 
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
 provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
 argument.
@@ -557,8 +557,8 @@ class StableDiffusionInpaintPipelineLegacy(
 def __call__(
 self,
 prompt: Union[str, List[str]] = None,
- image: Union[torch.FloatTensor, PIL.Image.Image] = None,
- mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+ image: Union[torch.Tensor, PIL.Image.Image] = None,
+ mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
 strength: float = 0.8,
 num_inference_steps: Optional[int] = 50,
 guidance_scale: Optional[float] = 7.5,
@@ -567,11 +567,11 @@ class StableDiffusionInpaintPipelineLegacy(
 add_predicted_noise: Optional[bool] = False,
 eta: Optional[float] = 0.0,
 generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
 output_type: Optional[str] = "pil",
 return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
 callback_steps: int = 1,
 cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 clip_skip: Optional[int] = None,
@@ -583,10 +583,10 @@ class StableDiffusionInpaintPipelineLegacy(
 prompt (`str` or `List[str]`, *optional*):
 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
 instead.
- image (`torch.FloatTensor` or `PIL.Image.Image`):
+ image (`torch.Tensor` or `PIL.Image.Image`):
 `Image`, or tensor representing an image batch, that will be used as the starting point for the
 process. This is the image whose masked region will be inpainted.
- mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ mask_image (`torch.Tensor` or `PIL.Image.Image`):
 `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
 PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the
@@ -620,10 +620,10 @@ class StableDiffusionInpaintPipelineLegacy(
 generator (`torch.Generator`, *optional*):
 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
 to make generation deterministic.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
 provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
 argument.
@@ -635,7 +635,7 @@ class StableDiffusionInpaintPipelineLegacy( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -693,7 +693,7 @@ class StableDiffusionInpaintPipelineLegacy( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) # 4. Preprocess image and mask - if not isinstance(image, torch.FloatTensor): + if not isinstance(image, torch.Tensor): image = preprocess_image(image, batch_size) mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index f44a1ca74e..901816391d 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -163,8 +163,8 @@ class StableDiffusionModelEditingPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -196,8 +196,8 @@ class StableDiffusionModelEditingPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -217,10 +217,10 @@ class StableDiffusionModelEditingPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
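Most hunks in this patch only touch annotations and docstrings, but the `isinstance` changes above are behavioral: `isinstance(image, torch.FloatTensor)` is `False` for fp16 or CUDA tensors, so such inputs previously fell through to the PIL preprocessing path and crashed. A reduced sketch of the patched control flow (hypothetical helper names):

```python
import torch


def maybe_preprocess(image, preprocess_fn):
    # Patched check: any torch.Tensor skips PIL-style preprocessing. The old
    # `isinstance(image, torch.FloatTensor)` test rejected half-precision and
    # CUDA tensors, sending them into `preprocess_fn`, which expects PIL images.
    if not isinstance(image, torch.Tensor):
        image = preprocess_fn(image)
    return image


# An fp16 tensor now takes the tensor path instead of erroring:
x = torch.rand(1, 3, 64, 64).half()
assert maybe_preprocess(x, preprocess_fn=None) is x
```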
@@ -620,12 +620,12 @@ class StableDiffusionModelEditingPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -657,14 +657,14 @@ class StableDiffusionModelEditingPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -674,7 +674,7 @@ class StableDiffusionModelEditingPipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
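The `latents` parameter seen above is another input the widened hint now covers: callers can pre-generate noise in any dtype for reproducibility. A sketch (shape `(batch, 4, height//8, width//8)` for SD-style latent UNets; the commented `pipe(...)` call is illustrative):

```python
import torch

# Draw reproducible fp32 noise, then cast to the pipeline's working dtype.
generator = torch.Generator(device="cpu").manual_seed(0)
latents = torch.randn(1, 4, 64, 64, generator=generator).to(torch.float16)

# Usage sketch:
# image = pipe("an astronaut", latents=latents.to("cuda")).images[0]
```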
diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py index 9421531d27..74ecaa995b 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -154,8 +154,8 @@ class StableDiffusionParadigmsPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -187,8 +187,8 @@ class StableDiffusionParadigmsPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -208,10 +208,10 @@ class StableDiffusionParadigmsPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -492,12 +492,12 @@ class StableDiffusionParadigmsPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, debug: bool = False, @@ -537,14 +537,14 @@ class StableDiffusionParadigmsPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -554,7 +554,7 @@ class StableDiffusionParadigmsPipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py index 5f74457881..d1f2c5de97 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -60,14 +60,14 @@ class Pix2PixInversionPipelineOutput(BaseOutput, TextualInversionLoaderMixin): Output class for Stable Diffusion pipelines. Args: - latents (`torch.FloatTensor`) + latents (`torch.Tensor`) inverted latents tensor images (`List[PIL.Image.Image]` or `np.ndarray`) List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. """ - latents: torch.FloatTensor + latents: torch.Tensor images: Union[List[PIL.Image.Image], np.ndarray] @@ -377,8 +377,8 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -410,8 +410,8 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -431,10 +431,10 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
 provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
 argument.
@@ -707,7 +707,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
 return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0)
 @torch.no_grad()
- def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.FloatTensor:
+ def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.Tensor:
 num_prompts = len(prompt)
 embeds = []
 for i in range(0, num_prompts, batch_size):
@@ -827,13 +827,13 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
 num_images_per_prompt: Optional[int] = 1,
 eta: float = 0.0,
 generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
 cross_attention_guidance_amount: float = 0.1,
 output_type: Optional[str] = "pil",
 return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
 callback_steps: Optional[int] = 1,
 cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 clip_skip: Optional[int] = None,
@@ -876,14 +876,14 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
 generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
 to make generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
 tensor will be generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
 provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
 argument.
@@ -897,7 +897,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
 plain tuple.
 callback (`Callable`, *optional*):
 A function that will be called every `callback_steps` steps during inference.
The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
 callback_steps (`int`, *optional*, defaults to 1):
 The frequency at which the `callback` function will be called. If not specified, the callback will be
 called at every step.
@@ -1112,12 +1112,12 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
 num_inference_steps: int = 50,
 guidance_scale: float = 1,
 generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
 cross_attention_guidance_amount: float = 0.1,
 output_type: Optional[str] = "pil",
 return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
 callback_steps: Optional[int] = 1,
 cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 lambda_auto_corr: float = 20.0,
@@ -1132,7 +1132,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
 prompt (`str` or `List[str]`, *optional*):
 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
 instead.
- image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+ image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
 `Image`, or tensor representing an image batch which will be used for conditioning. Can also accept
 image latents as `image`, if passing latents directly, it will not be encoded again.
 num_inference_steps (`int`, *optional*, defaults to 50):
@@ -1147,11 +1147,11 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
 generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
 to make generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
 tensor will be generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If
 not provided, text embeddings will be generated from `prompt` input argument.
 cross_attention_guidance_amount (`float`, defaults to 0.1):
@@ -1164,7 +1164,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
 plain tuple.
 callback (`Callable`, *optional*):
 A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
 callback_steps (`int`, *optional*, defaults to 1):
 The frequency at which the `callback` function will be called.
If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py index c84caa1fad..9e172ec2dc 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -1048,7 +1048,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -1066,10 +1066,10 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): The [`UNetFlatConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. @@ -1590,8 +1590,8 @@ class DownBlockFlat(nn.Module): self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet in self.resnets: @@ -1719,14 +1719,14 @@ class CrossAttnDownBlockFlat(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - additional_residuals: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + additional_residuals: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () blocks = list(zip(self.resnets, self.attentions)) @@ -1837,13 +1837,13 @@ class UpBlockFlat(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. 
`scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1994,15 +1994,15 @@ class CrossAttnUpBlockFlat(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2104,8 +2104,8 @@ class UNetMidBlockFlat(nn.Module): output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. Returns: - `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, - in_channels, height, width)`. + `torch.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size, in_channels, + height, width)`. """ @@ -2223,7 +2223,7 @@ class UNetMidBlockFlat(nn.Module): self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: @@ -2339,13 +2339,13 @@ class UNetMidBlockFlatCrossAttn(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. 
`scale` will be ignored.") @@ -2480,13 +2480,13 @@ class UNetMidBlockFlatSimpleCrossAttn(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py index 4455d20df2..c8dc18e2e8 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py @@ -81,7 +81,7 @@ class VersatileDiffusionPipeline(DiffusionPipeline): @torch.no_grad() def image_variation( self, - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -90,10 +90,10 @@ class VersatileDiffusionPipeline(DiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -123,7 +123,7 @@ class VersatileDiffusionPipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -134,7 +134,7 @@ class VersatileDiffusionPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
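In `modeling_text_unet.py` (a few hunks above) the change reaches the block-level `forward` signatures, which take and return hidden states in whatever dtype the UNet runs in. A toy block with the same annotation style, assuming a PyTorch build with bf16 conv support (illustrative only, not the patched class):

```python
from typing import Optional

import torch
from torch import nn


class TinyBlockFlat(nn.Module):
    # Mirrors the patched convention: hidden states and the optional time
    # embedding are hinted as plain `torch.Tensor`.
    def __init__(self, channels: int = 8):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(
        self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        out = self.conv(hidden_states)
        if temb is not None:
            out = out + temb[:, :, None, None]
        return out


block = TinyBlockFlat().to(torch.bfloat16)
y = block(torch.randn(1, 8, 16, 16, dtype=torch.bfloat16))
print(y.dtype)  # torch.bfloat16 -- a Tensor, never a FloatTensor
```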
@@ -202,10 +202,10 @@ class VersatileDiffusionPipeline(DiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -235,7 +235,7 @@ class VersatileDiffusionPipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -246,7 +246,7 @@ class VersatileDiffusionPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -311,10 +311,10 @@ class VersatileDiffusionPipeline(DiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -344,7 +344,7 @@ class VersatileDiffusionPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -355,7 +355,7 @@ class VersatileDiffusionPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index b1117044cf..2212651fbb 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -395,10 +395,10 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -429,7 +429,7 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -439,7 +439,7 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
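The `latents` argument is retyped the same way in each pipeline above. A hedged sketch of pre-generating latents for reproducibility; the shape is illustrative, not read from any model config:

```python
import torch

# Reproducible pre-generated latents for the `latents` argument. The shape
# (batch, latent_channels, height // 8, width // 8) is an assumption here and
# is normally derived from the pipeline's UNet/VAE configuration.
generator = torch.Generator().manual_seed(0)
latents = torch.randn(1, 4, 64, 64, generator=generator)
# pipe(prompt="...", latents=latents, generator=generator)  # hypothetical call
```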
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 59aa370ec2..62d3e83a47 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -197,7 +197,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -247,10 +247,10 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -281,7 +281,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -292,7 +292,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index 0c76e5837b..de4c2ac9b7 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -333,10 +333,10 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -367,7 +367,7 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -378,7 +378,7 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py b/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py index 0c55d04e67..8dee000df0 100644 --- a/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +++ b/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py @@ -169,10 +169,10 @@ class VQDiffusionPipeline(DiffusionPipeline): truncation_rate: float = 1.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ) -> Union[ImagePipelineOutput, Tuple]: """ @@ -196,7 +196,7 @@ class VQDiffusionPipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor` of shape (batch), *optional*): + latents (`torch.Tensor` of shape (batch), *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Must be valid embedding indices. If not provided, a latents tensor will be generated of completely masked latent pixels. @@ -206,7 +206,7 @@ class VQDiffusionPipeline(DiffusionPipeline): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -301,7 +301,7 @@ class VQDiffusionPipeline(DiffusionPipeline): return ImagePipelineOutput(images=image) - def truncate(self, log_p_x_0: torch.FloatTensor, truncation_rate: float) -> torch.FloatTensor: + def truncate(self, log_p_x_0: torch.Tensor, truncation_rate: float) -> torch.Tensor: """ Truncates `log_p_x_0` such that for each column vector, the total cumulative probability is `truncation_rate`. The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py index a38918e1a0..f872f1ebdb 100644 --- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -154,8 +154,8 @@ class I2VGenXLPipeline( device, num_videos_per_prompt, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): r""" @@ -174,10 +174,10 @@ class I2VGenXLPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
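To make the truncation contract in the `truncate` docstring above concrete, a simplified 2D sketch; the pipeline's own implementation works on 3D tensors, so this is an assumption-marked reduction, not the shipped code:

```python
import torch

# Per row: keep the highest-probability entries whose preceding cumulative
# mass stays below `truncation_rate` (always retaining the top entry) and set
# everything else to log(0) = -inf.
def truncate_sketch(log_p: torch.Tensor, truncation_rate: float) -> torch.Tensor:
    sorted_lp, indices = torch.sort(log_p, dim=1, descending=True)
    keep = sorted_lp.exp().cumsum(dim=1) < truncation_rate
    # Shift right by one so each entry is judged by the mass before it,
    # and force-keep the largest entry.
    keep = torch.cat([torch.ones_like(keep[:, :1]), keep[:, :-1]], dim=1)
    mask = keep.gather(1, indices.argsort(dim=1))  # back to original order
    return log_p.masked_fill(~mask, float("-inf"))

probs = torch.tensor([[0.6, 0.3, 0.1]])
print(truncate_sketch(probs.log(), truncation_rate=0.8))
# keeps 0.6 and 0.3 (preceding mass 0.0 and 0.6 are both < 0.8); 0.1 is zeroed
```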
@@ -434,7 +434,7 @@ class I2VGenXLPipeline( and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -513,9 +513,9 @@ class I2VGenXLPipeline( num_videos_per_prompt: Optional[int] = 1, decode_chunk_size: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -527,7 +527,7 @@ class I2VGenXLPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image or images to guide image generation. If you provide a tensor, it needs to be compatible with [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -559,14 +559,14 @@ class I2VGenXLPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 34b5a47c25..b2041e1015 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -233,8 +233,8 @@ class KandinskyPipeline(DiffusionPipeline): def __call__( self, prompt: Union[str, List[str]], - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -242,9 +242,9 @@ class KandinskyPipeline(DiffusionPipeline): guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -254,9 +254,9 @@ class KandinskyPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -279,7 +279,7 @@ class KandinskyPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -288,7 +288,7 @@ class KandinskyPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
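A sketch of the embedding flow these hints describe: the prior emits plain `torch.Tensor` image embeddings that condition the decoder. The hub checkpoint ids below are the commonly used ones and are assumptions, not part of this patch:

```python
from diffusers import KandinskyPriorPipeline, KandinskyPipeline

# Prior: text prompt -> CLIP image embeddings (torch.Tensor).
prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
# Decoder: prompt + embeddings -> image.
pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")

prompt = "a red cat, 4k photo"
image_embeds, negative_image_embeds = prior(prompt).to_tuple()
image = pipe(
    prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds
).images[0]
```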
diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py index cbe66a63f4..fe99097703 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py @@ -226,9 +226,9 @@ class KandinskyCombinedPipeline(DiffusionPipeline): prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -268,7 +268,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -277,7 +277,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -436,7 +436,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -447,9 +447,9 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline): prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -459,7 +459,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. 
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -499,7 +499,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -508,7 +508,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -677,8 +677,8 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -688,9 +688,9 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline): prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -700,7 +700,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. 
@@ -739,7 +739,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -748,7 +748,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 4d091e7d7a..ef5241fee5 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -266,10 +266,10 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline): # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32) alphas = 1.0 - betas alphas_cumprod = torch.cumprod(alphas, dim=0) @@ -295,9 +295,9 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - image_embeds: torch.FloatTensor, - negative_image_embeds: torch.FloatTensor, + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + image_embeds: torch.Tensor, + negative_image_embeds: torch.Tensor, negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -307,7 +307,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline): num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -317,12 +317,12 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`): + image (`torch.Tensor`, `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. 
- image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -356,7 +356,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index d8d9e96e6f..778b6e314c 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -398,10 +398,10 @@ class KandinskyInpaintPipeline(DiffusionPipeline): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - image_embeds: torch.FloatTensor, - negative_image_embeds: torch.FloatTensor, + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + image_embeds: torch.Tensor, + negative_image_embeds: torch.Tensor, negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -409,9 +409,9 @@ class KandinskyInpaintPipeline(DiffusionPipeline): guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -421,10 +421,10 @@ class KandinskyInpaintPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`): + image (`torch.Tensor`, `PIL.Image.Image` or `np.ndarray`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`): + mask_image (`PIL.Image.Image`,`torch.Tensor` or `np.ndarray`): `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while black pixels will be preserved. 
You can pass a pytorch tensor as mask only if the image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the @@ -432,9 +432,9 @@ class KandinskyInpaintPipeline(DiffusionPipeline): image or numpy array, mask should also be either a PIL image or numpy array. If it is a PIL image, it will be converted to a single channel (luminance) before use. If it is a numpy array, the expected shape is `(H, W)`. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -457,7 +457,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. @@ -466,7 +466,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 7d9be4570f..b5152d71cb 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -115,14 +115,14 @@ class KandinskyPriorPipelineOutput(BaseOutput): Output class for KandinskyPriorPipeline.
Args: - image_embeds (`torch.FloatTensor`) + image_embeds (`torch.Tensor`) clip image embeddings for text prompt negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) clip image embeddings for unconditional tokens """ - image_embeds: Union[torch.FloatTensor, np.ndarray] - negative_image_embeds: Union[torch.FloatTensor, np.ndarray] + image_embeds: Union[torch.Tensor, np.ndarray] + negative_image_embeds: Union[torch.Tensor, np.ndarray] class KandinskyPriorPipeline(DiffusionPipeline): @@ -173,12 +173,12 @@ class KandinskyPriorPipeline(DiffusionPipeline): @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -188,7 +188,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -200,7 +200,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -403,7 +403,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, output_type: Optional[str] = "pt", return_dict: bool = True, @@ -425,7 +425,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
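For illustration, a hedged sketch of `interpolate`, whose retyped signature accepts prompts, PIL images, and tensors in one list; the checkpoint id and the local file name are assumptions:

```python
import PIL.Image
from diffusers import KandinskyPriorPipeline

# Mix a text condition and an image condition with the given weights into a
# single image embedding.
prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
img = PIL.Image.open("cat.png")  # hypothetical local file
out = prior.interpolate(["a starry night", img], weights=[0.3, 0.7])
image_embeds, negative_image_embeds = out.to_tuple()
```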
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py index 4b977af0d6..471db61556 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -123,15 +123,15 @@ class KandinskyV22Pipeline(DiffusionPipeline): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -142,9 +142,9 @@ class KandinskyV22Pipeline(DiffusionPipeline): Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -164,7 +164,7 @@ class KandinskyV22Pipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
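Unlike the v2.1 decoder sketched earlier, the 2.2 decoder takes no text prompt, so the retyped `image_embeds`/`negative_image_embeds` tensors are its entire conditioning interface. A hedged sketch (checkpoint ids assumed):

```python
from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline

prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior")
decoder = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder")

# The prompt is consumed by the prior only; the decoder sees tensors.
emb = prior("a portrait of a fox, watercolor")
image = decoder(
    image_embeds=emb.image_embeds,
    negative_image_embeds=emb.negative_image_embeds,
    height=512,
    width=512,
).images[0]
```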
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py index 06d94d2cb7..9db767681b 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py @@ -213,9 +213,9 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline): prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -259,7 +259,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -442,7 +442,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -453,9 +453,9 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline): prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -469,7 +469,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. 
@@ -509,7 +509,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -518,7 +518,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -681,8 +681,8 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -692,7 +692,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline): prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -707,7 +707,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -746,7 +746,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index de87dd3c34..0130c3951b 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -151,18 +151,18 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - hint: torch.FloatTensor, + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], + hint: torch.Tensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -172,11 +172,11 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline): Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - hint (`torch.FloatTensor`): + hint (`torch.Tensor`): The controlnet condition. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -199,7 +199,7 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -208,7 +208,7 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
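The `hint` argument above is just an image-shaped condition tensor. A minimal sketch with a random stand-in for a real control signal such as a depth map; shape and value range are illustrative assumptions:

```python
import torch

# A fake (1, 3, H, W) condition in [0, 1) standing in for, e.g., a depth map.
hint = torch.rand(1, 3, 512, 512)
# pipe(image_embeds=..., negative_image_embeds=..., hint=hint)  # hypothetical
```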
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index c3ac7bcf60..12be1534c6 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -206,10 +206,10 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - hint: torch.FloatTensor, + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], + hint: torch.Tensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -218,7 +218,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -226,9 +226,9 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -238,9 +238,9 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - hint (`torch.FloatTensor`): + hint (`torch.Tensor`): The controlnet condition. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -265,7 +265,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. 
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 3fdae934ad..899273a1a7 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -190,9 +190,9 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -210,9 +210,9 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline): Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -222,7 +222,7 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline): denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. 
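The `strength` semantics documented above follow the usual img2img schedule truncation. A sketch of that mapping, a simplification of the pipelines' `get_timesteps` helpers:

```python
# Only the final `strength` fraction of the denoising schedule is run, so
# strength=1 replaces the input image entirely and strength=0 leaves it alone.
num_inference_steps, strength = 100, 0.3
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
print(t_start)  # 70 -> denoising runs only for the last 30 of 100 steps
```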
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 2fb8731f8a..b5ba7a0011 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -294,17 +294,17 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -315,7 +315,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline): Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`PIL.Image.Image`): `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will @@ -325,7 +325,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline): black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -345,7 +345,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
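A mask following the convention documented above: white pixels are repainted, black pixels are preserved, and a PIL input is reduced to a single luminance channel. Dimensions are illustrative:

```python
import numpy as np
import PIL.Image

# White (255) = repaint, black (0) = preserve.
mask = np.zeros((512, 512), dtype=np.uint8)
mask[100:300, 100:300] = 255  # repaint this square
mask_image = PIL.Image.fromarray(mask)  # single-channel ("L") PIL image
# pipe(image_embeds=..., image=init_image, mask_image=mask_image)  # hypothetical
```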
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 1455e20ab5..f2134b22b4 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -132,12 +132,12 @@ class KandinskyV22PriorPipeline(DiffusionPipeline): @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -147,7 +147,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline): Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -159,7 +159,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -376,7 +376,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline): num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, output_type: Optional[str] = "pt", # pt only return_dict: bool = True, @@ -400,7 +400,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
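Since `interpolate` is one of the few public entry points whose signature changes in this file, a usage sketch may help reviewers; the checkpoint id and inputs are illustrative only, and the keyword arguments mirror the signature in the hunk above:

```python
import PIL.Image
import torch
from diffusers import KandinskyV22PriorPipeline

pipe = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")

img = PIL.Image.open("cat.png")  # placeholder image path

# One weight per entry in `images_and_prompts`; conditions are mixed accordingly.
out = pipe.interpolate(
    images_and_prompts=["a starry night", img],
    weights=[0.5, 0.5],
    num_inference_steps=25,
    generator=torch.Generator("cuda").manual_seed(0),
)
image_embeds, negative_image_embeds = out.image_embeds, out.negative_image_embeds
```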
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 26442735e8..ec6509bb3c 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -156,12 +156,12 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -171,7 +171,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -183,7 +183,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -418,7 +418,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. - emb (`torch.FloatTensor`): + emb (`torch.Tensor`): The image embedding. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. 
Ignored when not using guidance (i.e., ignored diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py index 85d6418d07..d7ff59e001 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py @@ -87,11 +87,11 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt=1, device=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, _cut_context=False, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -109,16 +109,16 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. """ if prompt is not None and negative_prompt is not None: @@ -334,10 +334,10 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin): height: Optional[int] = 1024, width: Optional[int] = 1024, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, latents=None, @@ -380,16 +380,16 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -398,7 +398,7 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py index 16a57b6b8c..df46756a17 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py @@ -112,11 +112,11 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): num_images_per_prompt=1, device=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, _cut_context=False, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -134,16 +134,16 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. """ if prompt is not None and negative_prompt is not None: @@ -403,17 +403,17 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None, strength: float = 0.3, num_inference_steps: int = 25, guidance_scale: float = 3.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -427,7 +427,7 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): @@ -454,16 +454,16 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
- attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index a84263eaee..11e3781aaf 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -237,8 +237,8 @@ class LatentConsistencyModelImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -258,10 +258,10 @@ class LatentConsistencyModelImg2ImgPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -565,7 +565,7 @@ class LatentConsistencyModelImg2ImgPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -578,7 +578,7 @@ class LatentConsistencyModelImg2ImgPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -628,7 +628,7 @@ class LatentConsistencyModelImg2ImgPipeline( prompt: Union[str, List[str]], strength: float, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image=None, ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, @@ -709,10 +709,10 @@ class LatentConsistencyModelImg2ImgPipeline( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -754,16 +754,16 @@ class LatentConsistencyModelImg2ImgPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index 1bbb1bd91b..7f34952584 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -221,8 +221,8 @@ class LatentConsistencyModelPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -242,10 +242,10 @@ class LatentConsistencyModelPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -512,7 +512,7 @@ class LatentConsistencyModelPipeline( def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -525,7 +525,7 @@ class LatentConsistencyModelPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -565,7 +565,7 @@ class LatentConsistencyModelPipeline( height: int, width: int, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image=None, ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, @@ -646,10 +646,10 @@ class LatentConsistencyModelPipeline( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -691,16 +691,16 @@ class LatentConsistencyModelPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. 
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index f39cbc8396..f6f3531a88 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -74,7 +74,7 @@ class LDMTextToImagePipeline(DiffusionPipeline): guidance_scale: Optional[float] = 1.0, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, **kwargs, @@ -98,7 +98,7 @@ class LDMTextToImagePipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -465,17 +465,17 @@ class LDMBertEncoderLayer(nn.Module): def forward( self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - layer_head_mask: torch.FloatTensor, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size + hidden_states (`torch.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + layer_head_mask (`torch.Tensor`): mask for attention heads in a given layer of size `(encoder_attention_heads,)`. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under @@ -587,7 +587,7 @@ class LDMBertEncoder(LDMBertPreTrainedModel): attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -615,7 +615,7 @@ class LDMBertEncoder(LDMBertPreTrainedModel): - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py index 619be13a8f..fdcaa85e9f 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py @@ -502,8 +502,8 @@ class LEditsPPPipelineStableDiffusion( enable_edit_guidance, negative_prompt=None, editing_prompt=None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - editing_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + editing_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -523,10 +523,10 @@ class LEditsPPPipelineStableDiffusion( less than `1`). editing_prompt (`str` or `List[str]`, *optional*): Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead. - editing_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -704,13 +704,13 @@ class LEditsPPPipelineStableDiffusion( return_dict: bool = True, editing_prompt: Optional[Union[str, List[str]]] = None, editing_prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, edit_guidance_scale: Optional[Union[float, List[float]]] = 5, edit_warmup_steps: Optional[Union[int, List[int]]] = 0, edit_cooldown_steps: Optional[Union[int, List[int]]] = None, edit_threshold: Optional[Union[float, List[float]]] = 0.9, - user_mask: Optional[torch.FloatTensor] = None, + user_mask: Optional[torch.Tensor] = None, sem_guidance: Optional[List[torch.Tensor]] = None, use_cross_attn_mask: bool = False, use_intersect_mask: bool = True, @@ -748,7 +748,7 @@ class LEditsPPPipelineStableDiffusion( editing_prompt_embeds (`torch.Tensor>`, *optional*): Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be specified via `reverse_editing_direction`. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): @@ -765,7 +765,7 @@ class LEditsPPPipelineStableDiffusion( Masking threshold of guidance. Threshold should be proportional to the image region that is modified. 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). - user_mask (`torch.FloatTensor`, *optional*): + user_mask (`torch.Tensor`, *optional*): User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit masks do not meet user preferences. sem_guidance (`List[torch.Tensor]`, *optional*): diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py index 5ea7c2c145..d2a62c01ee 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py @@ -409,14 +409,14 @@ class LEditsPPPipelineStableDiffusionXL( num_images_per_prompt: int = 1, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, enable_edit_guidance: bool = True, editing_prompt: Optional[str] = None, - editing_prompt_embeds: Optional[torch.FloatTensor] = None, - editing_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + editing_prompt_embeds: Optional[torch.Tensor] = None, + editing_pooled_prompt_embeds: Optional[torch.Tensor] = None, ) -> object: r""" Encodes the prompt into text encoder hidden states. @@ -432,11 +432,11 @@ class LEditsPPPipelineStableDiffusionXL( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -450,11 +450,11 @@ class LEditsPPPipelineStableDiffusionXL( editing_prompt (`str` or `List[str]`, *optional*): Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass `editing_prompt_embeds` instead. - editing_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input argument. 
- editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt` input argument. @@ -713,7 +713,7 @@ class LEditsPPPipelineStableDiffusionXL( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -726,7 +726,7 @@ class LEditsPPPipelineStableDiffusionXL( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -804,8 +804,8 @@ class LEditsPPPipelineStableDiffusionXL( denoising_end: Optional[float] = None, negative_prompt: Optional[Union[str, List[str]]] = None, negative_prompt_2: Optional[Union[str, List[str]]] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -824,7 +824,7 @@ class LEditsPPPipelineStableDiffusionXL( sem_guidance: Optional[List[torch.Tensor]] = None, use_cross_attn_mask: bool = False, use_intersect_mask: bool = False, - user_mask: Optional[torch.FloatTensor] = None, + user_mask: Optional[torch.Tensor] = None, attn_store_steps: Optional[List[int]] = [], store_averaged_over_steps: bool = True, clip_skip: Optional[int] = None, @@ -851,11 +851,11 @@ class LEditsPPPipelineStableDiffusionXL( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -869,7 +869,7 @@ class LEditsPPPipelineStableDiffusionXL( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py index 2bd828f0df..728635da6d 100644 --- a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py +++ b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py @@ -120,8 +120,8 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin): num_waveforms_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -139,10 +139,10 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin): The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -427,11 +427,11 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin): num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -465,21 +465,21 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 263d507bbc..b225fd71ed 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -266,7 +266,7 @@ class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -393,9 +393,9 @@ class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin): @torch.no_grad() def __call__( self, - example_image: Union[torch.FloatTensor, PIL.Image.Image], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + example_image: Union[torch.Tensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -404,22 +404,22 @@ class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" The call function to the pipeline for generation. Args: - example_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + example_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): An example image to guide image generation. - image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): `Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with `mask_image` and repainted according to `prompt`). - mask_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + mask_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): `Image` or tensor representing an image batch to mask `image`. 
White pixels in the mask are repainted, while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the @@ -445,7 +445,7 @@ class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -456,7 +456,7 @@ class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index a2723187f2..1ec418b0e8 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -207,8 +207,8 @@ class PIAPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -228,10 +228,10 @@ class PIAPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
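One aside before the PIA `__call__` hunk: `get_guidance_scale_embedding`, whose return annotation changes in the latent-consistency and LEDITS++ diffs above, is only partially visible in those hunks. The following is a reconstruction consistent with the visible fragments (`assert len(w.shape) == 1`, `w = w * 1000.0`) and the documented `(len(w), embedding_dim)` output shape; treat it as a sketch rather than a verbatim copy:

```python
import torch

def get_guidance_scale_embedding(
    w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    # Sinusoidal embedding of the guidance scale, per model_vdm.py referenced above.
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad odd dimensions
        emb = torch.nn.functional.pad(emb, (0, 1))
    assert emb.shape == (w.shape[0], embedding_dim)
    return emb

print(get_guidance_scale_embedding(torch.tensor([7.5])).shape)  # (1, 512)
```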
@@ -680,11 +680,11 @@ class PIAPipeline( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, motion_scale: int = 0, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -725,20 +725,20 @@ class PIAPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -749,8 +749,7 @@ class PIAPipeline( added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5 to create looping motion. Set between 6-8 to perform motion with image style transfer. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. 
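The `ip_adapter_image_embeds` contract documented in the hunk above (a list with one entry per IP-Adapter, each of shape `(batch_size, num_images, emb_dim)`, including the negative embedding under classifier-free guidance) is easy to get wrong. A shape-only sketch of a valid input; all sizes are illustrative, and the negative-first ordering is an assumption based on the usual diffusers CFG layout:

```python
import torch

batch_size, num_images, emb_dim = 1, 1, 1024  # illustrative sizes
do_classifier_free_guidance = True

embeds = []
for _ in range(2):  # one entry per loaded IP-Adapter
    positive = torch.randn(batch_size, num_images, emb_dim)
    if do_classifier_free_guidance:
        # Stack the negative (unconditional) embedding in front, as the
        # docstring requires when CFG is enabled.
        positive = torch.cat([torch.zeros_like(positive), positive], dim=0)
    embeds.append(positive)

print([tuple(e.shape) for e in embeds])  # [(2, 1, 1024), (2, 1, 1024)]
```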
diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index c1bb5b4584..9794b89f76 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -296,10 +296,10 @@ class PixArtAlphaPipeline(DiffusionPipeline): negative_prompt: str = "", num_images_per_prompt: int = 1, device: Optional[torch.device] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, clean_caption: bool = False, max_sequence_length: int = 120, **kwargs, @@ -320,10 +320,10 @@ class PixArtAlphaPipeline(DiffusionPipeline): number of images that should be generated per prompt device: (`torch.device`, *optional*): torch device to place the resulting embeddings on - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the "" string. clean_caption (`bool`, defaults to `False`): @@ -694,14 +694,14 @@ class PixArtAlphaPipeline(DiffusionPipeline): width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, use_resolution_binning: bool = True, @@ -748,18 +748,18 @@ class PixArtAlphaPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_attention_mask (`torch.FloatTensor`, *optional*): + negative_prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for negative text embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -768,7 +768,7 @@ class PixArtAlphaPipeline(DiffusionPipeline): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py index 0389ac06f3..5e3cc668d6 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py @@ -222,10 +222,10 @@ class PixArtSigmaPipeline(DiffusionPipeline): negative_prompt: str = "", num_images_per_prompt: int = 1, device: Optional[torch.device] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, clean_caption: bool = False, max_sequence_length: int = 120, **kwargs, @@ -246,10 +246,10 @@ class PixArtSigmaPipeline(DiffusionPipeline): number of images that should be generated per prompt device: (`torch.device`, *optional*): torch device to place the resulting embeddings on - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the "" string. 
clean_caption (`bool`, defaults to `False`): @@ -621,14 +621,14 @@ class PixArtSigmaPipeline(DiffusionPipeline): width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, use_resolution_binning: bool = True, @@ -675,18 +675,18 @@ class PixArtSigmaPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_attention_mask (`torch.FloatTensor`, *optional*): + negative_prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for negative text embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -695,7 +695,7 @@ class PixArtSigmaPipeline(DiffusionPipeline): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
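The PixArt hunks above change annotations only, but the practical point is easiest to see with precomputed embeddings: when the pipeline is loaded with `torch_dtype=torch.float16`, everything `encode_prompt` returns is half precision, so the old `torch.FloatTensor` hint (the 32-bit CPU tensor type) was inaccurate. A minimal sketch, assuming an illustrative checkpoint name and the usual four-value return order of `encode_prompt` (only its signature appears in this patch):

```python
import torch
from diffusers import PixArtAlphaPipeline

pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",  # illustrative checkpoint
    torch_dtype=torch.float16,
).to("cuda")

# Assumed return order: prompt embeddings/mask, then negative embeddings/mask.
prompt_embeds, prompt_attention_mask, neg_embeds, neg_attention_mask = pipe.encode_prompt(
    "an astronaut riding a horse on mars"
)
print(prompt_embeds.dtype)  # torch.float16: a torch.Tensor, not a torch.FloatTensor

image = pipe(
    negative_prompt=None,  # the precomputed embeddings below stand in for both prompts
    prompt_embeds=prompt_embeds,
    prompt_attention_mask=prompt_attention_mask,
    negative_prompt_embeds=neg_embeds,
    negative_prompt_attention_mask=neg_attention_mask,
).images[0]
```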
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index fe83a860ae..e068387b61 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -224,10 +224,10 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: int = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, editing_prompt: Optional[Union[str, List[str]]] = None, editing_prompt_embeddings: Optional[torch.Tensor] = None, @@ -268,7 +268,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -279,7 +279,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 1ef10e17cb..f87f28e06c 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -69,7 +69,7 @@ class ShapEPipelineOutput(BaseOutput): Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`]. Args: - images (`torch.FloatTensor`) + images (`torch.Tensor`) A list of images for 3D rendering. """ @@ -187,7 +187,7 @@ class ShapEPipeline(DiffusionPipeline): num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, frame_size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent, mesh @@ -207,7 +207,7 @@ class ShapEPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 700ca5db6f..7cc145e4c3 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -70,7 +70,7 @@ class ShapEPipelineOutput(BaseOutput): Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`]. Args: - images (`torch.FloatTensor`) + images (`torch.Tensor`) A list of images for 3D rendering. """ @@ -169,7 +169,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline): num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, frame_size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent, mesh @@ -179,7 +179,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline): The call function to the pipeline for generation. Args: - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as image, but if passing latents directly it is not encoded again. num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -190,7 +190,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
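Several of the hunks above update the documented signature of the legacy per-step `callback` to `callback(step: int, timestep: int, latents: torch.Tensor)`. A conforming callback under the new annotation might look like the sketch below (checkpoint name illustrative); with fp16 weights the `latents` argument really is a half-precision `torch.Tensor`:

```python
import torch
from diffusers import SemanticStableDiffusionPipeline

pipe = SemanticStableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # illustrative checkpoint
    torch_dtype=torch.float16,
).to("cuda")

def log_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Under fp16 the latents are float16, which torch.FloatTensor could not describe.
    print(f"step={step} t={timestep} dtype={latents.dtype} shape={tuple(latents.shape)}")

image = pipe("a photo of a castle at dusk", callback=log_step, callback_steps=5).images[0]
```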
diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py index 65ac21f220..af80cf805a 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py @@ -129,10 +129,10 @@ class StableCascadeDecoderPipeline(DiffusionPipeline): do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, ): if prompt_embeds is None: # get prompt text embeddings @@ -285,18 +285,18 @@ class StableCascadeDecoderPipeline(DiffusionPipeline): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeddings: Union[torch.Tensor, List[torch.Tensor]], prompt: Union[str, List[str]] = None, num_inference_steps: int = 10, guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -306,7 +306,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline): Function invoked when calling the pipeline for generation. Args: - image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embedding (`torch.Tensor` or `List[torch.Tensor]`): Image Embeddings either extracted from an image or generated by a Prior Model. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. @@ -322,17 +322,17 @@ class StableCascadeDecoderPipeline(DiffusionPipeline): negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input argument. @@ -341,7 +341,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py index 3b96ef1418..6724b60cc4 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py @@ -162,13 +162,13 @@ class StableCascadeCombinedPipeline(DiffusionPipeline): num_inference_steps: int = 12, decoder_guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -187,17 +187,17 @@ class StableCascadeCombinedPipeline(DiffusionPipeline): negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -230,7 +230,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py index 55fb4c28f6..dc6c81e1a8 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py @@ -54,19 +54,19 @@ class StableCascadePriorPipelineOutput(BaseOutput): Output class for WuerstchenPriorPipeline. Args: - image_embeddings (`torch.FloatTensor` or `np.ndarray`) + image_embeddings (`torch.Tensor` or `np.ndarray`) Prior image embeddings for text prompt - prompt_embeds (`torch.FloatTensor`): + prompt_embeds (`torch.Tensor`): Text embeddings for the prompt. - negative_prompt_embeds (`torch.FloatTensor`): + negative_prompt_embeds (`torch.Tensor`): Text embeddings for the negative prompt.
""" - image_embeddings: Union[torch.FloatTensor, np.ndarray] - prompt_embeds: Union[torch.FloatTensor, np.ndarray] - prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray] - negative_prompt_embeds: Union[torch.FloatTensor, np.ndarray] - negative_prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray] + image_embeddings: Union[torch.Tensor, np.ndarray] + prompt_embeds: Union[torch.Tensor, np.ndarray] + prompt_embeds_pooled: Union[torch.Tensor, np.ndarray] + negative_prompt_embeds: Union[torch.Tensor, np.ndarray] + negative_prompt_embeds_pooled: Union[torch.Tensor, np.ndarray] class StableCascadePriorPipeline(DiffusionPipeline): @@ -150,10 +150,10 @@ class StableCascadePriorPipeline(DiffusionPipeline): do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, ): if prompt_embeds is None: # get prompt text embeddings @@ -374,14 +374,14 @@ class StableCascadePriorPipeline(DiffusionPipeline): timesteps: List[float] = None, guidance_scale: float = 4.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pt", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -409,21 +409,21 @@ class StableCascadePriorPipeline(DiffusionPipeline): negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If not provided, image embeddings will be generated from `image` input argument if existing. num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -431,7 +431,7 @@ class StableCascadePriorPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py index 311347dcca..2e34dcb83c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py @@ -288,7 +288,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` instead. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`): `Image`, or tensor representing an image batch which will be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -329,7 +329,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline): plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py index bee6ea7b11..2ec60fbd61 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py @@ -395,7 +395,7 @@ class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline): [`schedulers.DDIMScheduler`], will be ignored for others.
generator (`np.random.RandomState`, *optional*): A np.random.RandomState to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 3612039f09..e8ab72421d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -274,8 +274,8 @@ class StableDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -306,8 +306,8 @@ class StableDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -327,10 +327,10 @@ class StableDiffusionPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -691,7 +691,7 @@ class StableDiffusionPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -704,7 +704,7 @@ class StableDiffusionPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -765,11 +765,11 @@ class StableDiffusionPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -814,18 +814,18 @@ class StableDiffusionPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. 
If not diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 1f82297156..d29e519974 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -156,8 +156,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -189,8 +189,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -210,10 +210,10 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -609,7 +609,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader self, prompt: Union[str, List[str]] = None, image: PipelineImageInput = None, - depth_map: Optional[torch.FloatTensor] = None, + depth_map: Optional[torch.Tensor] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -617,8 +617,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -633,10 +633,10 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can accept image latents as `image` only if `depth_map` is not `None`. - depth_map (`torch.FloatTensor`, *optional*): + depth_map (`torch.Tensor`, *optional*): Depth prediction to be used as additional conditioning for the image generation process. If not defined, it automatically predicts the depth with `self.depth_estimator`. strength (`float`, *optional*, defaults to 0.8): @@ -662,10 +662,10 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index c300c7a2f3..93a8bd1603 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -207,7 +207,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMi and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -248,7 +248,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMi @torch.no_grad() def __call__( self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -256,17 +256,17 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMi num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" The call function to the pipeline for generation. 
Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image or images to guide image generation. If you provide a tensor, it needs to be compatible with [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -287,7 +287,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMi generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -298,7 +298,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMi plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index daded18ebd..f2a5de8154 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -315,8 +315,8 @@ class StableDiffusionImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -348,8 +348,8 @@ class StableDiffusionImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -369,10 +369,10 @@ class StableDiffusionImg2ImgPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -784,7 +784,7 @@ class StableDiffusionImg2ImgPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -797,7 +797,7 @@ class StableDiffusionImg2ImgPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -854,10 +854,10 @@ class StableDiffusionImg2ImgPipeline( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -872,7 +872,7 @@ class StableDiffusionImg2ImgPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a @@ -909,14 +909,14 @@ class StableDiffusionImg2ImgPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index ab2a0eae73..71dec964fd 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -387,8 +387,8 @@ class StableDiffusionInpaintPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -420,8 +420,8 @@ class StableDiffusionInpaintPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -441,10 +441,10 @@ class StableDiffusionInpaintPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -931,7 +931,7 @@ class StableDiffusionInpaintPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -944,7 +944,7 @@ class StableDiffusionInpaintPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -992,7 +992,7 @@ class StableDiffusionInpaintPipeline( prompt: Union[str, List[str]] = None, image: PipelineImageInput = None, mask_image: PipelineImageInput = None, - masked_image_latents: torch.FloatTensor = None, + masked_image_latents: torch.Tensor = None, height: Optional[int] = None, width: Optional[int] = None, padding_mask_crop: Optional[int] = None, @@ -1005,11 +1005,11 @@ class StableDiffusionInpaintPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1024,14 +1024,14 @@ class StableDiffusionInpaintPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one @@ -1080,18 +1080,18 @@ class StableDiffusionInpaintPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 0bf5a92a4f..b2b2b14009 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -168,11 +168,11 @@ class StableDiffusionInstructPix2PixPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -185,7 +185,7 @@ class StableDiffusionInstructPix2PixPipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. num_inference_steps (`int`, *optional*, defaults to 100): @@ -210,14 +210,14 @@ class StableDiffusionInstructPix2PixPipeline( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. 
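The inpainting docstrings above spell out the tensor contracts that the widened `torch.Tensor` hints now cover: image batches shaped `(B, C, H, W)` (or `(C, H, W)`) with values in `[0, 1]`, and a single-channel mask in which white pixels are repainted and black pixels preserved. A minimal sketch under those documented contracts, with an illustrative checkpoint and synthetic inputs:

```python
import torch
from diffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",  # illustrative checkpoint
    torch_dtype=torch.float16,
).to("cuda")

image = torch.rand(1, 3, 512, 512)  # (B, C, H, W), values in [0, 1] per the docstring
mask = torch.zeros(1, 1, 512, 512)  # one channel: 1.0 = repaint, 0.0 = preserve
mask[:, :, 128:384, 128:384] = 1.0  # repaint a square in the middle

result = pipe(prompt="a stained glass window", image=image, mask_image=mask).images[0]
```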
Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): @@ -471,8 +471,8 @@ class StableDiffusionInstructPix2PixPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -490,10 +490,10 @@ class StableDiffusionInstructPix2PixPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 918dffe519..978e2dbb60 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -267,10 +267,10 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix guidance_scale: float = 9.0, negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -279,7 +279,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix Args: prompt (`str` or `List[str]`): The prompt or prompts to guide image upscaling. 
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. If it's a tensor, it can be either a latent output from a Stable Diffusion model or an image tensor in the range `[-1, 1]`. It is considered a `latent` if `image.shape[1]` is `4`; otherwise, it is considered to be an image representation and @@ -299,7 +299,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -310,7 +310,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 2d04cf41d9..3981c8a461 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -176,8 +176,8 @@ class StableDiffusionUpscalePipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -209,8 +209,8 @@ class StableDiffusionUpscalePipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -230,10 +230,10 @@ class StableDiffusionUpscalePipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
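The `image.shape[1] == 4` dispatch described above is easiest to see end-to-end: generate latents with a base pipeline via `output_type="latent"` and hand them straight to the upscaler. A sketch; the two checkpoint names are the publicly paired ones and an assumption here, not something this patch pins down:

```python
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionLatentUpscalePipeline

base = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
    "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
).to("cuda")

generator = torch.Generator("cuda").manual_seed(0)
prompt = "a photo of an astronaut riding a horse"

low_res_latents = base(prompt, output_type="latent", generator=generator).images
assert low_res_latents.shape[1] == 4  # 4 channels -> treated as latents, not pixels

upscaled = upscaler(
    prompt=prompt,
    image=low_res_latents,
    num_inference_steps=20,
    guidance_scale=0,
    generator=generator,
).images[0]
```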
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -542,12 +542,12 @@ class StableDiffusionUpscalePipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: int = None, @@ -558,7 +558,7 @@ class StableDiffusionUpscalePipeline( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -577,14 +577,14 @@ class StableDiffusionUpscalePipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -594,7 +594,7 @@ class StableDiffusionUpscalePipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
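The `callback` hunks are where the old hint was most visibly wrong: in an fp16 run the latents handed to the callback are half precision, so user code annotated with `torch.Tensor`, as in this minimal sketch, is now consistent with what the pipeline sends:

```python
import torch

def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # With an fp16 pipeline this prints torch.float16 -- a tensor the old
    # `torch.FloatTensor` annotation claimed could never arrive here.
    print(f"step {step:3d}  t={timestep}  dtype={latents.dtype}  std={latents.std():.3f}")

# usage sketch: pipe(prompt, callback=log_latents, callback_steps=5)
```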
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 02ddc65c71..1839c45110 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -257,8 +257,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -290,8 +290,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -311,10 +311,10 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
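Pre-computed embeddings are the common thread running through these hunks. A sketch of hand-building `prompt_embeds` (the "prompt weighting" use case the docstrings name) and feeding it to the pix2pix pipeline from earlier in this patch; the checkpoint name and the blank stand-in image are assumptions for illustration:

```python
import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")

tokens = pipe.tokenizer(
    "turn the sky green",
    padding="max_length",
    max_length=pipe.tokenizer.model_max_length,
    return_tensors="pt",
)
with torch.no_grad():
    # fp16 output: a valid `torch.Tensor`, but never a `torch.FloatTensor`
    prompt_embeds = pipe.text_encoder(tokens.input_ids.to("cuda"))[0]

init_image = Image.new("RGB", (512, 512))  # stand-in for a real photo
result = pipe(image=init_image, prompt_embeds=prompt_embeds).images[0]
```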
@@ -588,7 +588,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver self, image_embeds: torch.Tensor, noise_level: int, - noise: Optional[torch.FloatTensor] = None, + noise: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, ): """ @@ -644,19 +644,19 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, # prior args prior_num_inference_steps: int = 25, prior_guidance_scale: float = 4.0, - prior_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): """ @@ -686,14 +686,14 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -702,7 +702,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
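The `prior_*` arguments in the signature above drive the CLIP prior stage that runs before the image decoder. A usage sketch; the checkpoint name is an assumption (a commonly used set of weights for this pipeline), not something the patch fixes:

```python
import torch
from diffusers import StableUnCLIPPipeline

pipe = StableUnCLIPPipeline.from_pretrained(
    "fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    "a photo of an intricate origami crane",
    prior_num_inference_steps=25,  # steps for the prior stage only
    prior_guidance_scale=4.0,      # CFG for the prior, separate from the decoder's
    noise_level=0,                 # extra noise on the image embedding, per the docstring
).images[0]
```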
@@ -718,7 +718,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver prior_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - prior_latents (`torch.FloatTensor`, *optional*): + prior_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image embedding generation in the prior denoising process. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 134ec39eff..13915b39f0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -166,8 +166,8 @@ class StableUnCLIPImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -254,8 +254,8 @@ class StableUnCLIPImg2ImgPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -275,10 +275,10 @@ class StableUnCLIPImg2ImgPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
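In the img2img variant whose hunks follow, the input image is reduced to a CLIP embedding rather than VAE latents, and `image_embeds` lets you supply that embedding directly. A sketch assuming the public `stabilityai/stable-diffusion-2-1-unclip` weights and a placeholder image:

```python
import torch
from PIL import Image
from diffusers import StableUnCLIPImg2ImgPipeline

pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
).to("cuda")

init = Image.new("RGB", (768, 768))  # stand-in for a real source image
out = pipe(image=init, prompt="a watercolor landscape", noise_level=50).images[0]

# Or skip the image encoder and condition on pre-computed CLIP embeddings;
# any float dtype passes now that the parameter is typed `torch.Tensor`
# (`my_clip_embeds` is a hypothetical pre-computed tensor):
# out = pipe(image_embeds=my_clip_embeds, prompt="a watercolor landscape").images[0]
```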
@@ -537,7 +537,7 @@ class StableUnCLIPImg2ImgPipeline( and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -569,7 +569,7 @@ class StableUnCLIPImg2ImgPipeline( self, image_embeds: torch.Tensor, noise_level: int, - noise: Optional[torch.FloatTensor] = None, + noise: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, ): """ @@ -615,7 +615,7 @@ class StableUnCLIPImg2ImgPipeline( @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, prompt: Union[str, List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -625,16 +625,16 @@ class StableUnCLIPImg2ImgPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, - image_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): r""" @@ -644,7 +644,7 @@ class StableUnCLIPImg2ImgPipeline( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, either `prompt_embeds` will be used or prompt is initialized to `""`. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image` or tensor representing an image batch. The image is encoded to its CLIP embedding which the `unet` is conditioned on. The image is _not_ encoded by the `vae` and then used as the latents in the denoising process like it is in the standard Stable Diffusion text-guided image variation process. @@ -669,14 +669,14 @@ class StableUnCLIPImg2ImgPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -685,7 +685,7 @@ class StableUnCLIPImg2ImgPipeline( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -695,7 +695,7 @@ class StableUnCLIPImg2ImgPipeline( noise_level (`int`, *optional*, defaults to `0`): The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated CLIP embeddings to condition the `unet` on. These latents are not used in the denoising process. If you want to provide pre-generated latents, pass them to `__call__` as `latents`. clip_skip (`int`, *optional*): diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 3e6dec3e0b..3a0e86409e 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -100,7 +100,7 @@ class StableDiffusionSafetyChecker(PreTrainedModel): return images, has_nsfw_concepts @torch.no_grad() - def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor): + def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor): pooled_output = self.vision_model(clip_input)[1] # pooled_output image_embeds = self.visual_projection(pooled_output) diff --git a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py index 2adcb0a8c0..347d573e1f 100644 --- a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py @@ -254,8 +254,8 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -287,8 +287,8 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ 
-308,10 +308,10 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -746,12 +746,12 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM num_images_per_prompt: int = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, max_iter_to_alter: int = 25, @@ -789,14 +789,14 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -806,7 +806,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
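Attend-and-Excite's extra knobs (`token_indices`, `max_iter_to_alter`) sit alongside the retyped tensor arguments above. A short sketch; the indices are illustrative and can be looked up with the pipeline's `get_indices` helper, and the checkpoint is the one commonly paired with this pipeline:

```python
import torch
from diffusers import StableDiffusionAttendAndExcitePipeline

pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
).to("cuda")

prompt = "a cat and a frog"
print(pipe.get_indices(prompt))  # token -> index map to choose from

image = pipe(
    prompt,
    token_indices=[2, 5],  # boost attention on "cat" and "frog" (illustrative)
    max_iter_to_alter=25,  # latent updates only during the first 25 steps
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
```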
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py index e89d7f77e8..1323542e40 100644 --- a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -53,7 +53,7 @@ class DiffEditInversionPipelineOutput(BaseOutput): Output class for Stable Diffusion pipelines. Args: - latents (`torch.FloatTensor`) + latents (`torch.Tensor`) inverted latents tensor images (`List[PIL.Image.Image]` or `np.ndarray`) List of denoised PIL images of length `num_timesteps * batch_size` or numpy array of shape `(num_timesteps, @@ -61,7 +61,7 @@ class DiffEditInversionPipelineOutput(BaseOutput): diffusion pipeline. """ - latents: torch.FloatTensor + latents: torch.Tensor images: Union[List[PIL.Image.Image], np.ndarray] @@ -381,8 +381,8 @@ class StableDiffusionDiffEditPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -414,8 +414,8 @@ class StableDiffusionDiffEditPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -435,10 +435,10 @@ class StableDiffusionDiffEditPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -831,15 +831,15 @@ class StableDiffusionDiffEditPipeline( @replace_example_docstring(EXAMPLE_DOC_STRING) def generate_mask( self, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, target_prompt: Optional[Union[str, List[str]]] = None, target_negative_prompt: Optional[Union[str, List[str]]] = None, - target_prompt_embeds: Optional[torch.FloatTensor] = None, - target_negative_prompt_embeds: Optional[torch.FloatTensor] = None, + target_prompt_embeds: Optional[torch.Tensor] = None, + target_negative_prompt_embeds: Optional[torch.Tensor] = None, source_prompt: Optional[Union[str, List[str]]] = None, source_negative_prompt: Optional[Union[str, List[str]]] = None, - source_prompt_embeds: Optional[torch.FloatTensor] = None, - source_negative_prompt_embeds: Optional[torch.FloatTensor] = None, + source_prompt_embeds: Optional[torch.Tensor] = None, + source_negative_prompt_embeds: Optional[torch.Tensor] = None, num_maps_per_mask: Optional[int] = 10, mask_encode_strength: Optional[float] = 0.5, mask_thresholding_ratio: Optional[float] = 3.0, @@ -861,10 +861,10 @@ class StableDiffusionDiffEditPipeline( target_negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - target_prompt_embeds (`torch.FloatTensor`, *optional*): + target_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - target_negative_prompt_embeds (`torch.FloatTensor`, *optional*): + target_negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. source_prompt (`str` or `List[str]`, *optional*): @@ -873,11 +873,11 @@ class StableDiffusionDiffEditPipeline( source_negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide semantic mask generation away from using DiffEdit. If not defined, you need to pass `source_negative_prompt_embeds` or `source_image` instead. - source_prompt_embeds (`torch.FloatTensor`, *optional*): + source_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings to guide the semantic mask generation. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from `source_prompt` input argument. - source_negative_prompt_embeds (`torch.FloatTensor`, *optional*): + source_negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings to negatively guide the semantic mask generation. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from `source_negative_prompt` input argument. 
@@ -1051,18 +1051,18 @@ class StableDiffusionDiffEditPipeline( def invert( self, prompt: Optional[Union[str, List[str]]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, num_inference_steps: int = 50, inpaint_strength: float = 0.8, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, decode_latents: bool = False, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, lambda_auto_corr: float = 20.0, @@ -1095,10 +1095,10 @@ class StableDiffusionDiffEditPipeline( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. decode_latents (`bool`, *optional*, defaults to `False`): @@ -1111,7 +1111,7 @@ class StableDiffusionDiffEditPipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
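The three DiffEdit entry points retyped in this file (`generate_mask`, `invert`, `__call__`) chain together, and `invert` returns the `DiffEditInversionPipelineOutput` whose `latents` field was also retyped above. A sketch of the full flow, with an assumed checkpoint and a placeholder source image:

```python
import torch
from PIL import Image
from diffusers import StableDiffusionDiffEditPipeline, DDIMScheduler, DDIMInverseScheduler

pipe = StableDiffusionDiffEditPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)

raw = Image.new("RGB", (768, 768))  # stand-in for the real source image

mask = pipe.generate_mask(
    image=raw, source_prompt="a bowl of fruits", target_prompt="a bowl of pears"
)
inv = pipe.invert(prompt="a bowl of fruits", image=raw)  # .latents is a torch.Tensor now

edited = pipe(
    prompt="a bowl of pears", mask_image=mask, image_latents=inv.latents
).images[0]
```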
@@ -1289,8 +1289,8 @@ class StableDiffusionDiffEditPipeline( def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, - image_latents: Union[torch.FloatTensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, + image_latents: Union[torch.Tensor, PIL.Image.Image] = None, inpaint_strength: Optional[float] = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, @@ -1298,12 +1298,12 @@ class StableDiffusionDiffEditPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: int = None, @@ -1319,7 +1319,7 @@ class StableDiffusionDiffEditPipeline( repainted, while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, 1, H, W)`. - image_latents (`PIL.Image.Image` or `torch.FloatTensor`): + image_latents (`PIL.Image.Image` or `torch.Tensor`): Partially noised image latents from the inversion process to be used as inputs for image generation. inpaint_strength (`float`, *optional*, defaults to 0.8): Indicates extent to inpaint the masked area. Must be between 0 and 1. When `inpaint_strength` is 1, the @@ -1343,14 +1343,14 @@ class StableDiffusionDiffEditPipeline( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -1360,7 +1360,7 @@ class StableDiffusionDiffEditPipeline( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. 
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py index 94043b7285..4b96e317dc 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py @@ -180,8 +180,8 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -213,8 +213,8 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -234,10 +234,10 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -541,12 +541,12 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -592,14 +592,14 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -609,7 +609,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin): plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
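GLIGEN adds grounded inputs on top of the retyped arguments above: phrases paired with normalized bounding boxes. A sketch with the public text-box checkpoint (assumed) and illustrative coordinates:

```python
import torch
from diffusers import StableDiffusionGLIGENPipeline

pipe = StableDiffusionGLIGENPipeline.from_pretrained(
    "masterful/gligen-1-4-generation-text-box", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    prompt="a birthday cake on a wooden table",
    gligen_phrases=["a birthday cake"],
    gligen_boxes=[[0.25, 0.35, 0.75, 0.8]],  # normalized (xmin, ymin, xmax, ymax)
    gligen_scheduled_sampling_beta=1.0,      # fraction of steps with grounding active
    num_inference_steps=50,
).images[0]
```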
diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py index c20e940b4d..963f29f21f 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -238,8 +238,8 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -259,10 +259,10 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -705,12 +705,12 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, gligen_normalize_constant: float = 28.7, @@ -764,14 +764,14 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -781,7 +781,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py index e2096be7e8..067ae5f6f1 100755 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -154,8 +154,8 @@ class StableDiffusionKDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -187,8 +187,8 @@ class StableDiffusionKDiffusionPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -208,10 +208,10 @@ class StableDiffusionKDiffusionPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -469,12 +469,12 @@ class StableDiffusionKDiffusionPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, use_karras_sigmas: Optional[bool] = False, noise_sampler_seed: Optional[int] = None, @@ -512,14 +512,14 @@ class StableDiffusionKDiffusionPipeline( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -531,7 +531,7 @@ class StableDiffusionKDiffusionPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step.
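The `use_karras_sigmas` flag in the signature above belongs to the k-diffusion backend, where the sampler is selected by name rather than by a scheduler object. A sketch; it needs the external `k-diffusion` package installed, and the checkpoint is an assumption:

```python
import torch
from diffusers import StableDiffusionKDiffusionPipeline

pipe = StableDiffusionKDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float16
).to("cuda")
pipe.set_scheduler("sample_dpmpp_2m")  # pick a k-diffusion sampler by name

image = pipe(
    "an astronaut in a zen garden",
    use_karras_sigmas=True,  # Karras et al. noise schedule, per the new argument
    num_inference_steps=25,
).images[0]
```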
diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py index 3cfda4064d..f7ca0c7c4d 100644 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py @@ -207,10 +207,10 @@ class StableDiffusionXLKDiffusionPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -236,17 +236,17 @@ class StableDiffusionXLKDiffusionPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -584,11 +584,11 @@ class StableDiffusionXLKDiffusionPipeline( negative_prompt_2: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, original_size: Optional[Tuple[int, int]] = None, @@ -642,21 +642,21 @@ class StableDiffusionXLKDiffusionPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument.
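The SDXL variant carries four embedding tensors instead of two; `encode_prompt` (retyped in the previous hunk) produces all of them, and the pooled pair must accompany `prompt_embeds` when the string prompt is bypassed. A sketch under the same assumptions as the previous example, including the sampler-selection helper:

```python
import torch
from diffusers import StableDiffusionXLKDiffusionPipeline

pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.set_scheduler("sample_dpmpp_2m")  # sampler selection mirrors the SD version

# All four come back as plain `torch.Tensor`s (fp16 here).
pe, npe, ppe, nppe = pipe.encode_prompt("a red panda, studio lighting", device="cuda")

image = pipe(
    prompt_embeds=pe,
    negative_prompt_embeds=npe,
    pooled_prompt_embeds=ppe,
    negative_pooled_prompt_embeds=nppe,
).images[0]
```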
diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index fcb6db4574..a100a38b04 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -254,8 +254,8 @@ class StableDiffusionLDM3DPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -287,8 +287,8 @@ class StableDiffusionLDM3DPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -308,10 +308,10 @@ class StableDiffusionLDM3DPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -666,7 +666,7 @@ class StableDiffusionLDM3DPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -679,7 +679,7 @@ class StableDiffusionLDM3DPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -740,11 +740,11 @@ class StableDiffusionLDM3DPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -789,19 +789,19 @@ class StableDiffusionLDM3DPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. 
If not diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 31af62b609..2b80d28568 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -226,8 +226,8 @@ class StableDiffusionPanoramaPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -259,8 +259,8 @@ class StableDiffusionPanoramaPipeline( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -280,10 +280,10 @@ class StableDiffusionPanoramaPipeline( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -677,7 +677,7 @@ class StableDiffusionPanoramaPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -690,7 +690,7 @@ class StableDiffusionPanoramaPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -790,11 +790,11 @@ class StableDiffusionPanoramaPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -839,19 +839,19 @@ class StableDiffusionPanoramaPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. 
If not diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 63b8c6108a..cd59cf5186 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -516,11 +516,11 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, sld_guidance_scale: Optional[float] = 1000, sld_warmup_steps: Optional[int] = 10, @@ -555,7 +555,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -568,7 +568,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
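The `callback` annotation updated above is the old-style per-step hook. A minimal conforming callback, with a hypothetical usage line (the hook itself is deprecated in favor of `callback_on_step_end`, but the signature shown follows the docstring):

```python
import torch

def on_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Conforms to the documented legacy signature; with the widened hint the
    # latents may arrive as fp16/bf16 rather than only float32.
    print(f"step={step} t={timestep} dtype={latents.dtype} shape={tuple(latents.shape)}")

# Hypothetical usage with a pipeline that still accepts the legacy hook:
# pipe(prompt, callback=on_step, callback_steps=5)
```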
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py b/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py index 549747e971..338e4c65c5 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py @@ -85,7 +85,7 @@ class SafeStableDiffusionSafetyChecker(PreTrainedModel): return images, has_nsfw_concepts @torch.no_grad() - def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor): + def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor): pooled_output = self.vision_model(clip_input)[1] # pooled_output image_embeds = self.visual_projection(pooled_output) diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index cb29ce386f..04c62efb46 100644 --- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -169,8 +169,8 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -202,8 +202,8 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -223,10 +223,10 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
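The `prompt_embeds` / `negative_prompt_embeds` parameters re-typed above exist so the text embeddings can be computed once and reused across calls. A sketch under assumptions (the checkpoint id is only an example; the two-tensor return layout follows the standard Stable Diffusion pipelines):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16  # example checkpoint
).to("cuda")

prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    "an astronaut riding a horse",
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="low quality",
)

# Reuse across calls without re-running the text encoder:
image = pipe(prompt_embeds=prompt_embeds,
             negative_prompt_embeds=negative_prompt_embeds).images[0]
```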
@@ -570,14 +570,14 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -611,19 +611,19 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -633,7 +633,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
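Several files in this patch, including the LDM3D and panorama pipelines above and the SDXL family below, also widen the return annotation of `get_guidance_scale_embedding`. For context, the function builds a sinusoidal embedding of the guidance scale `w`; the sketch below is reconstructed from the docstring and the `assert len(w.shape) == 1` / `w = w * 1000.0` context lines visible in the hunks, so the remaining details should be read as assumptions:

```python
import torch

def get_guidance_scale_embedding(
    w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    # Sinusoidal embedding of the guidance scale, following the vdm
    # reference linked from the docstring (model_vdm.py#L298).
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad odd dims
        emb = torch.nn.functional.pad(emb, (0, 1))
    assert emb.shape == (w.shape[0], embedding_dim)
    return emb

print(get_guidance_scale_embedding(torch.tensor([7.5])).shape)  # torch.Size([1, 512])
```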
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index b7bc832ec3..52d0b07fb3 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -282,10 +282,10 @@ class StableDiffusionXLPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -311,17 +311,17 @@ class StableDiffusionXLPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -762,7 +762,7 @@ class StableDiffusionXLPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -775,7 +775,7 @@ class StableDiffusionXLPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -843,13 +843,13 @@ class StableDiffusionXLPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -924,26 +924,26 @@ class StableDiffusionXLPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. 
If not diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index ada981d4de..b8698a0083 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -303,10 +303,10 @@ class StableDiffusionXLImg2ImgPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -332,17 +332,17 @@ class StableDiffusionXLImg2ImgPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -902,7 +902,7 @@ class StableDiffusionXLImg2ImgPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -915,7 +915,7 @@ class StableDiffusionXLImg2ImgPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -988,13 +988,13 @@ class StableDiffusionXLImg2ImgPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1022,7 +1022,7 @@ class StableDiffusionXLImg2ImgPipeline( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. strength (`float`, *optional*, defaults to 0.3): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -1078,26 +1078,26 @@ class StableDiffusionXLImg2ImgPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 9361cc6a13..38f5cec931 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -531,10 +531,10 @@ class StableDiffusionXLInpaintPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -560,17 +560,17 @@ class StableDiffusionXLInpaintPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
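The `ip_adapter_image_embeds` entries updated in the SDXL pipelines above describe a list with one tensor per loaded IP-Adapter, each shaped `(batch_size, num_images, emb_dim)`, with the negative image embedding included under classifier-free guidance. A shape-only sketch (all sizes hypothetical; the exact negative/positive layout is left to the pipeline's `prepare_ip_adapter_image_embeds` helper):

```python
import torch

num_adapters, batch_size, num_images, emb_dim = 2, 1, 1, 768  # hypothetical sizes

# One tensor per loaded IP-Adapter, shaped (batch_size, num_images, emb_dim)
# per the docstring; under classifier-free guidance the negative image
# embedding must be included as well.
ip_adapter_image_embeds = [
    torch.randn(batch_size, num_images, emb_dim) for _ in range(num_adapters)
]
# pipe(prompt, ip_adapter_image_embeds=ip_adapter_image_embeds)  # hypothetical usage
```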
@@ -1132,7 +1132,7 @@ class StableDiffusionXLInpaintPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -1145,7 +1145,7 @@ class StableDiffusionXLInpaintPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1207,7 +1207,7 @@ class StableDiffusionXLInpaintPipeline( prompt_2: Optional[Union[str, List[str]]] = None, image: PipelineImageInput = None, mask_image: PipelineImageInput = None, - masked_image_latents: torch.FloatTensor = None, + masked_image_latents: torch.Tensor = None, height: Optional[int] = None, width: Optional[int] = None, padding_mask_crop: Optional[int] = None, @@ -1223,13 +1223,13 @@ class StableDiffusionXLInpaintPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1329,22 +1329,22 @@ class StableDiffusionXLInpaintPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. 
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -1357,7 +1357,7 @@ class StableDiffusionXLInpaintPipeline( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index d9380020b3..3cdef82303 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -222,10 +222,10 @@ class StableDiffusionXLInstructPix2PixPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -250,17 +250,17 @@ class StableDiffusionXLInstructPix2PixPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -622,14 +622,14 @@ class StableDiffusionXLInstructPix2PixPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -647,7 +647,7 @@ class StableDiffusionXLInstructPix2PixPipeline( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -689,21 +689,21 @@ class StableDiffusionXLInstructPix2PixPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -715,7 +715,7 @@ class StableDiffusionXLInstructPix2PixPipeline( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py index f457cdbdb1..70d06bb632 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py @@ -21,7 +21,7 @@ class StableDiffusionXLWatermarker: self.encoder.set_watermark("bits", self.watermark) - def apply_watermark(self, images: torch.FloatTensor): + def apply_watermark(self, images: torch.Tensor): # can't encode images that are smaller than 256 if images.shape[-1] < 256: return images diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index d815adab04..a805a660e8 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -128,12 +128,12 @@ class StableVideoDiffusionPipelineOutput(BaseOutput): Output class for Stable Video Diffusion pipeline. Args: - frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]): + frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]): List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size, num_frames, height, width, num_channels)`. 
""" - frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor] + frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor] class StableVideoDiffusionPipeline(DiffusionPipeline): @@ -186,7 +186,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): device: Union[str, torch.device], num_videos_per_prompt: int, do_classifier_free_guidance: bool, - ) -> torch.FloatTensor: + ) -> torch.Tensor: dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): @@ -279,7 +279,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): return add_time_ids - def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14): + def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14): # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] latents = latents.flatten(0, 1) @@ -315,7 +315,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -332,7 +332,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): dtype: torch.dtype, device: Union[str, torch.device], generator: torch.Generator, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ): shape = ( batch_size, @@ -377,7 +377,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor], height: int = 576, width: int = 1024, num_frames: Optional[int] = None, @@ -391,7 +391,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): decode_chunk_size: Optional[int] = None, num_videos_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], @@ -401,7 +401,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): The call function to the pipeline for generation. Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -440,7 +440,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
@@ -465,8 +465,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): Returns: [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is - returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) - is returned. + returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is + returned. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 5b8dbafdec..ab4ab87d2a 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -271,8 +271,8 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -304,8 +304,8 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -325,10 +325,10 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -635,7 +635,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -648,7 +648,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -690,12 +690,12 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, adapter_conditioning_scale: Union[float, List[float]] = 1.0, @@ -708,9 +708,9 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -745,14 +745,14 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -764,7 +764,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): of a plain tuple. 
callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 6d742b4b7c..2aa2415a47 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -296,10 +296,10 @@ class StableDiffusionXLAdapterPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -325,17 +325,17 @@ class StableDiffusionXLAdapterPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
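The `callback` annotations retyped throughout this patch all share the contract documented above. A minimal sketch of a conforming callback, assuming an already-constructed pipeline object named `pipe` (the commented call is illustrative, not part of this patch):

    import torch

    def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
        # Matches the documented contract `callback(step: int, timestep: int, latents: torch.Tensor)`.
        # Under the new hint, fp16 or CUDA latents are valid as well; the old
        # `torch.FloatTensor` annotation named only the float32 CPU tensor class.
        print(f"step={step} timestep={timestep} latent std={latents.std().item():.4f}")

    # pipe(prompt="a photo of a cat", callback=log_latents, callback_steps=5)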
@@ -806,7 +806,7 @@ class StableDiffusionXLAdapterPipeline( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -819,7 +819,7 @@ class StableDiffusionXLAdapterPipeline( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -864,16 +864,16 @@ class StableDiffusionXLAdapterPipeline( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -897,9 +897,9 @@ class StableDiffusionXLAdapterPipeline( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. `PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. Anything below 512 pixels won't work well for @@ -948,26 +948,26 @@ class StableDiffusionXLAdapterPipeline( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic.
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -980,7 +980,7 @@ class StableDiffusionXLAdapterPipeline( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
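Both adapter pipelines also retype the return of `get_guidance_scale_embedding`, whose body (unchanged by this commit) is only partially visible in the hunks. For orientation, a self-contained sketch of the sinusoidal embedding it computes, reconstructed under the assumption that the body matches the latent-consistency implementation it is marked as copied from:

    import torch

    def get_guidance_scale_embedding(
        w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
    ) -> torch.Tensor:
        # See the VDM reference in the docstring; `w` is a 1-D batch of guidance scales.
        assert len(w.shape) == 1
        w = w * 1000.0
        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:
            emb = torch.nn.functional.pad(emb, (0, 1))  # zero-pad odd embedding dims
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

    # get_guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=8).shape == (1, 8)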
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 0ef769f32a..69828fd208 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -114,8 +114,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -147,8 +147,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -168,10 +168,10 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -442,12 +442,12 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "np", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -482,25 +482,25 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + The output format of the generated video. Choose between `torch.Tensor` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 0dc1ca93f8..d903974071 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -149,8 +149,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -182,8 +182,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -203,10 +203,10 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -499,19 +499,19 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv def __call__( self, prompt: Union[str, List[str]] = None, - video: Union[List[np.ndarray], torch.FloatTensor] = None, + video: Union[List[np.ndarray], torch.Tensor] = None, strength: float = 0.6, num_inference_steps: int = 50, guidance_scale: float = 15.0, negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "np", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -522,7 +522,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - video (`List[np.ndarray]` or `torch.FloatTensor`): + video (`List[np.ndarray]` or `torch.Tensor`): `video` frames or tensor representing a video batch to be used as the starting point for the process. Can also accept video latents as `image`, if passing latents directly, it will not be encoded again. strength (`float`, *optional*, defaults to 0.8): @@ -546,25 +546,25 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + The output format of the generated video. Choose between `torch.Tensor` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index dddd650796..5ba6713592 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -392,7 +392,7 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -529,12 +529,12 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, motion_field_strength_x: float = 12, motion_field_strength_y: float = 12, output_type: Optional[str] = "tensor", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, t0: int = 44, t1: int = 47, @@ -569,7 +569,7 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -581,7 +581,7 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn a plain tuple. 
callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -795,8 +795,8 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -816,10 +816,10 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py index 2dbd928b91..2679fd1ea5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py @@ -581,10 +581,10 @@ class TextToVideoZeroSDXLPipeline( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -610,17 +610,17 @@ class TextToVideoZeroSDXLPipeline( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -861,7 +861,7 @@ class TextToVideoZeroSDXLPipeline( `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -933,16 +933,16 @@ class TextToVideoZeroSDXLPipeline( eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, frame_ids: Optional[List[int]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + latents: Optional[torch.Tensor] = None, motion_field_strength_x: float = 12, motion_field_strength_y: float = 12, output_type: Optional[str] = "tensor", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -1002,21 +1002,21 @@ class TextToVideoZeroSDXLPipeline( frame_ids (`List[int]`, *optional*): Indexes of the frames that are being generated. This is used when generating longer videos chunk-by-chunk. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -1034,7 +1034,7 @@ class TextToVideoZeroSDXLPipeline( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py index 72e5b31139..25c6739d87 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -217,9 +217,9 @@ class UnCLIPPipeline(DiffusionPipeline): decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prior_latents: Optional[torch.FloatTensor] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, text_attention_mask: Optional[torch.Tensor] = None, prior_guidance_scale: float = 4.0, @@ -248,11 +248,11 @@ class UnCLIPPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prior_latents (`torch.FloatTensor` of shape (batch size, embeddings dimension), *optional*): + prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*): Pre-generated noisy latents to be used as inputs for the prior. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. 
- super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. prior_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py index 6c646a7df3..2a0e7e90e4 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -199,13 +199,13 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline): @torch.no_grad() def __call__( self, - image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]] = None, + image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor]] = None, num_images_per_prompt: int = 1, decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[torch.Generator] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, image_embeddings: Optional[torch.Tensor] = None, decoder_guidance_scale: float = 8.0, output_type: Optional[str] = "pil", @@ -215,7 +215,7 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline): The call function to the pipeline for generation. Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): `Image` or tensor representing an image batch to be used as the starting point. If you provide a tensor, it needs to be compatible with the [`CLIPImageProcessor`] [configuration](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). @@ -231,9 +231,9 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline): generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. 
decoder_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text diff --git a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py index bf0a4eb475..75e5d43678 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py @@ -220,7 +220,7 @@ class UniDiffuserTextDecoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds` must be supplied. - input_embeds (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + input_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): An embedded representation to directly pass to the transformer as a prefix for beam search. One of `input_ids` and `input_embeds` must be supplied. device: diff --git a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py index 6579e272a3..abc51edf6d 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py @@ -739,8 +739,7 @@ class UTransformer2DModel(ModelMixin, ConfigMixin): """ Args: hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. - When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input - hidden_states + When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. @@ -1038,9 +1037,9 @@ class UniDiffuserModel(ModelMixin, ConfigMixin): def forward( self, - latent_image_embeds: torch.FloatTensor, - image_embeds: torch.FloatTensor, - prompt_embeds: torch.FloatTensor, + latent_image_embeds: torch.Tensor, + image_embeds: torch.Tensor, + prompt_embeds: torch.Tensor, timestep_img: Union[torch.Tensor, float, int], timestep_text: Union[torch.Tensor, float, int], data_type: Optional[Union[torch.Tensor, float, int]] = 1, @@ -1049,11 +1048,11 @@ class UniDiffuserModel(ModelMixin, ConfigMixin): ): """ Args: - latent_image_embeds (`torch.FloatTensor` of shape `(batch size, latent channels, height, width)`): + latent_image_embeds (`torch.Tensor` of shape `(batch size, latent channels, height, width)`): Latent image representation from the VAE encoder. - image_embeds (`torch.FloatTensor` of shape `(batch size, 1, clip_img_dim)`): + image_embeds (`torch.Tensor` of shape `(batch size, 1, clip_img_dim)`): CLIP-embedded image representation (unsqueezed in the first dimension). - prompt_embeds (`torch.FloatTensor` of shape `(batch size, seq_len, text_dim)`): + prompt_embeds (`torch.Tensor` of shape `(batch size, seq_len, text_dim)`): CLIP-embedded text representation. timestep_img (`torch.long` or `float` or `int`): Current denoising step for the image. 
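The retyped `UniDiffuserModel.forward` arguments above each carry a documented shape; a quick sanity-check sketch with dummy tensors (the concrete dimension values below are illustrative assumptions, not taken from this patch):

    import torch

    # Shapes follow the forward() docstring above.
    latent_image_embeds = torch.randn(2, 4, 64, 64)  # (batch, latent channels, height, width)
    image_embeds = torch.randn(2, 1, 512)            # (batch, 1, clip_img_dim)
    prompt_embeds = torch.randn(2, 77, 768)          # (batch, seq_len, text_dim)

    # Under the new `torch.Tensor` hints, half-precision or GPU tensors are
    # equally valid inputs; `torch.FloatTensor` named only the float32 CPU class.
    prompt_embeds = prompt_embeds.half()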
diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 5d61b1054e..44a03ef56e 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -304,7 +304,7 @@ class UniDiffuserPipeline(DiffusionPipeline): if isinstance(image, PIL.Image.Image): batch_size = 1 else: - # Image must be available and type either PIL.Image.Image or torch.FloatTensor. + # Image must be available and type either PIL.Image.Image or torch.Tensor. # Not currently supporting something like image_embeds. batch_size = image.shape[0] multiplier = num_prompts_per_image @@ -353,8 +353,8 @@ class UniDiffuserPipeline(DiffusionPipeline): num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -386,8 +386,8 @@ class UniDiffuserPipeline(DiffusionPipeline): num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -407,10 +407,10 @@ class UniDiffuserPipeline(DiffusionPipeline): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
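The comment retyped at the top of this file belongs to UniDiffuser's batch-size inference: a single `PIL.Image.Image` counts as batch size 1, while a tensor contributes its leading dimension. Restated as a standalone helper for illustration (the function name is ours, not part of the pipeline):

    import PIL.Image
    import torch
    from typing import Union

    def infer_image_batch_size(image: Union[PIL.Image.Image, torch.Tensor]) -> int:
        # Mirrors the branch in the hunk above; inputs must be either a
        # PIL.Image.Image or a torch.Tensor (image_embeds-style inputs are
        # not currently supported).
        if isinstance(image, PIL.Image.Image):
            return 1
        return image.shape[0]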
@@ -1080,7 +1080,7 @@ class UniDiffuserPipeline(DiffusionPipeline): def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, + image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None, height: Optional[int] = None, width: Optional[int] = None, data_type: Optional[int] = 1, @@ -1091,15 +1091,15 @@ class UniDiffuserPipeline(DiffusionPipeline): num_prompts_per_image: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_latents: Optional[torch.FloatTensor] = None, - vae_latents: Optional[torch.FloatTensor] = None, - clip_latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_latents: Optional[torch.Tensor] = None, + vae_latents: Optional[torch.Tensor] = None, + clip_latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -1109,7 +1109,7 @@ class UniDiffuserPipeline(DiffusionPipeline): prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. Required for text-conditioned image generation (`text2img`) mode. - image (`torch.FloatTensor` or `PIL.Image.Image`, *optional*): + image (`torch.Tensor` or `PIL.Image.Image`, *optional*): `Image` or tensor representing an image batch. Required for image-conditioned text generation (`img2text`) mode. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -1144,29 +1144,29 @@ class UniDiffuserPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for joint image-text generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. This assumes a full set of VAE, CLIP, and text latents, if supplied, overrides the value of `prompt_latents`, `vae_latents`, and `clip_latents`. - prompt_latents (`torch.FloatTensor`, *optional*): + prompt_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for text generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - vae_latents (`torch.FloatTensor`, *optional*): + vae_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
- clip_latents (`torch.FloatTensor`, *optional*): + clip_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. Used in text-conditioned image generation (`text2img`) mode. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are be generated from the `negative_prompt` input argument. Used in text-conditioned image generation (`text2img`) mode. @@ -1176,7 +1176,7 @@ class UniDiffuserPipeline(DiffusionPipeline): Whether or not to return a [`~pipelines.ImageTextPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py index 3b21dfb5f1..b2cf8cbc97 100644 --- a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +++ b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py @@ -130,7 +130,7 @@ class PaellaVQModel(ModelMixin, ConfigMixin): ) @apply_forward_hook - def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput: h = self.in_block(x) h = self.down_blocks(h) @@ -141,8 +141,8 @@ class PaellaVQModel(ModelMixin, ConfigMixin): @apply_forward_hook def decode( - self, h: torch.FloatTensor, force_not_quantize: bool = True, return_dict: bool = True - ) -> Union[DecoderOutput, torch.FloatTensor]: + self, h: torch.Tensor, force_not_quantize: bool = True, return_dict: bool = True + ) -> Union[DecoderOutput, torch.Tensor]: if not force_not_quantize: quant, _, _ = self.vquantizer(h) else: @@ -155,10 +155,10 @@ class PaellaVQModel(ModelMixin, ConfigMixin): return DecoderOutput(sample=dec) - def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def forward(self, sample: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
""" diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py index e4277d58a0..b08421415b 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -209,7 +209,7 @@ class WuerstchenDecoderPipeline(DiffusionPipeline): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeddings: Union[torch.Tensor, List[torch.Tensor]], prompt: Union[str, List[str]] = None, num_inference_steps: int = 12, timesteps: Optional[List[float]] = None, @@ -217,7 +217,7 @@ class WuerstchenDecoderPipeline(DiffusionPipeline): negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -228,7 +228,7 @@ class WuerstchenDecoderPipeline(DiffusionPipeline): Function invoked when calling the pipeline for generation. Args: - image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embedding (`torch.Tensor` or `List[torch.Tensor]`): Image Embeddings either extracted from an image or generated by a Prior Model. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. @@ -252,7 +252,7 @@ class WuerstchenDecoderPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py index da6b5e0258..7819c8c0a0 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -154,11 +154,11 @@ class WuerstchenCombinedPipeline(DiffusionPipeline): decoder_timesteps: Optional[List[float]] = None, decoder_guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -176,10 +176,10 @@ class WuerstchenCombinedPipeline(DiffusionPipeline): negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -218,7 +218,7 @@ class WuerstchenCombinedPipeline(DiffusionPipeline): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 4640f76967..4dddd18c30 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -54,12 +54,12 @@ class WuerstchenPriorPipelineOutput(BaseOutput): Output class for WuerstchenPriorPipeline. Args: - image_embeddings (`torch.FloatTensor` or `np.ndarray`) + image_embeddings (`torch.Tensor` or `np.ndarray`) Prior image embeddings for text prompt """ - image_embeddings: Union[torch.FloatTensor, np.ndarray] + image_embeddings: Union[torch.Tensor, np.ndarray] class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): @@ -136,8 +136,8 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -288,11 +288,11 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): timesteps: List[float] = None, guidance_scale: float = 8.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pt", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -324,10 +324,10 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. 
Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -336,7 +336,7 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py b/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py index d776d989a1..f5f9bd256c 100644 --- a/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py +++ b/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py @@ -31,19 +31,19 @@ class KarrasVeOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - derivative (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + derivative (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Derivative of predicted original image sample (x_0). - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample (x_{0}) based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. 
""" - prev_sample: torch.FloatTensor - derivative: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + derivative: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None class KarrasVeScheduler(SchedulerMixin, ConfigMixin): @@ -94,21 +94,21 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin): # setable values self.num_inference_steps: int = None self.timesteps: np.IntTensor = None - self.schedule: torch.FloatTensor = None # sigma(t_i) + self.schedule: torch.Tensor = None # sigma(t_i) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -136,14 +136,14 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin): self.schedule = torch.tensor(schedule, dtype=torch.float32, device=device) def add_noise_to_input( - self, sample: torch.FloatTensor, sigma: float, generator: Optional[torch.Generator] = None - ) -> Tuple[torch.FloatTensor, float]: + self, sample: torch.Tensor, sigma: float, generator: Optional[torch.Generator] = None + ) -> Tuple[torch.Tensor, float]: """ Explicit Langevin-like "churn" step of adding noise to the sample according to a `gamma_i ≥ 0` to reach a higher noise level `sigma_hat = sigma_i + gamma_i*sigma_i`. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. sigma (`float`): generator (`torch.Generator`, *optional*): @@ -163,10 +163,10 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, sigma_hat: float, sigma_prev: float, - sample_hat: torch.FloatTensor, + sample_hat: torch.Tensor, return_dict: bool = True, ) -> Union[KarrasVeOutput, Tuple]: """ @@ -174,11 +174,11 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. sigma_hat (`float`): sigma_prev (`float`): - sample_hat (`torch.FloatTensor`): + sample_hat (`torch.Tensor`): return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`. @@ -202,25 +202,25 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin): def step_correct( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, sigma_hat: float, sigma_prev: float, - sample_hat: torch.FloatTensor, - sample_prev: torch.FloatTensor, - derivative: torch.FloatTensor, + sample_hat: torch.Tensor, + sample_prev: torch.Tensor, + derivative: torch.Tensor, return_dict: bool = True, ) -> Union[KarrasVeOutput, Tuple]: """ Corrects the predicted sample based on the `model_output` of the network. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. 
sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor`): TODO - sample_prev (`torch.FloatTensor`): TODO - derivative (`torch.FloatTensor`): TODO + sample_hat (`torch.Tensor`): TODO + sample_prev (`torch.Tensor`): TODO + derivative (`torch.Tensor`): TODO return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`. diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index 51fbe6a4dc..238b8d8691 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -29,16 +29,16 @@ class AmusedSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: torch.FloatTensor = None + prev_sample: torch.Tensor + pred_original_sample: torch.Tensor = None class AmusedScheduler(SchedulerMixin, ConfigMixin): @@ -70,7 +70,7 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: torch.long, sample: torch.LongTensor, starting_mask_ratio: int = 1, diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index 37efb7dc7c..d7af018b28 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -61,12 +61,12 @@ class ConsistencyDecoderSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class ConsistencyDecoderScheduler(SchedulerMixin, ConfigMixin): @@ -113,28 +113,28 @@ class ConsistencyDecoderScheduler(SchedulerMixin, ConfigMixin): def init_noise_sigma(self): return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]] - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. 
Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample * self.c_in[timestep] def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[ConsistencyDecoderSchedulerOutput, Tuple]: @@ -143,11 +143,11 @@ class ConsistencyDecoderScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`float`): The current timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 3abf52f0af..8b8da5d110 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -33,12 +33,12 @@ class CMStochasticIterativeSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): @@ -126,20 +126,18 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ # Get sigma corresponding to timestep @@ -278,7 +276,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): Args: - sigma (`torch.FloatTensor`): + sigma (`torch.Tensor`): The current sigma in the Karras sigma schedule. Returns: @@ -319,9 +317,9 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[CMStochasticIterativeSchedulerOutput, Tuple]: @@ -330,11 +328,11 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). 
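`scale_model_input` for the consistency schedulers normalizes the sample by the current noise level before it reaches the network. A sketch of the EDM-style `c_in` preconditioning this refers to, assuming the Karras et al. (2022) form in which the sample is divided by `sqrt(sigma**2 + sigma_data**2)` (the constant is assumed, not copied from the implementation):

```python
# Hedged sketch of consistency-model input preconditioning; sigma_data is an
# assumed data standard deviation, and the formula follows Karras et al. (2022).
import torch

sigma_data = 0.5                   # assumed data std used by consistency models
sigma = torch.tensor(2.0)          # current noise level
sample = torch.randn(1, 3, 8, 8) * sigma

c_in = 1.0 / (sigma**2 + sigma_data**2) ** 0.5
scaled = sample * c_in             # what the network actually sees
print(scaled.std())                # roughly unit scale regardless of sigma
```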
Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`float`): The current timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -417,10 +415,10 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 718514abf9..1c4b55e89a 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -35,16 +35,16 @@ class DDIMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -233,19 +233,19 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
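`rescale_zero_terminal_snr`, whose docstring appears above, shifts and rescales the schedule so the final timestep carries no signal. A hedged paraphrase of that procedure (Lin et al., 2023, arXiv:2305.08891), not the verbatim diffusers implementation:

```python
# Hedged paraphrase of zero-terminal-SNR beta rescaling as described above.
import torch


def rescale_zero_terminal_snr_sketch(betas: torch.Tensor) -> torch.Tensor:
    alphas_bar_sqrt = (1.0 - betas).cumprod(dim=0).sqrt()
    first, last = alphas_bar_sqrt[0].clone(), alphas_bar_sqrt[-1].clone()
    alphas_bar_sqrt -= last                    # shift so the last step has SNR 0
    alphas_bar_sqrt *= first / (first - last)  # rescale so the first step is kept
    alphas_bar = alphas_bar_sqrt**2
    alphas = alphas_bar[1:] / alphas_bar[:-1]  # recover per-step alphas
    alphas = torch.cat([alphas_bar[0:1], alphas])
    return 1.0 - alphas


betas = torch.linspace(1e-4, 2e-2, 1000)
new_betas = rescale_zero_terminal_snr_sketch(betas)
print(new_betas[-1])  # terminal beta is now 1.0, i.e. zero terminal SNR
```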
""" return sample @@ -261,7 +261,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -341,13 +341,13 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: """ @@ -355,11 +355,11 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -370,7 +370,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -470,10 +470,10 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -495,9 +495,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 9ca6ed4aca..1648f49530 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -33,16 +33,16 @@ class DDIMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -97,11 +97,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -231,19 +231,19 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin): self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps).copy().astype(np.int64)) # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
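`add_noise` and `get_velocity` above are the two standard forward-diffusion identities. A sketch with a toy `alpha_prod_t`, including a sanity check that the pair is mutually consistent (v-prediction per Salimans & Ho, 2022):

```python
# Hedged sketch of the identities behind `add_noise` and `get_velocity`.
import torch

alpha_prod_t = torch.tensor(0.6)  # toy cumulative alpha at timestep t
sample = torch.randn(2, 4, 8, 8)
noise = torch.randn_like(sample)

sqrt_ap = alpha_prod_t.sqrt()
sqrt_omap = (1 - alpha_prod_t).sqrt()

noisy = sqrt_ap * sample + sqrt_omap * noise     # add_noise
velocity = sqrt_ap * noise - sqrt_omap * sample  # get_velocity

# sanity check: the clean sample is recoverable from (noisy, velocity)
recovered = sqrt_ap * noisy - sqrt_omap * velocity
print(torch.allclose(recovered, sample, atol=1e-5))
```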
Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -288,9 +288,9 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: """ @@ -298,11 +298,11 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -311,7 +311,7 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin): because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would coincide with the one provided as input and `use_clipped_model_output` has no effect. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 995478f273..de13cd3077 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -35,16 +35,16 @@ class DDIMParallelSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -241,19 +241,19 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -283,7 +283,7 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -364,13 +364,13 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMParallelSchedulerOutput, Tuple]: """ @@ -378,9 +378,9 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped @@ -388,7 +388,7 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would coincide with the one provided as input and `use_clipped_model_output` will have no effect. generator: random number generator. - variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we + variance_noise (`torch.Tensor`): instead of generating noise for the variance using `generator`, we can directly provide the noise for the variance itself. This is useful for methods such as CycleDiffusion.
(https://arxiv.org/abs/2210.05559) return_dict (`bool`): option for returning tuple rather than DDIMParallelSchedulerOutput class @@ -486,12 +486,12 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): def batch_step_no_noise( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timesteps: List[int], - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise @@ -501,10 +501,10 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timesteps (`List[int]`): current discrete timesteps in the diffusion chain. This is now a list of integers. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped @@ -513,7 +513,7 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): coincide with the one provided as input and `use_clipped_model_output` will have no effect. Returns: - `torch.FloatTensor`: sample tensor at previous timestep. + `torch.Tensor`: sample tensor at previous timestep.
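`_threshold_sample`, copied into many of these schedulers, implements Imagen-style dynamic thresholding (Saharia et al., 2022). A hedged paraphrase of the docstring's description (per-sample percentile clamp, then rescale), not the exact library code:

```python
# Hedged paraphrase of the dynamic thresholding described in `_threshold_sample`.
import torch


def dynamic_threshold(pred_x0: torch.Tensor, ratio: float = 0.995) -> torch.Tensor:
    b = pred_x0.shape[0]
    flat = pred_x0.reshape(b, -1).abs()
    s = torch.quantile(flat, ratio, dim=1)         # per-sample percentile value
    s = s.clamp(min=1.0)                           # only act when s > 1
    s = s.view(b, *([1] * (pred_x0.ndim - 1)))
    return pred_x0.clamp(-s, s) / s                # threshold to [-s, s], rescale


x0 = torch.randn(2, 4, 8, 8) * 3.0
print(dynamic_threshold(x0).abs().max() <= 1.0)
```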
Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None def betas_for_alpha_bar( @@ -96,11 +96,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -231,19 +231,19 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): self.variance_type = variance_type - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -363,7 +363,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): return variance - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -398,9 +398,9 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMSchedulerOutput, Tuple]: @@ -409,11 +409,11 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. 
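The DDPM `step` documented above performs one ancestral-sampling update. A sketch of the posterior mean and variance it is built on (Ho et al., 2020, Eq. 7), with toy schedule scalars:

```python
# Hedged sketch of one ancestral DDPM update; schedule values are toy numbers
# and `eps` stands in for an epsilon-prediction model_output.
import torch

beta_t = 0.02
alpha_t = 1 - beta_t
alpha_bar_t, alpha_bar_prev = 0.5, 0.5 / alpha_t
x_t = torch.randn(1, 4, 8, 8)
eps = torch.randn_like(x_t)                       # stand-in for model_output

pred_x0 = (x_t - (1 - alpha_bar_t) ** 0.5 * eps) / alpha_bar_t**0.5
mean = (
    (alpha_bar_prev**0.5 * beta_t / (1 - alpha_bar_t)) * pred_x0
    + (alpha_t**0.5 * (1 - alpha_bar_prev) / (1 - alpha_bar_t)) * x_t
)
var = beta_t * (1 - alpha_bar_prev) / (1 - alpha_bar_t)
x_prev = mean + var**0.5 * torch.randn_like(x_t)  # noise is skipped at t == 0
```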
@@ -498,10 +498,10 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -522,9 +522,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 30fbad29f1..583d8ba8b4 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -34,16 +34,16 @@ class DDPMParallelSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -240,19 +240,19 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): self.variance_type = variance_type # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. 
Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -375,7 +375,7 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -410,9 +410,9 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMParallelSchedulerOutput, Tuple]: @@ -421,9 +421,9 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. generator: random number generator. return_dict (`bool`): option for returning tuple rather than DDPMParallelSchedulerOutput class @@ -506,10 +506,10 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): def batch_step_no_noise( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timesteps: List[int], - sample: torch.FloatTensor, - ) -> torch.FloatTensor: + sample: torch.Tensor, + ) -> torch.Tensor: """ Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise @@ -519,14 +519,14 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timesteps (`List[int]`): current discrete timesteps in the diffusion chain. This is now a list of integers. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. Returns: - `torch.FloatTensor`: sample tensor at previous timestep. + `torch.Tensor`: sample tensor at previous timestep. 
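`batch_step_no_noise` above accepts one timestep per sample. A sketch of the gather-and-broadcast pattern such a batched step relies on; the schedule, shapes, and x_0 formula here are illustrative, not the library's code:

```python
# Hedged sketch of batched per-sample timestep lookup for parallel sampling.
import torch

alphas_cumprod = torch.linspace(0.9999, 0.01, 1000)  # toy schedule
samples = torch.randn(3, 4, 8, 8)
eps = torch.randn_like(samples)
timesteps = torch.tensor([900, 500, 100])            # one timestep per sample

a_bar = alphas_cumprod[timesteps]                    # gather, shape (3,)
a_bar = a_bar.view(-1, 1, 1, 1)                      # broadcast over C, H, W

# each sample gets its own x_0 estimate at its own noise level
pred_x0 = (samples - (1 - a_bar).sqrt() * eps) / a_bar.sqrt()
print(pred_x0.shape)
```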
""" t = timesteps num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps @@ -587,10 +587,10 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -612,9 +612,7 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 6a0f4f5efe..71b5669b05 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -33,12 +33,12 @@ class DDPMWuerstchenSchedulerOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor def betas_for_alpha_bar( @@ -125,17 +125,17 @@ class DDPMWuerstchenScheduler(SchedulerMixin, ConfigMixin): ) ** 2 / self._init_alpha_cumprod.to(device) return alpha_cumprod.clamp(0.0001, 0.9999) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): input sample + sample (`torch.Tensor`): input sample timestep (`int`, optional): current timestep Returns: - `torch.FloatTensor`: scaled input sample + `torch.Tensor`: scaled input sample """ return sample @@ -163,9 +163,9 @@ class DDPMWuerstchenScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMWuerstchenSchedulerOutput, Tuple]: @@ -174,9 +174,9 @@ class DDPMWuerstchenScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. 
timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. generator: random number generator. return_dict (`bool`): option for returning tuple rather than DDPMWuerstchenSchedulerOutput class @@ -209,10 +209,10 @@ class DDPMWuerstchenScheduler(SchedulerMixin, ConfigMixin): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: device = original_samples.device dtype = original_samples.dtype alpha_cumprod = self._alpha_cumprod(timesteps, device=device).view( diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index f25fbf029b..ea34d1bd2c 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -276,7 +276,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -341,7 +341,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -368,24 +368,24 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DEIS algorithm needs. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -425,26 +425,26 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): def deis_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DEIS (equivalent to DDIM). 
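`convert_model_output` above normalizes whatever the network predicts into the quantity the solver integrates. A sketch for an epsilon-prediction model with toy `alpha_t` and `sigma_t`; the other prediction types are noted in comments:

```python
# Hedged sketch of the epsilon -> x_0 conversion behind `convert_model_output`;
# alpha_t and sigma_t are toy values with alpha_t**2 + sigma_t**2 == 1.
import torch

alpha_t = torch.tensor(0.8)
sigma_t = (1 - alpha_t**2).sqrt()
sample = torch.randn(1, 4, 8, 8)
eps_pred = torch.randn_like(sample)                # model_output, prediction_type="epsilon"

x0_pred = (sample - sigma_t * eps_pred) / alpha_t  # converted model output
# for prediction_type="sample" the output is already x_0; for "v_prediction"
# the conversion would instead be alpha_t * sample - sigma_t * v.
```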
Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -483,22 +483,22 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): def multistep_deis_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DEIS. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -552,22 +552,22 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): def multistep_deis_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DEIS. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -673,9 +673,9 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -683,11 +683,11 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): the multistep DEIS. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -736,17 +736,17 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
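The second- and third-order updates above consume a `model_output_list`, a short history of converted outputs. A structural sketch of that rolling buffer (bookkeeping only; the solver coefficients themselves are omitted):

```python
# Hedged sketch of the model-output history a multistep solver maintains.
import torch

solver_order = 2
model_outputs = [None] * solver_order


def on_new_output(x0_pred: torch.Tensor) -> None:
    for i in range(solver_order - 1):
        model_outputs[i] = model_outputs[i + 1]  # shift the history window
    model_outputs[-1] = x0_pred


for step in range(4):
    on_new_output(torch.randn(1, 4, 8, 8))
    # warm-up: fall back to first order until the buffer is full
    order = 1 if model_outputs[0] is None else solver_order
    print(f"step {step}: can use order {order}")
```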
Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -754,10 +754,10 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index cd54148b43..0b370e689d 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -78,11 +78,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -408,7 +408,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -472,7 +472,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -497,7 +497,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas - def _convert_to_lu(self, in_lambdas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Lu et al. (2022).""" lambda_min: float = in_lambdas[-1].item() @@ -512,11 +512,11 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. 
DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -530,13 +530,13 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -611,23 +611,23 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -680,23 +680,23 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -803,22 +803,22 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. 
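The first-order update documented above has a closed form in log-SNR (lambda) space. A hedged sketch following Lu et al. (2022); the schedule numbers are toy values and the formula comes from the paper, not from the file:

```python
# Hedged sketch of the first-order DPM-Solver++ update in log-SNR space.
import torch

alpha_s, sigma_s = 0.70, (1 - 0.70**2) ** 0.5      # current step
alpha_t, sigma_t = 0.80, (1 - 0.80**2) ** 0.5      # target (less noisy) step
lam = lambda a, s: torch.log(torch.tensor(a / s))  # log-SNR of a step
h = lam(alpha_t, sigma_t) - lam(alpha_s, sigma_s)  # log-SNR increment

sample = torch.randn(1, 4, 8, 8)
x0_pred = torch.randn_like(sample)                 # converted model output

x_t = (sigma_t / sigma_s) * sample - alpha_t * (torch.exp(-h) - 1.0) * x0_pred
```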
""" @@ -919,11 +919,11 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -931,15 +931,15 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`LEdits++`]. return_dict (`bool`): @@ -1006,27 +1006,27 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 428eaea6a6..9b2cd03bdb 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -295,7 +295,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -360,7 +360,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def 
_convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -388,11 +388,11 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -406,13 +406,13 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -488,23 +488,23 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -558,23 +558,23 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. 
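In the default "dpmsolver++" data-prediction branch, the first-order update annotated above reduces to a one-line formula; a self-contained sketch with made-up schedule values (the alpha/sigma pairs are assumptions chosen so that alpha**2 + sigma**2 = 1):

import torch

alpha_s, alpha_t = 0.8, 0.9             # current / previous-step signal scales (assumed)
sigma_s = (1 - alpha_s**2) ** 0.5
sigma_t = (1 - alpha_t**2) ** 0.5
# h is the step in log-SNR space, lambda = log(alpha / sigma)
h = torch.log(torch.tensor(alpha_t / sigma_t)) - torch.log(torch.tensor(alpha_s / sigma_s))

sample = torch.randn(1, 4, 8, 8)         # x at the current step
model_output = torch.randn_like(sample)  # x0-prediction, as produced by convert_model_output
x_t = (sigma_t / sigma_s) * sample - alpha_t * torch.expm1(-h) * model_output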
""" timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -682,22 +682,22 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -786,11 +786,11 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -798,15 +798,15 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`): @@ -867,27 +867,27 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): return SchedulerOutput(prev_sample=prev_sample) # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 54455b0f2e..a9dbb69b04 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -257,21 +257,21 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -395,7 +395,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin): return t # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min: float = in_sigmas[-1].item() @@ -414,9 +414,9 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, s_noise: float = 1.0, ) -> Union[SchedulerOutput, Tuple]: @@ -425,11 +425,11 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor` or `np.ndarray`): + model_output (`torch.Tensor` or `np.ndarray`): The direct output from learned diffusion model. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor` or `np.ndarray`): + sample (`torch.Tensor` or `np.ndarray`): A current instance of a sample created by the diffusion process. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. 
@@ -450,10 +450,10 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin): self.noise_sampler = BrownianTreeNoiseSampler(sample, min_sigma, max_sigma, self.noise_sampler_seed) # Define functions to compute sigma and t from each other - def sigma_fn(_t: torch.FloatTensor) -> torch.FloatTensor: + def sigma_fn(_t: torch.Tensor) -> torch.Tensor: return _t.neg().exp() - def t_fn(_sigma: torch.FloatTensor) -> torch.FloatTensor: + def t_fn(_sigma: torch.Tensor) -> torch.Tensor: return _sigma.log().neg() if self.state_in_first_order: @@ -526,10 +526,10 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 25118f4329..a274dbd6bc 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -361,7 +361,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -426,7 +426,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -453,11 +453,11 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -471,13 +471,13 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. 
- sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -542,26 +542,26 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -598,27 +598,27 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): def singlestep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the time `timestep_list[-2]`. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -692,27 +692,27 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): def singlestep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the time `timestep_list[-3]`. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. 
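For comparison with the multistep variant above, a minimal sketch of driving the singlestep scheduler (the solver order, step count, and shapes are illustrative assumptions):

import torch
from diffusers import DPMSolverSinglestepScheduler

scheduler = DPMSolverSinglestepScheduler(solver_order=2)
scheduler.set_timesteps(num_inference_steps=8)

sample = torch.randn(1, 4, 64, 64)
for t in scheduler.timesteps:
    model_output = torch.randn_like(sample)  # stand-in for the model's prediction
    sample = scheduler.step(model_output, t, sample).prev_sample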
""" @@ -796,29 +796,29 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): def singlestep_dpm_solver_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the singlestep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. order (`int`): The solver order at this step. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -891,9 +891,9 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -901,11 +901,11 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): the singlestep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -950,17 +950,17 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample @@ -968,10 +968,10 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index dfc7978a2e..ea2165c768 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -206,21 +206,19 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): return denoised # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.scale_model_input - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -276,7 +274,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas - def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min = sigma_min or self.config.sigma_min sigma_max = sigma_max or self.config.sigma_max @@ -289,7 +287,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): return sigmas # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas - def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Implementation closely follows k-diffusion. 
https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26 @@ -300,7 +298,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): return sigmas # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -365,9 +363,9 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def convert_model_output( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor = None, - ) -> torch.FloatTensor: + model_output: torch.Tensor, + sample: torch.Tensor = None, + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -381,13 +379,13 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ sigma = self.sigmas[self.step_index] @@ -400,21 +398,21 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + model_output: torch.Tensor, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] @@ -438,21 +436,21 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + model_output_list: List[torch.Tensor], + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. 
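The `_compute_karras_sigmas`/`_convert_to_karras` helpers annotated in these hunks implement the rho-schedule from Karras et al. (2022); a standalone sketch of the computation (sigma_min/sigma_max follow the EDM-style defaults, assumed here for illustration):

import torch

rho, n = 7.0, 10                    # rho = 7.0 as in the paper and these helpers
sigma_min, sigma_max = 0.002, 80.0  # assumed EDM-style noise bounds
ramp = torch.linspace(0, 1, n)
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho  # densest near sigma_min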
""" sigma_t, sigma_s0, sigma_s1 = ( @@ -509,20 +507,20 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], - sample: torch.FloatTensor = None, - ) -> torch.FloatTensor: + model_output_list: List[torch.Tensor], + sample: torch.Tensor = None, + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( @@ -596,9 +594,9 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -607,11 +605,11 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -675,10 +673,10 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index 0ef9263c9e..b37e6e0fd7 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -35,16 +35,16 @@ class EDMEulerSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. 
""" - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None class EDMEulerScheduler(SchedulerMixin, ConfigMixin): @@ -174,21 +174,19 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin): return denoised - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -227,7 +225,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin): self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 - def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min = sigma_min or self.config.sigma_min sigma_max = sigma_max or self.config.sigma_max @@ -239,7 +237,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin): return sigmas - def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Implementation closely follows k-diffusion. https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26 @@ -275,9 +273,9 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, s_churn: float = 0.0, s_tmin: float = 0.0, s_tmax: float = float("inf"), @@ -290,11 +288,11 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. 
s_churn (`float`): s_tmin (`float`): @@ -375,10 +373,10 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 6631ef63a8..085683e56f 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -35,16 +35,16 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -250,21 +250,19 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
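Taken together, `init_noise_sigma`, `scale_model_input`, and `step` are used like this for the Euler-family schedulers; a minimal sketch with a seeded generator so the ancestral noise drawn inside `step` is reproducible (the shapes and the dummy model output are assumptions):

import torch
from diffusers import EulerAncestralDiscreteScheduler

scheduler = EulerAncestralDiscreteScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(num_inference_steps=30)
generator = torch.Generator().manual_seed(0)

sample = torch.randn(1, 4, 64, 64, generator=generator) * scheduler.init_noise_sigma
for t in scheduler.timesteps:
    model_input = scheduler.scale_model_input(sample, t)  # divides by (sigma**2 + 1) ** 0.5
    model_output = torch.randn_like(sample)               # stand-in for UNet(model_input, t)
    sample = scheduler.step(model_output, t, sample, generator=generator).prev_sample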
""" @@ -346,9 +344,9 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]: @@ -357,11 +355,11 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -450,10 +448,10 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 476940d0e4..5f9db844ff 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -35,16 +35,16 @@ class EulerDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -274,21 +274,19 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -445,7 +443,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): return t # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -494,9 +492,9 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, s_churn: float = 0.0, s_tmin: float = 0.0, s_tmax: float = float("inf"), @@ -509,11 +507,11 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. 
s_churn (`float`): s_tmin (`float`): @@ -606,10 +604,10 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): @@ -637,9 +635,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): noisy_samples = original_samples + noise * sigma return noisy_samples - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor: if ( isinstance(timesteps, int) or isinstance(timesteps, torch.IntTensor) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 34b2bd1160..4ce70dffec 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -198,21 +198,21 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -329,7 +329,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -369,9 +369,9 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -379,11 +379,11 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. 
return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -469,10 +469,10 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_ipndm.py b/src/diffusers/schedulers/scheduling_ipndm.py index afc8fd940e..9f15f1fe0a 100644 --- a/src/diffusers/schedulers/scheduling_ipndm.py +++ b/src/diffusers/schedulers/scheduling_ipndm.py @@ -137,9 +137,9 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -147,11 +147,11 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin): the linear multistep method. It performs one forward pass multiple times to approximate the solution. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -193,17 +193,17 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin): return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index fd2b94759f..2ab686fe63 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -175,21 +175,21 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
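Worth noting across these hunks: the sigma-based `add_noise` variants (Euler/Heun/KDPM2) expect `timesteps` drawn from the schedule created by `set_timesteps`, unlike the DDPM-lineage variants that index `alphas_cumprod` with raw integer training timesteps. A hedged sketch of the sigma-based convention:

import torch
from diffusers import HeunDiscreteScheduler

scheduler = HeunDiscreteScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(num_inference_steps=25)

clean = torch.randn(1, 4, 64, 64)
noise = torch.randn_like(clean)
t = scheduler.timesteps[:1]                   # must come from the set schedule here
noisy = scheduler.add_noise(clean, noise, t)  # i.e. clean + sigma(t) * noise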
""" if self.step_index is None: @@ -321,7 +321,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -376,9 +376,9 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -387,11 +387,11 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -477,10 +477,10 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 57a1af6b79..0672db3147 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -175,21 +175,21 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" if self.step_index is None: @@ -334,7 +334,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -361,9 +361,9 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -371,11 +371,11 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -452,10 +452,10 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_karras_ve_flax.py b/src/diffusers/schedulers/scheduling_karras_ve_flax.py index 4d099604a9..0d387b53ac 100644 --- a/src/diffusers/schedulers/scheduling_karras_ve_flax.py +++ b/src/diffusers/schedulers/scheduling_karras_ve_flax.py @@ -176,10 +176,10 @@ class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin): Args: state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. - model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model. + model_output (`torch.Tensor` or `np.ndarray`): direct output from learned diffusion model. sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor` or `np.ndarray`): TODO + sample_hat (`torch.Tensor` or `np.ndarray`): TODO return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class Returns: @@ -213,12 +213,12 @@ class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin): Args: state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. - model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model. + model_output (`torch.Tensor` or `np.ndarray`): direct output from learned diffusion model. 
sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor` or `np.ndarray`): TODO - sample_prev (`torch.FloatTensor` or `np.ndarray`): TODO - derivative (`torch.FloatTensor` or `np.ndarray`): TODO + sample_hat (`torch.Tensor` or `np.ndarray`): TODO + sample_prev (`torch.Tensor` or `np.ndarray`): TODO + derivative (`torch.Tensor` or `np.ndarray`): TODO return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class Returns: diff --git a/src/diffusers/schedulers/scheduling_lcm.py b/src/diffusers/schedulers/scheduling_lcm.py index f15fe0adf2..e73ce3e420 100644 --- a/src/diffusers/schedulers/scheduling_lcm.py +++ b/src/diffusers/schedulers/scheduling_lcm.py @@ -37,16 +37,16 @@ class LCMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample `(x_{0})` based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. + denoised (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `denoised` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -95,17 +95,17 @@ def betas_for_alpha_bar( # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr -def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: +def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor: """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -296,24 +296,24 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): """ self._begin_index = begin_index - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample.
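A minimal sketch of the few-step LCM loop these hunks annotate, showing both output fields (`prev_sample` to continue the loop, `denoised` as the clean-sample preview); the dummy model output is an assumption standing in for a real consistency model:

import torch
from diffusers import LCMScheduler

scheduler = LCMScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(num_inference_steps=4)  # LCM targets very few steps

sample = torch.randn(1, 4, 64, 64)
for t in scheduler.timesteps:
    model_output = torch.randn_like(sample)  # stand-in for the consistency model's prediction
    out = scheduler.step(model_output, t, sample)
    sample = out.prev_sample
    preview = out.denoised  # predicted clean sample at this step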
""" return sample # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -497,9 +497,9 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: @@ -508,11 +508,11 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -594,10 +594,10 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -619,9 +619,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 61a91783c2..272e9b8572 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -32,16 +32,16 @@ class LMSDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. 
- pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -202,21 +202,19 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ @@ -351,7 +349,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): return t # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min: float = in_sigmas[-1].item() @@ -366,9 +364,9 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, order: int = 4, return_dict: bool = True, ) -> Union[LMSDiscreteSchedulerOutput, Tuple]: @@ -377,11 +375,11 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`, defaults to 4): The order of the linear multistep method. 
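The sigma-space helpers retyped in the KDPM2 and LMS hunks above are compact enough to illustrate. Below is a minimal standalone sketch, not the library implementation: the Karras et al. (2022) rho-interpolated sigma schedule that `_convert_to_karras` constructs, and the `1 / sqrt(sigma^2 + 1)` input scaling that the discrete sigma-based schedulers apply in `scale_model_input`; the `rho = 7.0` default is an assumption carried over from the paper.

```python
import torch

def convert_to_karras(in_sigmas: torch.Tensor, num_inference_steps: int, rho: float = 7.0) -> torch.Tensor:
    # Interpolate linearly in sigma**(1/rho) space, then raise back to the
    # rho-th power (Karras et al. 2022). rho = 7.0 is the paper's default.
    sigma_min: float = in_sigmas[-1].item()
    sigma_max: float = in_sigmas[0].item()
    ramp = torch.linspace(0, 1, num_inference_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

def scale_model_input(sample: torch.Tensor, sigma: float) -> torch.Tensor:
    # Divide by sqrt(sigma**2 + 1) so the denoiser sees inputs at the
    # variance it was trained on; this is the interchangeability the
    # docstrings above refer to.
    return sample / ((sigma**2 + 1) ** 0.5)
```

Schedulers that do not need this preconditioning leave `scale_model_input` as the identity, which is why several hunks in this patch show it simply returning `sample`.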
@@ -444,10 +442,10 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index e7c861ea38..baf29d12c0 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -225,9 +225,9 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -236,11 +236,11 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): or [`~PNDMScheduler.step_plms`] depending on the internal variable `counter`. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -258,9 +258,9 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): def step_prk( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -269,11 +269,11 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): equation. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -318,9 +318,9 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): def step_plms( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -328,11 +328,11 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): the linear multistep method. It performs one forward pass multiple times to approximate the solution. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. 
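The `step` docstring in the PNDM hunks above says the call is routed to `step_prk` or `step_plms` "depending on the internal variable `counter`". A rough standalone sketch of that dispatch, with `counter`, `prk_timesteps`, and `config.skip_prk_steps` assumed from the surrounding code rather than quoted from it:

```python
import torch

def pndm_dispatch(scheduler, model_output: torch.Tensor, timestep: int,
                  sample: torch.Tensor, return_dict: bool = True):
    # Warm up with Runge-Kutta (PRK) steps until enough model outputs have
    # been accumulated, then switch to the cheaper linear multistep (PLMS)
    # update that reuses the stored history.
    if scheduler.counter < len(scheduler.prk_timesteps) and not scheduler.config.skip_prk_steps:
        return scheduler.step_prk(model_output=model_output, timestep=timestep,
                                  sample=sample, return_dict=return_dict)
    return scheduler.step_plms(model_output=model_output, timestep=timestep,
                               sample=sample, return_dict=return_dict)
```

The warm-up matters because PLMS is a multistep method: it needs a history of previous model outputs before its linear combination is well-defined.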
@@ -387,17 +387,17 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -448,10 +448,10 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index ccd3d27431..72175e7e02 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -31,16 +31,16 @@ class RePaintSchedulerOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample (x_{0}) based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: torch.FloatTensor + prev_sample: torch.Tensor + pred_original_sample: torch.Tensor # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -160,19 +160,19 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin): self.eta = eta - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample @@ -245,11 +245,11 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, - original_image: torch.FloatTensor, - mask: torch.FloatTensor, + sample: torch.Tensor, + original_image: torch.Tensor, + mask: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[RePaintSchedulerOutput, Tuple]: @@ -258,15 +258,15 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. - original_image (`torch.FloatTensor`): + original_image (`torch.Tensor`): The original image to inpaint on. - mask (`torch.FloatTensor`): + mask (`torch.Tensor`): The mask where a value of 0.0 indicates which part of the original image to inpaint. generator (`torch.Generator`, *optional*): A random number generator. @@ -351,10 +351,10 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.") def __len__(self): diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py index b8d95c609b..94243aff14 100644 --- a/src/diffusers/schedulers/scheduling_sasolver.py +++ b/src/diffusers/schedulers/scheduling_sasolver.py @@ -305,7 +305,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -370,7 +370,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -397,11 +397,11 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. 
Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is @@ -415,13 +415,13 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -686,29 +686,29 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): def stochastic_adams_bashforth_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor, - noise: torch.FloatTensor, + sample: torch.Tensor, + noise: torch.Tensor, order: int, - tau: torch.FloatTensor, + tau: torch.Tensor, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the SA-Predictor. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model at the current timestep. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`): The order of SA-Predictor at this timestep. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) @@ -813,32 +813,32 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): def stochastic_adams_moulton_update( self, - this_model_output: torch.FloatTensor, + this_model_output: torch.Tensor, *args, - last_sample: torch.FloatTensor, - last_noise: torch.FloatTensor, - this_sample: torch.FloatTensor, + last_sample: torch.Tensor, + last_noise: torch.Tensor, + this_sample: torch.Tensor, order: int, - tau: torch.FloatTensor, + tau: torch.Tensor, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the SA-Corrector. Args: - this_model_output (`torch.FloatTensor`): + this_model_output (`torch.Tensor`): The model outputs at `x_t`. this_timestep (`int`): The current timestep `t`. - last_sample (`torch.FloatTensor`): + last_sample (`torch.Tensor`): The generated sample before the last predictor `x_{t-1}`. - this_sample (`torch.FloatTensor`): + this_sample (`torch.Tensor`): The generated sample after the last predictor `x_{t}`. order (`int`): The order of SA-Corrector at this step. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The corrected sample tensor at the current timestep. """ @@ -979,9 +979,9 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -990,11 +990,11 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): the SA-Solver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. 
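Many of the `add_noise` methods retyped throughout this patch are marked `# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise`, and they all reduce to the forward-process reparameterization below. This is a simplified sketch; the library versions additionally handle device and dtype placement, as the comments in the hunks note.

```python
import torch

def add_noise(alphas_cumprod: torch.Tensor, original_samples: torch.Tensor,
              noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
    # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
    sqrt_one_minus_alpha_prod = (1.0 - alphas_cumprod[timesteps]) ** 0.5
    # Unsqueeze until the per-timestep scalars broadcast over sample dims.
    while sqrt_alpha_prod.ndim < original_samples.ndim:
        sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
    return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
```

The Euler-copied variants (KDPM2, LMS) do the analogous thing in sigma space, `noisy = original + noise * sigma_t`, which is why their `timesteps` argument stays a plain `torch.Tensor` in these hunks rather than `torch.IntTensor`.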
@@ -1079,17 +1079,17 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -1097,10 +1097,10 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index 8f8dd18773..cedfbf7d6a 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -32,15 +32,15 @@ class SdeVeOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - prev_sample_mean (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample_mean (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Mean averaged `prev_sample` over previous timesteps. """ - prev_sample: torch.FloatTensor - prev_sample_mean: torch.FloatTensor + prev_sample: torch.Tensor + prev_sample_mean: torch.Tensor class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): @@ -86,19 +86,19 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -159,9 +159,9 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): def step_pred( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SdeVeOutput, Tuple]: @@ -170,11 +170,11 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). 
Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -227,8 +227,8 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): def step_correct( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor, + model_output: torch.Tensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -237,9 +237,9 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): making the prediction for the previous timestep. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -282,10 +282,10 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples timesteps = timesteps.to(original_samples.device) sigmas = self.discrete_sigmas.to(original_samples.device)[timesteps] diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 0216b7afc8..09aaa2531a 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -37,15 +37,15 @@ class TCDSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_noised_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_noised_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted noised sample `(x_{s})` based on the model output from the current timestep. """ - prev_sample: torch.FloatTensor - pred_noised_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_noised_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -94,17 +94,17 @@ def betas_for_alpha_bar( # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr -def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: +def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor: """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -297,19 +297,19 @@ class TCDScheduler(SchedulerMixin, ConfigMixin): """ self._begin_index = begin_index - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -326,7 +326,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -524,9 +524,9 @@ class TCDScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.3, generator: Optional[torch.Generator] = None, return_dict: bool = True, @@ -536,11 +536,11 @@ class TCDScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. 
eta (`float`): A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every @@ -631,10 +631,10 @@ class TCDScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -656,9 +656,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin): return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index c99e97cd85..6e1580290f 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -32,16 +32,16 @@ class UnCLIPSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -146,17 +146,17 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): self.variance_type = variance_type - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
Args: - sample (`torch.FloatTensor`): input sample + sample (`torch.Tensor`): input sample timestep (`int`, optional): current timestep Returns: - `torch.FloatTensor`: scaled input sample + `torch.Tensor`: scaled input sample """ return sample @@ -215,9 +215,9 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, prev_timestep: Optional[int] = None, generator=None, return_dict: bool = True, @@ -227,9 +227,9 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. prev_timestep (`int`, *optional*): The previous timestep to predict the previous sample at. Used to dynamically compute beta. If not given, `t-1` is used and the pre-computed beta is used. @@ -327,10 +327,10 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 74e97a33f1..4517ee86e9 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -78,11 +78,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -360,7 +360,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -425,7 +425,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -452,24 +452,24 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Convert the model output to the corresponding type the UniPC algorithm needs. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -522,27 +522,27 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): def multistep_uni_p_bh_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model at the current timestep. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`): The order of UniP at this timestep (corresponds to the *p* in UniPC-p). Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) @@ -651,30 +651,30 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): def multistep_uni_c_bh_update( self, - this_model_output: torch.FloatTensor, + this_model_output: torch.Tensor, *args, - last_sample: torch.FloatTensor = None, - this_sample: torch.FloatTensor = None, + last_sample: torch.Tensor = None, + this_sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the UniC (B(h) version). Args: - this_model_output (`torch.FloatTensor`): + this_model_output (`torch.Tensor`): The model outputs at `x_t`. this_timestep (`int`): The current timestep `t`. - last_sample (`torch.FloatTensor`): + last_sample (`torch.Tensor`): The generated sample before the last predictor `x_{t-1}`. - this_sample (`torch.FloatTensor`): + this_sample (`torch.Tensor`): The generated sample after the last predictor `x_{t}`. order (`int`): The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The corrected sample tensor at the current timestep. 
""" this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None) @@ -821,9 +821,9 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -831,11 +831,11 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): the multistep UniPC. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -900,17 +900,17 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -918,10 +918,10 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py index 6faf9ee38c..33d34e26d8 100644 --- a/src/diffusers/schedulers/scheduling_utils.py +++ b/src/diffusers/schedulers/scheduling_utils.py @@ -63,12 +63,12 @@ class SchedulerOutput(BaseOutput): Base class for the output of a scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. 
""" - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class SchedulerMixin(PushToHubMixin): diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 03ba95cad6..bd8d255fa9 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ -38,7 +38,7 @@ class VQDiffusionSchedulerOutput(BaseOutput): prev_sample: torch.LongTensor -def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTensor: +def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.Tensor: """ Convert batch of vector of class indices into batch of log onehot vectors @@ -50,7 +50,7 @@ def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTen number of classes to be used for the onehot vectors Returns: - `torch.FloatTensor` of shape `(batch size, num classes, vector length)`: + `torch.Tensor` of shape `(batch size, num classes, vector length)`: Log onehot vectors """ x_onehot = F.one_hot(x, num_classes) @@ -59,7 +59,7 @@ def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTen return log_x -def gumbel_noised(logits: torch.FloatTensor, generator: Optional[torch.Generator]) -> torch.FloatTensor: +def gumbel_noised(logits: torch.Tensor, generator: Optional[torch.Generator]) -> torch.Tensor: """ Apply gumbel noise to `logits` """ @@ -199,7 +199,7 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: torch.long, sample: torch.LongTensor, generator: Optional[torch.Generator] = None, @@ -210,7 +210,7 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): [`~VQDiffusionScheduler.q_posterior`] for more details about how the distribution is computer. Args: - log_p_x_0: (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`): + log_p_x_0: (`torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`): The log probabilities for the predicted classes of the initial latent pixels. Does not include a prediction for the masked class as the initial unnoised image cannot be masked. t (`torch.long`): @@ -251,7 +251,7 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): ``` Args: - log_p_x_0 (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`): + log_p_x_0 (`torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`): The log probabilities for the predicted classes of the initial latent pixels. Does not include a prediction for the masked class as the initial unnoised image cannot be masked. x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): @@ -260,7 +260,7 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): The timestep that determines which transition matrix is used. Returns: - `torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`: + `torch.Tensor` of shape `(batch size, num classes, num latent pixels)`: The log probabilities for the predicted classes of the image at timestep `t-1`. 
""" log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed) @@ -354,7 +354,7 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): return log_p_x_t_min_1 def log_Q_t_transitioning_to_known_class( - self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.FloatTensor, cumulative: bool + self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.Tensor, cumulative: bool ): """ Calculates the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each @@ -365,14 +365,14 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): The timestep that determines which transition matrix is used. x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): The classes of each latent pixel at time `t`. - log_onehot_x_t (`torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`): + log_onehot_x_t (`torch.Tensor` of shape `(batch size, num classes, num latent pixels)`): The log one-hot vectors of `x_t`. cumulative (`bool`): If cumulative is `False`, the single step transition matrix `t-1`->`t` is used. If cumulative is `True`, the cumulative transition matrix `0`->`t` is used. Returns: - `torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`: + `torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`: Each _column_ of the returned matrix is a _row_ of log probabilities of the complete probability transition matrix. diff --git a/tests/others/test_check_copies.py b/tests/others/test_check_copies.py index 6e1c8fcfa5..5835712343 100644 --- a/tests/others/test_check_copies.py +++ b/tests/others/test_check_copies.py @@ -32,16 +32,16 @@ REFERENCE_CODE = """ \""" Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. 
\""" - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None """ diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index a0fc8c5ef0..5cb22bd3c8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -1041,7 +1041,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase): def test_stable_diffusion_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 6353411834..d691565248 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -472,7 +472,7 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase): def test_stable_diffusion_img2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index a85ea9c260..5da8669215 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -353,7 +353,7 @@ class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase): def test_stable_diffusion_pix2pix_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 63e1cb30e2..494036482a 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -416,7 +416,7 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase): def test_stable_diffusion_text2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 97dc88cd3d..c260d565a8 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -461,7 +461,7 @@ class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase): def 
test_stable_diffusion_depth2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index d1efd8202b..c3b1b9b854 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -475,7 +475,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): def test_stable_diffusion_text2img_intermediate_state_v_pred(self): number_of_steps = 0 - def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def test_callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: test_callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py index 99cd8b2e7d..6ddb562aac 100644 --- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py @@ -213,7 +213,7 @@ class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase): def test_stable_diffusion_img_variation_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py index 35a6545420..4e36dab5ac 100644 --- a/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py @@ -349,7 +349,7 @@ class StableDiffusionPanoramaNightlyTests(unittest.TestCase): def test_stable_diffusion_panorama_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1
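The test hunks above retype the `latents` parameter of the intermediate-state callbacks. For context, a minimal usage sketch against the legacy `callback`/`callback_steps` pipeline interface these tests exercise; the checkpoint id and prompt are illustrative only:

```python
import torch
from diffusers import StableDiffusionPipeline

def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
    # After this patch the hint is torch.Tensor; the runtime object is unchanged.
    print(f"step={step} timestep={timestep} latents={tuple(latents.shape)}")

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe("an astronaut riding a horse", callback=callback_fn, callback_steps=1)
```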