mirror of
https://github.com/huggingface/diffusers.git
synced 2026-01-27 17:22:53 +03:00
* Add AudioLDM
* up
* add vocoder
* start unet
* unconditional unet
* clap, vocoder and vae
* clean-up: conversion scripts
* fix: conversion script token_type_ids
* clean-up: pipeline docstring
* tests: from SD
* clean-up: cpu offload vocoder instead of safety checker
* feat: adapt tests to audioldm
* feat: add docs
* clean-up: amend pipeline docstrings
* clean-up: make style
* clean-up: make fix-copies
* fix: add doc path to toctree
* clean-up: args for conversion script
* clean-up: paths to checkpoints
* fix: use conditional unet
* clean-up: make style
* fix: type hints for UNet
* clean-up: docstring for UNet
* clean-up: make style
* clean-up: remove duplicate in docstring
* clean-up: make style
* clean-up: make fix-copies
* clean-up: move imports to start in code snippet
* fix: pass cross_attention_dim as a list/tuple to unet
* clean-up: make fix-copies
* fix: update checkpoint path
* fix: unet cross_attention_dim in tests
* film embeddings -> class embeddings
* Apply suggestions from code review
Co-authored-by: Will Berman <wlbberman@gmail.com>
* fix: unet film embed to use existing args
* fix: unet tests to use existing args
* fix: make style
* fix: transformers import and version in init
* clean-up: make style
* Revert "clean-up: make style"
This reverts commit 5d6d1f8b32.
* clean-up: make style
* clean-up: use pipeline tester mixin tests where poss
* clean-up: skip attn slicing test
* fix: add torch dtype to docs
* fix: remove conversion script out of src
* fix: remove .detach from 1d waveform
* fix: reduce default num inf steps
* fix: swap height/width -> audio_length_in_s
* clean-up: make style
* fix: remove nightly tests
* fix: imports in conversion script
* clean-up: slim-down to two slow tests
* clean-up: slim-down fast tests
* fix: batch consistent tests
* clean-up: make style
* clean-up: remove vae slicing fast test
* clean-up: propagate changes to doc
* fix: increase test tol to 1e-2
* clean-up: finish docs
* clean-up: make style
* feat: vocoder / VAE compatibility check
* feat: possibly expand / cut audio waveform
* fix: pipeline call signature test
* fix: slow tests output len
* clean-up: make style
* make style
---------
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: William Berman <WLBberman@gmail.com>
122 lines · 3.1 KiB · Python
# Canonical sets of call-signature parameters for the different pipeline types.
#
# These are canonical sets of parameters for different types of pipelines.
# They are set on subclasses of `PipelineTesterMixin` as `params` and
# `batch_params`.
#
# If a pipeline's set of arguments has minor changes from one of the common sets
# of arguments, do not make modifications to the existing common sets of arguments.
# I.e. a text to image pipeline with non-configurable height and width arguments
# should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`.

# Arguments accepted by a standard text-to-image pipeline's __call__.
TEXT_TO_IMAGE_PARAMS = frozenset(
    [
        "prompt",
        "height",
        "width",
        "guidance_scale",
        "negative_prompt",
        "prompt_embeds",
        "negative_prompt_embeds",
        "cross_attention_kwargs",
    ]
)

# Arguments that are batched (one entry per sample) for text-to-image.
TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])

# Image-variation pipelines take an input image instead of a text prompt.
IMAGE_VARIATION_PARAMS = frozenset(
    [
        "image",
        "height",
        "width",
        "guidance_scale",
    ]
)

IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"])

# Image variation additionally conditioned on a text prompt.
TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset(
    [
        "prompt",
        "image",
        "height",
        "width",
        "guidance_scale",
        "negative_prompt",
        "prompt_embeds",
        "negative_prompt_embeds",
    ]
)

TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"])

TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset(
    [
        # Text guided image variation with an image mask
        "prompt",
        "image",
        "mask_image",
        "height",
        "width",
        "guidance_scale",
        "negative_prompt",
        "prompt_embeds",
        "negative_prompt_embeds",
    ]
)

TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"])

IMAGE_INPAINTING_PARAMS = frozenset(
    [
        # image variation with an image mask
        "image",
        "mask_image",
        "height",
        "width",
        "guidance_scale",
    ]
)

IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"])

# Inpainting guided by an example image rather than text (e.g. Paint-by-Example).
IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset(
    [
        "example_image",
        "image",
        "mask_image",
        "height",
        "width",
        "guidance_scale",
    ]
)

IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"])

# Class-conditional generation (e.g. DiT) takes integer class labels.
CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"])

CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS = frozenset(["class_labels"])

# Unconditional generation has no per-sample inputs; only a batch size.
UNCONDITIONAL_IMAGE_GENERATION_PARAMS = frozenset(["batch_size"])

UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS = frozenset([])

UNCONDITIONAL_AUDIO_GENERATION_PARAMS = frozenset(["batch_size"])

UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([])

# Text-to-audio (e.g. AudioLDM): like text-to-image but with a duration
# argument (`audio_length_in_s`) in place of height/width.
TEXT_TO_AUDIO_PARAMS = frozenset(
    [
        "prompt",
        "audio_length_in_s",
        "guidance_scale",
        "negative_prompt",
        "prompt_embeds",
        "negative_prompt_embeds",
        "cross_attention_kwargs",
    ]
)

TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])

# Token-to-audio generation (e.g. spectrogram diffusion) takes raw input tokens.
TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"])

TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"])