Mirror of https://github.com/huggingface/diffusers.git (synced 2026-01-27 17:22:53 +03:00)
@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
 Before you begin, make sure you have the following libraries installed:
 
 ```py
 !pip install -q -U diffusers transformers accelerate
 ```
 
-The are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
+There are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.

@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
 + frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
 ```
 
-Using all these tricks togethere should lower the memory requirement to less than 8GB VRAM.
+Using all these tricks together should lower the memory requirement to less than 8GB VRAM.
 
 ## Micro-conditioning
 
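For context on the guide edited above, here is a rough end-to-end sketch of the SVD-XT workflow with the memory-saving options the surrounding text refers to (model CPU offload, forward chunking, a small `decode_chunk_size`). It is illustrative only and not part of this commit; the input image URL, seed, and output path are placeholders.

```py
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

# Load the 25-frame SVD-XT variant in half precision.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
)
pipe.enable_model_cpu_offload()      # keep only the active sub-model on the GPU
pipe.unet.enable_forward_chunking()  # run the temporal feed-forward layers in chunks

# Placeholder conditioning image; SVD expects roughly 1024x576 input.
image = load_image("https://example.com/input.png").resize((1024, 576))

generator = torch.manual_seed(42)
# decode_chunk_size=2 decodes the video latents a couple of frames at a time instead of all at once.
frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
export_to_video(frames, "generated.mp4", fps=7)
```
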
@@ -48,7 +48,7 @@ class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
 Tokenizer of class
 [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
 prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 text_proj ([`UnCLIPTextProjModel`]):
 Utility class to prepare and combine the embeddings before they are passed to the decoder.
 decoder ([`UNet2DConditionModel`]):

@@ -129,7 +129,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
 movq ([`VQModel`]):
 MoVQ Decoder to generate the image from the latents.
 prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 prior_image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 prior_text_encoder ([`CLIPTextModelWithProjection`]):

@@ -346,7 +346,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
 movq ([`VQModel`]):
 MoVQ Decoder to generate the image from the latents.
 prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 prior_image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 prior_text_encoder ([`CLIPTextModelWithProjection`]):

@@ -586,7 +586,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
 movq ([`VQModel`]):
 MoVQ Decoder to generate the image from the latents.
 prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 prior_image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 prior_text_encoder ([`CLIPTextModelWithProjection`]):

@@ -134,7 +134,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
 
 Args:
 prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 text_encoder ([`CLIPTextModelWithProjection`]):

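Several of the docstrings in this commit describe the same two-stage unCLIP design: a prior that maps the text embedding to an image embedding, followed by a decoder that turns that embedding into pixels. As an illustrative sketch (not part of the commit), this is how the Kandinsky 2.1 prior and decoder pipelines are typically wired together; the prompt and generation settings are examples only.

```py
import torch
from diffusers import KandinskyPriorPipeline, KandinskyPipeline

prompt = "a lion in a space suit, 4k photo"

# Stage 1: the unCLIP prior approximates the image embedding from the text embedding.
pipe_prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
).to("cuda")
image_embeds, negative_image_embeds = pipe_prior(prompt).to_tuple()

# Stage 2: the decoder (UNet + MoVQ) turns the image embedding into an image.
pipe = KandinskyPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
).to("cuda")
image = pipe(
    prompt=prompt,
    image_embeds=image_embeds,
    negative_image_embeds=negative_image_embeds,
    height=768,
    width=768,
).images[0]
```
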
@@ -119,7 +119,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
 movq ([`VQModel`]):
 MoVQ Decoder to generate the image from the latents.
 prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 prior_image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 prior_text_encoder ([`CLIPTextModelWithProjection`]):

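The `KandinskyV22CombinedPipeline` documented above bundles the prior components (`prior_prior`, `prior_image_encoder`, `prior_text_encoder`) and the decoder components (`unet`, `movq`) behind a single call. A minimal sketch, assuming the standard Kandinsky 2.2 checkpoints; the prompt and settings are illustrative:

```py
import torch
from diffusers import AutoPipelineForText2Image

# Loading the decoder repo through AutoPipeline resolves to the combined pipeline,
# which pulls in the matching prior automatically.
pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

image = pipe(
    "portrait of a young woman, blue eyes, cinematic lighting",
    num_inference_steps=25,
).images[0]
```
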
@@ -346,7 +346,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
 movq ([`VQModel`]):
 MoVQ Decoder to generate the image from the latents.
 prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 prior_image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 prior_text_encoder ([`CLIPTextModelWithProjection`]):

@@ -594,7 +594,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
 movq ([`VQModel`]):
 MoVQ Decoder to generate the image from the latents.
 prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 prior_image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 prior_text_encoder ([`CLIPTextModelWithProjection`]):

@@ -90,7 +90,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
 
 Args:
 prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 text_encoder ([`CLIPTextModelWithProjection`]):

@@ -108,7 +108,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
 
 Args:
 prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 image_encoder ([`CLIPVisionModelWithProjection`]):
 Frozen image-encoder.
 text_encoder ([`CLIPTextModelWithProjection`]):

@@ -86,7 +86,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
 
 Args:
 prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 image_encoder ([`~transformers.CLIPVisionModel`]):
 Frozen image-encoder.
 image_processor ([`~transformers.CLIPImageProcessor`]):

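As above, the `ShapEImg2ImgPipeline` docstring describes a prior that maps a CLIP image embedding to a latent, here a 3D latent that is rendered into a set of views. A rough usage sketch, assuming the `openai/shap-e-img2img` checkpoint and typical settings from the Shap-E docs; the input image URL is a placeholder:

```py
import torch
from diffusers import ShapEImg2ImgPipeline
from diffusers.utils import export_to_gif, load_image

pipe = ShapEImg2ImgPipeline.from_pretrained(
    "openai/shap-e-img2img", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

# Placeholder: any single-object RGB image.
image = load_image("https://example.com/object.png")

# The prior turns the image embedding into a 3D latent; the renderer produces a set of views.
frames = pipe(image, guidance_scale=3.0, num_inference_steps=64, frame_size=256).images[0]
export_to_gif(frames, "object_3d.gif")
```
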
@@ -700,8 +700,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> init_image = Image.open(requests.get(url, stream=True).raw)
 >>> prompt = "two tigers"
->>> n_propmt = "bad, deformed, ugly, bad anotomy"
->>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_propmt, strength=0.7).images[0]
+>>> n_prompt = "bad, deformed, ugly, bad anotomy"
+>>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
 ```
 
 Returns:

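The docstring example touched by this hunk omits the pipeline setup. For completeness, a sketch of the usual surrounding code, assuming the standard `stabilityai/stable-diffusion-2-depth` checkpoint that this pipeline is documented with:

```py
import torch
import requests
from PIL import Image
from diffusers import StableDiffusionDepth2ImgPipeline

pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-depth", torch_dtype=torch.float16
).to("cuda")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
init_image = Image.open(requests.get(url, stream=True).raw)
prompt = "two tigers"
n_prompt = "bad, deformed, ugly, bad anatomy"
# strength controls how much of the original (depth-conditioned) image is preserved.
image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
```
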
@@ -194,7 +194,7 @@ class StableDiffusionInstructPix2PixPipeline(
 A higher guidance scale value encourages the model to generate images closely linked to the text
 `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
 image_guidance_scale (`float`, *optional*, defaults to 1.5):
-Push the generated image towards the inital `image`. Image guidance scale is enabled by setting
+Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
 `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
 linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
 value of at least `1`.

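To make the `guidance_scale` / `image_guidance_scale` trade-off described above concrete, here is a short illustrative sketch with the public `timbrooks/instruct-pix2pix` checkpoint; the input image URL and edit instruction are placeholders:

```py
import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.utils import load_image

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")

image = load_image("https://example.com/mountain.png")

# guidance_scale pulls the edit toward the text instruction;
# image_guidance_scale (> 1) keeps the result close to the initial image.
edited = pipe(
    "make the mountains snowy",
    image=image,
    num_inference_steps=20,
    guidance_scale=7.5,
    image_guidance_scale=1.5,
).images[0]
```
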
@@ -76,7 +76,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
 prior_text_encoder ([`CLIPTextModelWithProjection`]):
 Frozen [`CLIPTextModelWithProjection`] text-encoder.
 prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
 prior_scheduler ([`KarrasDiffusionSchedulers`]):
 Scheduler used in the prior denoising process.
 image_normalizer ([`StableUnCLIPImageNormalizer`]):

@@ -659,7 +659,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
 usually at the expense of lower image quality.
 image_guidance_scale (`float`, *optional*, defaults to 1.5):
-Image guidance scale is to push the generated image towards the inital image `image`. Image guidance
+Image guidance scale is to push the generated image towards the initial image `image`. Image guidance
 scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to
 generate images that are closely linked to the source image `image`, usually at the expense of lower
 image quality. This pipeline requires a value of at least `1`.

@@ -438,7 +438,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

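The same two-branch comment is fixed in each of the scheduler hunks that follow: `add_noise` uses `step_index` when it is called after the first denoising step (inpainting) and `begin_index` when it is called once up front to noise the initial latents (img2img). A hypothetical sketch of the img2img path, using `EulerDiscreteScheduler` only as an example; the tensor shapes and `strength` value are placeholders, not code from this commit:

```py
import torch
from diffusers import EulerDiscreteScheduler

scheduler = EulerDiscreteScheduler()            # default config, example only
scheduler.set_timesteps(num_inference_steps=30)

strength = 0.7
t_start = int(len(scheduler.timesteps) * (1 - strength))
scheduler.set_begin_index(t_start)              # denoising will start at this index

init_latents = torch.randn(1, 4, 64, 64)        # stand-in for VAE-encoded image latents
noise = torch.randn_like(init_latents)
first_timestep = scheduler.timesteps[t_start]

# Called once *before* the denoising loop, so the scheduler takes the `begin_index`
# branch above to pick the sigma matching the starting timestep.
noisy_latents = scheduler.add_noise(init_latents, noise, first_timestep.unsqueeze(0))
```
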
@@ -775,7 +775,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -1018,7 +1018,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -547,7 +547,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -968,7 +968,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -673,7 +673,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -371,7 +371,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -471,7 +471,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -566,7 +566,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -472,7 +472,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -498,7 +498,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -473,7 +473,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -465,7 +465,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()

@@ -869,7 +869,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
 # add_noise is called after first denoising step (for inpainting)
 step_indices = [self.step_index] * timesteps.shape[0]
 else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
 step_indices = [self.begin_index] * timesteps.shape[0]
 
 sigma = sigmas[step_indices].flatten()