mirror of https://github.com/huggingface/diffusers.git synced 2026-01-27 17:22:53 +03:00
* Fix typos

* Fix typo in SVD.md

Authored by M. Tolga Cangöz on 2024-03-21 04:46:47 +03:00, committed by GitHub
parent b536f39818
commit 3028089e5e
26 changed files with 32 additions and 32 deletions

@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
Before you begin, make sure you have the following libraries installed:
```py
!pip install -q -U diffusers transformers accelerate
```
-The are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
+There are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
+ frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
```
-Using all these tricks togethere should lower the memory requirement to less than 8GB VRAM.
+Using all these tricks together should lower the memory requirement to less than 8GB VRAM.
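Putting the tricks this hunk refers to together, here is a minimal low-VRAM SVD sketch (the checkpoint is the SVD-XT variant linked above; the rocket image URL is the example asset from the diffusers docs, and exact savings depend on hardware):
```py
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
)
# Trick 1: offload submodules to the CPU while they are idle.
pipe.enable_model_cpu_offload()
# Trick 2: chunk the UNet feed-forward over frames to cut peak memory.
pipe.unet.enable_forward_chunking()

# Example conditioning image (asset used in the diffusers SVD docs).
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
).resize((1024, 576))

generator = torch.manual_seed(42)
# Trick 3: decode_chunk_size=2 decodes two frames at a time instead of all 25.
frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
export_to_video(frames, "generated.mp4", fps=7)
```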
## Micro-conditioning

@@ -48,7 +48,7 @@ class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
text_proj ([`UnCLIPTextProjModel`]):
Utility class to prepare and combine the embeddings before they are passed to the decoder.
decoder ([`UNet2DConditionModel`]):

@@ -129,7 +129,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
movq ([`VQModel`]):
MoVQ Decoder to generate the image from the latents.
prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -346,7 +346,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
movq ([`VQModel`]):
MoVQ Decoder to generate the image from the latents.
prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -586,7 +586,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
movq ([`VQModel`]):
MoVQ Decoder to generate the image from the latents.
prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
prior_text_encoder ([`CLIPTextModelWithProjection`]):

@@ -134,7 +134,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
Args:
prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
text_encoder ([`CLIPTextModelWithProjection`]):

@@ -119,7 +119,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
movq ([`VQModel`]):
MoVQ Decoder to generate the image from the latents.
prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -346,7 +346,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
movq ([`VQModel`]):
MoVQ Decoder to generate the image from the latents.
prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -594,7 +594,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
movq ([`VQModel`]):
MoVQ Decoder to generate the image from the latents.
prior_prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
prior_text_encoder ([`CLIPTextModelWithProjection`]):

@@ -90,7 +90,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
Args:
prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
text_encoder ([`CLIPTextModelWithProjection`]):
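Since this prior docstring recurs across the Kandinsky pipelines in this commit, a minimal sketch of the two-stage flow it describes may help (model IDs are the kandinsky-community checkpoints on the Hub; the prompt, guidance value, and output size are illustrative):
```py
import torch
from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline

# Stage 1: the unCLIP prior maps the text embedding to a CLIP image embedding.
pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")
image_embeds, negative_image_embeds = pipe_prior(
    "a portrait of a red cat, 4k photo", guidance_scale=1.0
).to_tuple()

# Stage 2: the decoder (UNet + MoVQ) turns the predicted embedding into pixels.
pipe = KandinskyV22Pipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
).to("cuda")
image = pipe(
    image_embeds=image_embeds,
    negative_image_embeds=negative_image_embeds,
    height=768,
    width=768,
).images[0]
image.save("red_cat.png")
```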

@@ -108,7 +108,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
Args:
prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
image_encoder ([`CLIPVisionModelWithProjection`]):
Frozen image-encoder.
text_encoder ([`CLIPTextModelWithProjection`]):

@@ -86,7 +86,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
Args:
prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
image_encoder ([`~transformers.CLIPVisionModel`]):
Frozen image-encoder.
image_processor ([`~transformers.CLIPImageProcessor`]):

@@ -700,8 +700,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> init_image = Image.open(requests.get(url, stream=True).raw)
>>> prompt = "two tigers"
->>> n_propmt = "bad, deformed, ugly, bad anotomy"
->>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_propmt, strength=0.7).images[0]
+>>> n_prompt = "bad, deformed, ugly, bad anotomy"
+>>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
```
Returns:

@@ -194,7 +194,7 @@ class StableDiffusionInstructPix2PixPipeline(
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
image_guidance_scale (`float`, *optional*, defaults to 1.5):
-Push the generated image towards the inital `image`. Image guidance scale is enabled by setting
+Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
`image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
value of at least `1`.
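To make the interplay of the two scales concrete, a minimal sketch (the `timbrooks/instruct-pix2pix` checkpoint is the reference model for this pipeline; the input path and prompt are illustrative):
```py
import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.utils import load_image

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")

image = load_image("input.png")  # illustrative: any RGB image to edit

# guidance_scale pulls the result toward the edit instruction, while
# image_guidance_scale (> 1) pulls it back toward the original image.
edited = pipe(
    "turn the sky into a sunset",
    image=image,
    guidance_scale=7.5,
    image_guidance_scale=1.5,
).images[0]
edited.save("edited.png")
```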

@@ -76,7 +76,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
prior_text_encoder ([`CLIPTextModelWithProjection`]):
Frozen [`CLIPTextModelWithProjection`] text-encoder.
prior ([`PriorTransformer`]):
-The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+The canonical unCLIP prior to approximate the image embedding from the text embedding.
prior_scheduler ([`KarrasDiffusionSchedulers`]):
Scheduler used in the prior denoising process.
image_normalizer ([`StableUnCLIPImageNormalizer`]):

@@ -659,7 +659,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
image_guidance_scale (`float`, *optional*, defaults to 1.5):
-Image guidance scale is to push the generated image towards the inital image `image`. Image guidance
+Image guidance scale is to push the generated image towards the initial image `image`. Image guidance
scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to
generate images that are closely linked to the source image `image`, usually at the expense of lower
image quality. This pipeline requires a value of at least `1`.

@@ -438,7 +438,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()
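This branch (and the identical comment fix) recurs in every scheduler below. Schematically, the logic works as in this simplified sketch; the real schedulers read `self.step_index`/`self.begin_index` and also resolve indices from the timesteps when neither is set:
```py
import torch

def add_noise_sketch(sample, noise, sigmas, timesteps, step_index=None, begin_index=0):
    if step_index is not None:
        # add_noise is called after the first denoising step (inpainting),
        # so the scheduler already tracks its current position.
        step_indices = [step_index] * timesteps.shape[0]
    else:
        # add_noise is called before the first denoising step (img2img),
        # so fall back to the configured begin index for the initial latent.
        step_indices = [begin_index] * timesteps.shape[0]
    sigma = sigmas[step_indices].flatten()
    # Broadcast sigma over the sample dimensions, then scale the noise.
    while len(sigma.shape) < len(sample.shape):
        sigma = sigma.unsqueeze(-1)
    return sample + noise * sigma
```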

@@ -775,7 +775,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -1018,7 +1018,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -547,7 +547,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -968,7 +968,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -673,7 +673,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -371,7 +371,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -471,7 +471,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -566,7 +566,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -472,7 +472,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -498,7 +498,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -473,7 +473,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -465,7 +465,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()

@@ -869,7 +869,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
# add_noise is called after first denoising step (for inpainting)
step_indices = [self.step_index] * timesteps.shape[0]
else:
-# add noise is called bevore first denoising step to create inital latent(img2img)
+# add noise is called before first denoising step to create initial latent(img2img)
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices].flatten()