From 7071b7461b224bdc82b9dd2bde2c1842320ccc66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= <46008593+tolgacangoz@users.noreply.github.com> Date: Sat, 3 Aug 2024 07:24:25 +0300 Subject: [PATCH] Errata: Fix typos & `\s+$` (#9008) * Fix typos * chore: Fix typos * chore: Update README.md for promptdiffusion example * Trim trailing white spaces * Fix a typo * update number * chore: update number * Trim trailing white space * Update README.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update README.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- .github/workflows/benchmark.yml | 6 ++-- .../workflows/mirror_community_pipeline.yml | 4 +-- CONTRIBUTING.md | 2 +- README.md | 4 +-- docs/source/en/api/pipelines/aura_flow.md | 2 +- docs/source/en/api/pipelines/flux.md | 28 +++++++++---------- docs/source/en/api/pipelines/lumina.md | 4 +-- docs/source/en/api/pipelines/stable_audio.md | 2 +- docs/source/en/tutorials/fast_diffusion.md | 4 +-- .../en/tutorials/inference_with_big_models.md | 10 +++---- docs/source/en/using-diffusers/pag.md | 16 +++++------ docs/source/ko/conceptual/philosophy.md | 2 +- docs/source/ko/using-diffusers/sdxl_turbo.md | 4 +-- docs/source/ko/using-diffusers/svd.md | 2 +- examples/community/fresco_v2v.py | 2 +- ...e_stable_diffusion_xl_instandid_img2img.py | 2 +- .../pipeline_stable_diffusion_xl_instantid.py | 2 +- examples/community/rerender_a_video.py | 4 +-- .../stable_diffusion_controlnet_reference.py | 2 +- examples/dreambooth/README_sd3.md | 8 +++--- .../promptdiffusion/README.md | 1 - ...t_original_promptdiffusion_to_diffusers.py | 2 +- .../pipeline_prompt_diffusion.py | 2 +- .../sd3_lora_colab/README.md | 12 ++++---- ...onvert_original_controlnet_to_diffusers.py | 2 +- ..._original_stable_diffusion_to_diffusers.py | 2 +- src/diffusers/models/embeddings.py | 4 +-- 
src/diffusers/optimization.py | 2 +- .../controlnet/pipeline_controlnet.py | 2 +- .../controlnet/pipeline_controlnet_img2img.py | 2 +- .../controlnet/pipeline_controlnet_inpaint.py | 2 +- .../pipeline_controlnet_inpaint_sd_xl.py | 2 +- .../controlnet/pipeline_controlnet_sd_xl.py | 2 +- .../pipeline_controlnet_sd_xl_img2img.py | 2 +- .../pag/pipeline_pag_controlnet_sd.py | 2 +- utils/tests_fetcher.py | 10 +++---- 36 files changed, 80 insertions(+), 81 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a85adfc2bf..a8987d177b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -13,13 +13,13 @@ env: jobs: torch_pipelines_cuda_benchmark_tests: - env: + env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_BENCHMARK }} name: Torch Core Pipelines CUDA Benchmarking Tests strategy: fail-fast: false max-parallel: 1 - runs-on: + runs-on: group: aws-g6-4xlarge-plus container: image: diffusers/diffusers-pytorch-compile-cuda @@ -59,7 +59,7 @@ jobs: if: ${{ success() }} run: | pip install requests && python utils/notify_benchmarking_status.py --status=success - + - name: Report failure status if: ${{ failure() }} run: | diff --git a/.github/workflows/mirror_community_pipeline.yml b/.github/workflows/mirror_community_pipeline.yml index e1028c77b7..a7a2a809bb 100644 --- a/.github/workflows/mirror_community_pipeline.yml +++ b/.github/workflows/mirror_community_pipeline.yml @@ -24,7 +24,7 @@ jobs: mirror_community_pipeline: env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_COMMUNITY_MIRROR }} - + runs-on: ubuntu-latest steps: # Checkout to correct ref @@ -95,7 +95,7 @@ jobs: if: ${{ success() }} run: | pip install requests && python utils/notify_community_pipelines_mirror.py --status=success - + - name: Report failure status if: ${{ failure() }} run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 16acc87dde..0aa2a77dbc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -63,7 +63,7 @@ In 
the same spirit, you are of immense help to the community by answering such q **Please** keep in mind that the more effort you put into asking or answering a question, the higher the quality of the publicly documented knowledge. In the same way, well-posed and well-answered questions create a high-quality knowledge database accessible to everybody, while badly posed questions or answers reduce the overall quality of the public knowledge database. -In short, a high quality question or answer is *precise*, *concise*, *relevant*, *easy-to-understand*, *accessible*, and *well-formated/well-posed*. For more information, please have a look through the [How to write a good issue](#how-to-write-a-good-issue) section. +In short, a high quality question or answer is *precise*, *concise*, *relevant*, *easy-to-understand*, *accessible*, and *well-formatted/well-posed*. For more information, please have a look through the [How to write a good issue](#how-to-write-a-good-issue) section. **NOTE about channels**: [*The forum*](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) is much better indexed by search engines, such as Google. Posts are ranked by popularity rather than chronologically. Hence, it's easier to look up questions and answers that we posted some time ago. diff --git a/README.md b/README.md index 013f306bbb..2d3cd52e7e 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi ## Quickstart -Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 27.000+ checkpoints): +Generating outputs is super easy with 🤗 Diffusers. 
To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 30,000+ checkpoints): ```python from diffusers import DiffusionPipeline @@ -209,7 +209,7 @@ Also, say 👋 in our public Discord channel [!TIP] -> The results reported below are from a 80GB 400W A100 with its clock rate set to the maximum. +> The results reported below are from a 80GB 400W A100 with its clock rate set to the maximum. > If you're interested in the full benchmarking code, take a look at [huggingface/diffusion-fast](https://github.com/huggingface/diffusion-fast). @@ -168,7 +168,7 @@ Using SDPA attention and compiling both the UNet and VAE cuts the latency from 3 > [!TIP] -> From PyTorch 2.3.1, you can control the caching behavior of `torch.compile()`. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial. +> From PyTorch 2.3.1, you can control the caching behavior of `torch.compile()`. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial. 
### Prevent graph breaks diff --git a/docs/source/en/tutorials/inference_with_big_models.md b/docs/source/en/tutorials/inference_with_big_models.md index b3d1067cfc..6700bbad07 100644 --- a/docs/source/en/tutorials/inference_with_big_models.md +++ b/docs/source/en/tutorials/inference_with_big_models.md @@ -18,13 +18,13 @@ A modern diffusion model, like [Stable Diffusion XL (SDXL)](../using-diffusers/s * Two text encoders * A UNet for denoising -Usually, the text encoders and the denoiser are much larger compared to the VAE. +Usually, the text encoders and the denoiser are much larger compared to the VAE. As models get bigger and better, it’s possible your model is so big that even a single copy won’t fit in memory. But that doesn’t mean it can’t be loaded. If you have more than one GPU, there is more memory available to store your model. In this case, it’s better to split your model checkpoint into several smaller *checkpoint shards*. When a text encoder checkpoint has multiple shards, like [T5-xxl for SD3](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers/tree/main/text_encoder_3), it is automatically handled by the [Transformers](https://huggingface.co/docs/transformers/index) library as it is a required dependency of Diffusers when using the [`StableDiffusion3Pipeline`]. More specifically, Transformers will automatically handle the loading of multiple shards within the requested model class and get it ready so that inference can be performed. -The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library. +The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library. 
> [!TIP] > Refer to the [Handling big models for inference](https://huggingface.co/docs/accelerate/main/en/concept_guides/big_model_inference) guide for general guidance when working with big models that are hard to fit into memory. @@ -43,7 +43,7 @@ unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB") The size of the fp32 variant of the SDXL UNet checkpoint is ~10.4GB. Set the `max_shard_size` parameter to 5GB to create 3 shards. After saving, you can load them in [`StableDiffusionXLPipeline`]: ```python -from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline +from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline import torch unet = UNet2DConditionModel.from_pretrained( @@ -57,14 +57,14 @@ image = pipeline("a cute dog running on the grass", num_inference_steps=30).imag image.save("dog.png") ``` -If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you: +If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you: ```diff - pipeline.to("cuda") + pipeline.enable_model_cpu_offload() ``` -In general, we recommend sharding when a checkpoint is more than 5GB (in fp32). +In general, we recommend sharding when a checkpoint is more than 5GB (in fp32). 
## Device placement diff --git a/docs/source/en/using-diffusers/pag.md b/docs/source/en/using-diffusers/pag.md index e852aec03f..26961d959c 100644 --- a/docs/source/en/using-diffusers/pag.md +++ b/docs/source/en/using-diffusers/pag.md @@ -130,10 +130,10 @@ prompt = "a dog catching a frisbee in the jungle" generator = torch.Generator(device="cpu").manual_seed(0) image = pipeline( - prompt, - image=init_image, - strength=0.8, - guidance_scale=guidance_scale, + prompt, + image=init_image, + strength=0.8, + guidance_scale=guidance_scale, pag_scale=pag_scale, generator=generator).images[0] ``` @@ -161,14 +161,14 @@ pipeline_inpaint = AutoPipelineForInpaiting.from_pretrained("stabilityai/stable- pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_inpaint, enable_pag=True) ``` -This still works when your pipeline has a different task: +This still works when your pipeline has a different task: ```py pipeline_t2i = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16) pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_t2i, enable_pag=True) ``` -Let's generate an image! +Let's generate an image! ```py img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" @@ -258,7 +258,7 @@ for pag_scale in [0.0, 3.0]: -## PAG with IP-Adapter +## PAG with IP-Adapter [IP-Adapter](https://hf.co/papers/2308.06721) is a popular model that can be plugged into diffusion models to enable image prompting without any changes to the underlying model. You can enable PAG on a pipeline with IP-Adapter loaded. @@ -317,7 +317,7 @@ PAG reduces artifacts and improves the overall compposition. 
-## Configure parameters +## Configure parameters ### pag_applied_layers diff --git a/docs/source/ko/conceptual/philosophy.md b/docs/source/ko/conceptual/philosophy.md index 5d49c075a1..fab2a4d6d3 100644 --- a/docs/source/ko/conceptual/philosophy.md +++ b/docs/source/ko/conceptual/philosophy.md @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# μ² ν•™ [[philosophy]] +# μ² ν•™ [[philosophy]] 🧨 DiffusersλŠ” λ‹€μ–‘ν•œ λͺ¨λ‹¬λ¦¬ν‹°μ—μ„œ **μ΅œμ‹ μ˜** 사전 ν›ˆλ ¨λœ diffusion λͺ¨λΈμ„ μ œκ³΅ν•©λ‹ˆλ‹€. κ·Έ λͺ©μ μ€ μΆ”λ‘ κ³Ό ν›ˆλ ¨μ„ μœ„ν•œ **λͺ¨λ“ˆμ‹ νˆ΄λ°•μŠ€**둜 μ‚¬μš©λ˜λŠ” κ²ƒμž…λ‹ˆλ‹€. diff --git a/docs/source/ko/using-diffusers/sdxl_turbo.md b/docs/source/ko/using-diffusers/sdxl_turbo.md index 766ac0f10a..99b96fd3b8 100644 --- a/docs/source/ko/using-diffusers/sdxl_turbo.md +++ b/docs/source/ko/using-diffusers/sdxl_turbo.md @@ -52,7 +52,7 @@ pipeline = pipeline.to("cuda") Text-to-image의 경우 ν…μŠ€νŠΈ ν”„λ‘¬ν”„νŠΈλ₯Ό μ „λ‹¬ν•©λ‹ˆλ‹€. 기본적으둜 SDXL TurboλŠ” 512x512 이미지λ₯Ό μƒμ„±ν•˜λ©°, 이 ν•΄μƒλ„μ—μ„œ μ΅œμƒμ˜ κ²°κ³Όλ₯Ό μ œκ³΅ν•©λ‹ˆλ‹€. `height` 및 `width` 맀개 λ³€μˆ˜λ₯Ό 768x768 λ˜λŠ” 1024x1024둜 μ„€μ •ν•  수 μžˆμ§€λ§Œ 이 경우 ν’ˆμ§ˆ μ €ν•˜λ₯Ό μ˜ˆμƒν•  수 μžˆμŠ΅λ‹ˆλ‹€. -λͺ¨λΈμ΄ `guidance_scale` 없이 ν•™μŠ΅λ˜μ—ˆμœΌλ―€λ‘œ 이λ₯Ό 0.0으둜 μ„€μ •ν•΄ λΉ„ν™œμ„±ν™”ν•΄μ•Ό ν•©λ‹ˆλ‹€. 단일 μΆ”λ‘  μŠ€ν…λ§ŒμœΌλ‘œλ„ κ³ ν’ˆμ§ˆ 이미지λ₯Ό 생성할 수 μžˆμŠ΅λ‹ˆλ‹€. +λͺ¨λΈμ΄ `guidance_scale` 없이 ν•™μŠ΅λ˜μ—ˆμœΌλ―€λ‘œ 이λ₯Ό 0.0으둜 μ„€μ •ν•΄ λΉ„ν™œμ„±ν™”ν•΄μ•Ό ν•©λ‹ˆλ‹€. 단일 μΆ”λ‘  μŠ€ν…λ§ŒμœΌλ‘œλ„ κ³ ν’ˆμ§ˆ 이미지λ₯Ό 생성할 수 μžˆμŠ΅λ‹ˆλ‹€. μŠ€ν… 수λ₯Ό 2, 3 λ˜λŠ” 4둜 늘리면 이미지 ν’ˆμ§ˆμ΄ ν–₯μƒλ©λ‹ˆλ‹€. ```py @@ -74,7 +74,7 @@ image ## Image-to-image -Image-to-image μƒμ„±μ˜ 경우 `num_inference_steps * strength`κ°€ 1보닀 ν¬κ±°λ‚˜ 같은지 ν™•μΈν•˜μ„Έμš”. +Image-to-image μƒμ„±μ˜ 경우 `num_inference_steps * strength`κ°€ 1보닀 ν¬κ±°λ‚˜ 같은지 ν™•μΈν•˜μ„Έμš”. 
Image-to-image νŒŒμ΄ν”„λΌμΈμ€ μ•„λž˜ μ˜ˆμ œμ—μ„œ `0.5 * 2.0 = 1` μŠ€ν…κ³Ό 같이 `int(num_inference_steps * strength)` μŠ€ν…μœΌλ‘œ μ‹€ν–‰λ©λ‹ˆλ‹€. ```py diff --git a/docs/source/ko/using-diffusers/svd.md b/docs/source/ko/using-diffusers/svd.md index 678e21728a..7c5b9f09e6 100644 --- a/docs/source/ko/using-diffusers/svd.md +++ b/docs/source/ko/using-diffusers/svd.md @@ -21,7 +21,7 @@ specific language governing permissions and limitations under the License. μ‹œμž‘ν•˜κΈ° 전에 λ‹€μŒ λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ–΄ μžˆλŠ”μ§€ ν™•μΈν•˜μ„Έμš”: ```py -!pip install -q -U diffusers transformers accelerate +!pip install -q -U diffusers transformers accelerate ``` 이 λͺ¨λΈμ—λŠ” [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid)와 [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) 두 κ°€μ§€ μ’…λ₯˜κ°€ μžˆμŠ΅λ‹ˆλ‹€. SVD μ²΄ν¬ν¬μΈνŠΈλŠ” 14개의 ν”„λ ˆμž„μ„ μƒμ„±ν•˜λ„λ‘ ν•™μŠ΅λ˜μ—ˆκ³ , SVD-XT μ²΄ν¬ν¬μΈνŠΈλŠ” 25개의 ν”„λ ˆμž„μ„ μƒμ„±ν•˜λ„λ‘ νŒŒμΈνŠœλ‹λ˜μ—ˆμŠ΅λ‹ˆλ‹€. diff --git a/examples/community/fresco_v2v.py b/examples/community/fresco_v2v.py index 5a6ae9d1de..ab191ecf0d 100644 --- a/examples/community/fresco_v2v.py +++ b/examples/community/fresco_v2v.py @@ -2436,7 +2436,7 @@ class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline): ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py b/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py index fb46ff3f38..7aeba79ae9 100644 --- a/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py +++ b/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py @@ -1002,7 +1002,7 @@ class StableDiffusionXLInstantIDImg2ImgPipeline(StableDiffusionXLControlNetImg2I ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/examples/community/pipeline_stable_diffusion_xl_instantid.py b/examples/community/pipeline_stable_diffusion_xl_instantid.py index 6e77261f51..2eead8861e 100644 --- a/examples/community/pipeline_stable_diffusion_xl_instantid.py +++ b/examples/community/pipeline_stable_diffusion_xl_instantid.py @@ -991,7 +991,7 @@ class StableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline): ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/examples/community/rerender_a_video.py b/examples/community/rerender_a_video.py index 6e25b92603..d9c616ab5e 100644 --- a/examples/community/rerender_a_video.py +++ b/examples/community/rerender_a_video.py @@ -864,7 +864,7 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline): ) if guess_mode and do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] @@ -1038,7 +1038,7 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline): ) if guess_mode and do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. down_block_res_samples = [ diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index e4d8e12f85..577c7712e7 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -752,7 +752,7 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli ) if guess_mode and do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/examples/dreambooth/README_sd3.md b/examples/dreambooth/README_sd3.md index 052e383ef6..6f41c39562 100644 --- a/examples/dreambooth/README_sd3.md +++ b/examples/dreambooth/README_sd3.md @@ -148,12 +148,12 @@ accelerate launch train_dreambooth_lora_sd3.py \ ``` ### Text Encoder Training -Alongside the transformer, LoRA fine-tuning of the CLIP text encoders is now also supported. +Alongside the transformer, LoRA fine-tuning of the CLIP text encoders is now also supported. To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind: > [!NOTE] -> SD3 has three text encoders (CLIP L/14, OpenCLIP bigG/14, and T5-v1.1-XXL). -By enabling `--train_text_encoder`, LoRA fine-tuning of both **CLIP encoders** is performed. At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled. +> SD3 has three text encoders (CLIP L/14, OpenCLIP bigG/14, and T5-v1.1-XXL). +By enabling `--train_text_encoder`, LoRA fine-tuning of both **CLIP encoders** is performed. At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled. To perform DreamBooth LoRA with text-encoder training, run: ```bash @@ -185,4 +185,4 @@ accelerate launch train_dreambooth_lora_sd3.py \ 1. We default to the "logit_normal" weighting scheme for the loss following the SD3 paper. Thanks to @bghira for helping us discover that for other weighting schemes supported from the training script, training may incur numerical instabilities. 2. Thanks to `bghira`, `JinxuXiang`, and `bendanzzc` for helping us discover a bug in how VAE encoding was being done previously. This has been fixed in [#8917](https://github.com/huggingface/diffusers/pull/8917). -3. 
Additionally, we now have the option to control if we want to apply preconditioning to the model outputs via a `--precondition_outputs` CLI arg. It affects how the model `target` is calculated as well. \ No newline at end of file +3. Additionally, we now have the option to control if we want to apply preconditioning to the model outputs via a `--precondition_outputs` CLI arg. It affects how the model `target` is calculated as well. \ No newline at end of file diff --git a/examples/research_projects/promptdiffusion/README.md b/examples/research_projects/promptdiffusion/README.md index 7d76b1baa3..33ffec3125 100644 --- a/examples/research_projects/promptdiffusion/README.md +++ b/examples/research_projects/promptdiffusion/README.md @@ -46,5 +46,4 @@ pipe.enable_model_cpu_offload() # generate image generator = torch.manual_seed(0) image = pipe("a tortoise", num_inference_steps=20, generator=generator, image_pair=[image_a,image_b], image=query).images[0] - ``` diff --git a/examples/research_projects/promptdiffusion/convert_original_promptdiffusion_to_diffusers.py b/examples/research_projects/promptdiffusion/convert_original_promptdiffusion_to_diffusers.py index 76b7b133ad..26b56a21e8 100644 --- a/examples/research_projects/promptdiffusion/convert_original_promptdiffusion_to_diffusers.py +++ b/examples/research_projects/promptdiffusion/convert_original_promptdiffusion_to_diffusers.py @@ -2051,7 +2051,7 @@ if __name__ == "__main__": default=512, type=int, help=( - "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2" + "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2" " Base. Use 768 for Stable Diffusion v2." 
), ) diff --git a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py index a0a068d0d1..cb4260d465 100644 --- a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py +++ b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py @@ -1253,7 +1253,7 @@ class PromptDiffusionPipeline( ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/examples/research_projects/sd3_lora_colab/README.md b/examples/research_projects/sd3_lora_colab/README.md index d90a1c9f0a..b7d7eedfb5 100644 --- a/examples/research_projects/sd3_lora_colab/README.md +++ b/examples/research_projects/sd3_lora_colab/README.md @@ -11,28 +11,28 @@ huggingface-cli login This will also allow us to push the trained model parameters to the Hugging Face Hub platform. -For setup, inference code, and details on how to run the code, please follow the Colab Notebook provided above. +For setup, inference code, and details on how to run the code, please follow the Colab Notebook provided above. ## How We make use of several techniques to make this possible: -* Compute the embeddings from the instance prompt and serialize them for later reuse. This is implemented in the [`compute_embeddings.py`](./compute_embeddings.py) script. We use an 8bit (as introduced in [`LLM.int8()`](https://arxiv.org/abs/2208.07339)) T5 to reduce memory requirements to ~10.5GB. +* Compute the embeddings from the instance prompt and serialize them for later reuse. This is implemented in the [`compute_embeddings.py`](./compute_embeddings.py) script. 
We use an 8bit (as introduced in [`LLM.int8()`](https://arxiv.org/abs/2208.07339)) T5 to reduce memory requirements to ~10.5GB. * In the `train_dreambooth_sd3_lora_miniature.py` script, we make use of: * 8bit Adam for optimization through the `bitsandbytes` library. * Gradient checkpointing and gradient accumulation. * FP16 precision. - * Flash attention through `F.scaled_dot_product_attention()`. + * Flash attention through `F.scaled_dot_product_attention()`. -Computing the text embeddings is arguably the most memory-intensive part in the pipeline as SD3 employs three text encoders. If we run them in FP32, it will take about 20GB of VRAM. With FP16, we are down to 12GB. +Computing the text embeddings is arguably the most memory-intensive part in the pipeline as SD3 employs three text encoders. If we run them in FP32, it will take about 20GB of VRAM. With FP16, we are down to 12GB. ## Gotchas This project is educational. It exists to showcase the possibility of fine-tuning a big diffusion system on consumer GPUs. But additional components might have to be added to obtain state-of-the-art performance. Below are some commonly known gotchas that users should be aware of: -* Training of text encoders is purposefully disabled. -* Techniques such as prior-preservation is unsupported. +* Training of text encoders is purposefully disabled. +* Techniques such as prior-preservation is unsupported. * Custom instance captions for instance images are unsupported, but this should be relatively easy to integrate. Hopefully, this project gives you a template to extend it further to suit your needs. 
diff --git a/scripts/convert_original_controlnet_to_diffusers.py b/scripts/convert_original_controlnet_to_diffusers.py index 44b22c33fe..92aad4f09e 100644 --- a/scripts/convert_original_controlnet_to_diffusers.py +++ b/scripts/convert_original_controlnet_to_diffusers.py @@ -42,7 +42,7 @@ if __name__ == "__main__": default=512, type=int, help=( - "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2" + "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2" " Base. Use 768 for Stable Diffusion v2." ), ) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index 58f0ad292e..7e7925b0a4 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -67,7 +67,7 @@ if __name__ == "__main__": default=None, type=int, help=( - "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2" + "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2" " Base. Use 768 for Stable Diffusion v2." ), ) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 2821ce0330..a81f9e17cd 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -302,7 +302,7 @@ def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real=True): If True, return real part and imaginary part separately. Otherwise, return complex numbers. Returns: - `torch.Tensor`: positional embdding with shape `( grid_size * grid_size, embed_dim/2)`. + `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`. 
""" start, stop = crops_coords grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32) @@ -902,7 +902,7 @@ class HunyuanCombinedTimestepTextSizeStyleEmbedding(nn.Module): pooled_projections = self.pooler(encoder_hidden_states) # (N, 1024) if self.use_style_cond_and_image_meta_size: - # extra condition2: image meta size embdding + # extra condition2: image meta size embedding image_meta_size = self.size_proj(image_meta_size.view(-1)) image_meta_size = image_meta_size.to(dtype=hidden_dtype) image_meta_size = image_meta_size.view(-1, 6 * 256) # (N, 1536) diff --git a/src/diffusers/optimization.py b/src/diffusers/optimization.py index fbaa143658..f20bd94edf 100644 --- a/src/diffusers/optimization.py +++ b/src/diffusers/optimization.py @@ -87,7 +87,7 @@ def get_piecewise_constant_schedule(optimizer: Optimizer, step_rules: str, last_ The optimizer for which to schedule the learning rate. step_rules (`string`): The rules for the learning rate. ex: rule_steps="1:10,0.1:20,0.01:30,0.005" it means that the learning rate - if multiple 1 for the first 10 steps, mutiple 0.1 for the next 20 steps, multiple 0.01 for the next 30 + if multiple 1 for the first 10 steps, multiple 0.1 for the next 20 steps, multiple 0.01 for the next 30 steps and multiple 0.005 for the other steps. last_epoch (`int`, *optional*, defaults to -1): The index of the last epoch when resuming training. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index b3d12f501e..9b2fefe7b0 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -1272,7 +1272,7 @@ class StableDiffusionControlNetPipeline( ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. 
# To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 4cc24a1cc1..2a4f46d619 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -1244,7 +1244,7 @@ class StableDiffusionControlNetImg2ImgPipeline( ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index aa46f4e9b6..9f7d464f9a 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -1408,7 +1408,7 @@ class StableDiffusionControlNetInpaintPipeline( ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 9a304401ba..017c1a6f74 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -1739,7 +1739,7 @@ class StableDiffusionXLControlNetInpaintPipeline( ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index a1a687b495..fdebcdf836 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -1487,7 +1487,7 @@ class StableDiffusionXLControlNetPipeline( ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index fac24a03df..bfc54426e7 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -1551,7 +1551,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline( ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py index 6dc21c9d45..9bac883b5c 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py @@ -1249,7 +1249,7 @@ class StableDiffusionControlNetPAGPipeline( ) if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. + # Inferred ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 352ac5defc..abdc9fd409 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -106,7 +106,7 @@ def checkout_commit(repo: Repo, commit_id: str): def clean_code(content: str) -> str: """ Remove docstrings, empty line or comments from some code (used to detect if a diff is real or only concern - comments or docstings). + comments or docstrings). Args: content (`str`): The code to clean @@ -165,7 +165,7 @@ def keep_doc_examples_only(content: str) -> str: def get_all_tests() -> List[str]: """ Walks the `tests` folder to return a list of files/subfolders. This is used to split the tests to run when using - paralellism. The split is: + parallelism. The split is: - folders under `tests`: (`tokenization`, `pipelines`, etc) except the subfolder `models` is excluded. - folders under `tests/models`: `bert`, `gpt2`, etc. @@ -635,7 +635,7 @@ def get_tree_starting_at(module: str, edges: List[Tuple[str, str]]) -> List[Unio Args: module (`str`): The module that will be the root of the subtree we want. - eges (`List[Tuple[str, str]]`): The list of all edges of the tree. + edges (`List[Tuple[str, str]]`): The list of all edges of the tree. Returns: `List[Union[str, List[str]]]`: The tree to print in the following format: [module, [list of edges @@ -663,7 +663,7 @@ def print_tree_deps_of(module, all_edges=None): Args: module (`str`): The module that will be the root of the subtree we want. - all_eges (`List[Tuple[str, str]]`, *optional*): + all_edges (`List[Tuple[str, str]]`, *optional*): The list of all edges of the tree. Will be set to `create_reverse_dependency_tree()` if not passed. 
""" if all_edges is None: @@ -706,7 +706,7 @@ def init_test_examples_dependencies() -> Tuple[Dict[str, List[str]], List[str]]: for framework in ["flax", "pytorch", "tensorflow"]: test_files = list((PATH_TO_EXAMPLES / framework).glob("test_*.py")) all_examples.extend(test_files) - # Remove the files at the root of examples/framework since they are not proper examples (they are eith utils + # Remove the files at the root of examples/framework since they are not proper examples (they are either utils # or example test files). examples = [ f for f in (PATH_TO_EXAMPLES / framework).glob("**/*.py") if f.parent != PATH_TO_EXAMPLES / framework