From f040c27d4c017a37ea8f7270533f265c56de6f7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= <46008593+tolgacangoz@users.noreply.github.com> Date: Mon, 24 Jun 2024 20:07:22 +0300 Subject: [PATCH] Errata - Fix typos and improve style (#8571) * Fix typos * Fix typos & up style * chore: Update numbers --------- Co-authored-by: Sayak Paul --- PHILOSOPHY.md | 12 +- README.md | 4 +- docs/source/en/api/pipelines/kandinsky3.md | 2 +- docs/source/en/conceptual/philosophy.md | 12 +- docs/source/ko/conceptual/philosophy.md | 4 +- docs/source/ko/optimization/habana.md | 2 +- examples/community/README.md | 386 ++++++++---------- examples/community/lpw_stable_diffusion_xl.py | 2 +- examples/dreambooth/README_sdxl.md | 2 +- .../instructpix2pix_lora/README.md | 2 +- .../research_projects/intel_opts/README.md | 2 +- .../multi_subject_dreambooth/README.md | 2 +- examples/vqgan/README.md | 6 +- .../pipeline_consistency_models.py | 2 +- tests/schedulers/test_scheduler_edm_euler.py | 4 +- 15 files changed, 210 insertions(+), 234 deletions(-) diff --git a/PHILOSOPHY.md b/PHILOSOPHY.md index a5db5cd7c7..9e25b4cc6d 100644 --- a/PHILOSOPHY.md +++ b/PHILOSOPHY.md @@ -63,14 +63,14 @@ Let's walk through more detailed design decisions for each class. Pipelines are designed to be easy to use (therefore do not follow [*Simple over easy*](#simple-over-easy) 100%), are not feature complete, and should loosely be seen as examples of how to use [models](#models) and [schedulers](#schedulers) for inference. The following design principles are followed: -- Pipelines follow the single-file policy. All pipelines can be found in individual directories under src/diffusers/pipelines. One pipeline folder corresponds to one diffusion paper/project/release. Multiple pipeline files can be gathered in one pipeline folder, as itโ€™s done for [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion). If pipelines share similar functionality, one can make use of the [#Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251). +- Pipelines follow the single-file policy. All pipelines can be found in individual directories under src/diffusers/pipelines. One pipeline folder corresponds to one diffusion paper/project/release. Multiple pipeline files can be gathered in one pipeline folder, as itโ€™s done for [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion). If pipelines share similar functionality, one can make use of the [# Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251). - Pipelines all inherit from [`DiffusionPipeline`]. - Every pipeline consists of different model and scheduler components, that are documented in the [`model_index.json` file](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), are accessible under the same name as attributes of the pipeline and can be shared between pipelines with [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) function. 
- Every pipeline should be loadable via the [`DiffusionPipeline.from_pretrained`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained) function. - Pipelines should be used **only** for inference. - Pipelines should be very readable, self-explanatory, and easy to tweak. - Pipelines should be designed to build on top of each other and be easy to integrate into higher-level APIs. -- Pipelines are **not** intended to be feature-complete user interfaces. For future complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner). +- Pipelines are **not** intended to be feature-complete user interfaces. For feature-complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner). - Every pipeline should have one and only one way to run it via a `__call__` method. The naming of the `__call__` arguments should be shared across all pipelines. - Pipelines should be named after the task they are intended to solve. - In almost all cases, novel diffusion pipelines shall be implemented in a new pipeline folder/file. @@ -81,7 +81,7 @@ Models are designed as configurable toolboxes that are natural extensions of [Py The following design principles are followed: - Models correspond to **a type of model architecture**. *E.g.* the [`UNet2DConditionModel`] class is used for all UNet variations that expect 2D image inputs and are conditioned on some context. -- All models can be found in [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and every model architecture shall be defined in its file, e.g. [`unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py), [`transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py), etc... +- All models can be found in [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and every model architecture shall be defined in its file, e.g. [`unets/unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unets/unet_2d_condition.py), [`transformers/transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/transformer_2d.py), etc... - Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modeling files and shows that models do not really follow the single-file policy. - Models intend to expose complexity, just like PyTorch's `Module` class, and give clear error messages. - Models all inherit from `ModelMixin` and `ConfigMixin`. 
@@ -90,7 +90,7 @@ The following design principles are followed: - To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different. - Models should be designed to be easily extendable to future changes. This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work. - The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and -readable long-term, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). +readable long-term, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unets/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). ### Schedulers @@ -100,11 +100,11 @@ The following design principles are followed: - All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers). - Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained. - One scheduler Python file corresponds to one scheduler algorithm (as might be defined in a paper). -- If schedulers share similar functionalities, we can make use of the `#Copied from` mechanism. +- If schedulers share similar functionalities, we can make use of the `# Copied from` mechanism. - Schedulers all inherit from `SchedulerMixin` and `ConfigMixin`. - Schedulers can be easily swapped out with the [`ConfigMixin.from_config`](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) method as explained in detail [here](./docs/source/en/using-diffusers/schedulers.md). - Every scheduler has to have a `set_num_inference_steps`, and a `step` function. `set_num_inference_steps(...)` has to be called before every denoising process, *i.e.* before `step(...)` is called. - Every scheduler exposes the timesteps to be "looped over" via a `timesteps` attribute, which is an array of timesteps the model will be called upon. - The `step(...)` function takes a predicted model output and the "current" sample (x_t) and returns the "previous", slightly more denoised sample (x_t-1). - Given the complexity of diffusion schedulers, the `step` function does not expose all the complexity and can be a bit of a "black box". -- In almost all cases, novel schedulers shall be implemented in a new scheduling file. +- In almost all cases, novel schedulers shall be implemented in a new scheduling file. 
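The loop below is a minimal sketch of the scheduler contract described above: the scheduler exposes the `timesteps` to be "looped over", and `step(...)` maps the predicted model output and the current sample x_t to the previous sample x_t-1. It assumes the `google/ddpm-cat-256` checkpoint as an example model, and uses `set_timesteps` (the name of the "set number of inference steps" call in current releases); treat it as an illustration of the interface, not a reference implementation.

```python
import torch
from diffusers import DDPMScheduler, UNet2DModel

# Unconditional 256x256 DDPM UNet; any model/scheduler pair exposing this interface works the same way.
model = UNet2DModel.from_pretrained("google/ddpm-cat-256")
scheduler = DDPMScheduler()  # in practice, load the scheduler config shipped with the checkpoint

scheduler.set_timesteps(50)  # must be called before the denoising loop

sample = torch.randn(1, 3, model.config.sample_size, model.config.sample_size)
for t in scheduler.timesteps:  # the timesteps the model will be called upon
    with torch.no_grad():
        noise_pred = model(sample, t).sample  # predicted model output for x_t
    sample = scheduler.step(noise_pred, t, sample).prev_sample  # x_t -> x_(t-1)
```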
\ No newline at end of file diff --git a/README.md b/README.md index a476df71e4..013f306bbb 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi ## Quickstart -Generating outputs is super easy with ๐Ÿค— Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 25.000+ checkpoints): +Generating outputs is super easy with ๐Ÿค— Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 27.000+ checkpoints): ```python from diffusers import DiffusionPipeline @@ -209,7 +209,7 @@ Also, say ๐Ÿ‘‹ in our public Discord channel 77 ) . Running this sequence through the model will result in indexing errors`. Do not worry, it is normal. +If you see `Token indices sequence length is longer than the specified maximum sequence length for this model ( *** > 77 ) . Running this sequence through the model will result in indexing errors`. Do not worry, it is normal. ### Speech to Image @@ -587,7 +579,6 @@ diffuser_pipeline = DiffusionPipeline.from_pretrained( custom_pipeline="speech_to_image_diffusion", speech_model=model, speech_processor=processor, - torch_dtype=torch.float16, ) @@ -647,7 +638,6 @@ import torch pipe = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", custom_pipeline="wildcard_stable_diffusion", - torch_dtype=torch.float16, ) prompt = "__animal__ sitting on a __object__ wearing a __clothing__" @@ -707,7 +697,6 @@ for i in range(args.num_images): images.append(th.from_numpy(np.array(image)).permute(2, 0, 1) / 255.) grid = tvu.make_grid(th.stack(images, dim=0), nrow=4, padding=0) tvu.save_image(grid, f'{prompt}_{args.weights}' + '.png') - ``` ### Imagic Stable Diffusion @@ -721,13 +710,14 @@ from io import BytesIO import torch import os from diffusers import DiffusionPipeline, DDIMScheduler + has_cuda = torch.cuda.is_available() device = torch.device('cpu' if not has_cuda else 'cuda') pipe = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", - safety_checker=None, + safety_checker=None, custom_pipeline="imagic_stable_diffusion", - scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False) + scheduler=DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False) ).to(device) generator = torch.Generator("cuda").manual_seed(0) seed = 0 @@ -837,7 +827,7 @@ image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, ### Multilingual Stable Diffusion Pipeline -The following code can generate an images from texts in different languages using the pre-trained [mBART-50 many-to-one multilingual machine translation model](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) and Stable Diffusion. +The following code can generate images from texts in different languages using the pre-trained [mBART-50 many-to-one multilingual machine translation model](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) and Stable Diffusion. 
```python from PIL import Image @@ -881,7 +871,6 @@ diffuser_pipeline = DiffusionPipeline.from_pretrained( detection_pipeline=language_detection_pipeline, translation_model=trans_model, translation_tokenizer=trans_tokenizer, - torch_dtype=torch.float16, ) @@ -905,9 +894,9 @@ This example produces the following images: ### GlueGen Stable Diffusion Pipeline -GlueGen is a minimal adapter that allow alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available english Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours. +GlueGen is a minimal adapter that allows alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available english Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours. -Make sure you downloaded `gluenet_French_clip_overnorm_over3_noln.ckpt` for French (there are also pre-trained weights for Chinese, Italian, Japanese, Spanish or train your own) at [GlueGen's official repo](https://github.com/salesforce/GlueGen/tree/main) +Make sure you downloaded `gluenet_French_clip_overnorm_over3_noln.ckpt` for French (there are also pre-trained weights for Chinese, Italian, Japanese, Spanish or train your own) at [GlueGen's official repo](https://github.com/salesforce/GlueGen/tree/main). ```python from PIL import Image @@ -974,7 +963,6 @@ mask_image = PIL.Image.open(mask_path).convert("RGB").resize((512, 512)) pipe = DiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-inpainting", custom_pipeline="img2img_inpainting", - torch_dtype=torch.float16 ) pipe = pipe.to("cuda") @@ -1019,13 +1007,13 @@ image = pipe(image=image, text=text, prompt=prompt).images[0] ### Bit Diffusion -Based , this is used for diffusion on discrete data - eg, discreate image data, DNA sequence data. An unconditional discreate image can be generated like this: +Based , this is used for diffusion on discrete data - eg, discrete image data, DNA sequence data. An unconditional discrete image can be generated like this: ```python from diffusers import DiffusionPipeline + pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="bit_diffusion") image = pipe().images[0] - ``` ### Stable Diffusion with K Diffusion @@ -1091,37 +1079,36 @@ image = pipe(prompt, generator=generator, num_inference_steps=50).images[0] ### Checkpoint Merger Pipeline -Based on the AUTOMATIC1111/webui for checkpoint merging. This is a custom pipeline that merges upto 3 pretrained model checkpoints as long as they are in the HuggingFace model_index.json format. +Based on the AUTOMATIC1111/webui for checkpoint merging. This is a custom pipeline that merges up to 3 pretrained model checkpoints as long as they are in the HuggingFace model_index.json format. -The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. Expect at least 13GB RAM Usage on Kaggle GPU kernels and -on colab you might run out of the 12GB memory even while merging two checkpoints. +The checkpoint merging is currently memory intensive as it modifies the weights of a DiffusionPipeline object in place. 
Expect at least 13GB RAM usage on Kaggle GPU kernels and +on Colab you might run out of the 12GB memory even while merging two checkpoints. Usage:- ```python from diffusers import DiffusionPipeline -#Return a CheckpointMergerPipeline class that allows you to merge checkpoints. -#The checkpoint passed here is ignored. But still pass one of the checkpoints you plan to -#merge for convenience +# Return a CheckpointMergerPipeline class that allows you to merge checkpoints. +# The checkpoint passed here is ignored. But still pass one of the checkpoints you plan to +# merge for convenience pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="checkpoint_merger") -#There are multiple possible scenarios: -#The pipeline with the merged checkpoints is returned in all the scenarios +# There are multiple possible scenarios: +# The pipeline with the merged checkpoints is returned in all the scenarios -#Compatible checkpoints a.k.a matched model_index.json files. Ignores the meta attributes in model_index.json during comparison.( attrs with _ as prefix ) -merged_pipe = pipe.merge(["CompVis/stable-diffusion-v1-4","CompVis/stable-diffusion-v1-2"], interp = "sigmoid", alpha = 0.4) +# Compatible checkpoints a.k.a matched model_index.json files. Ignores the meta attributes in model_index.json during comparison.( attrs with _ as prefix ) +merged_pipe = pipe.merge(["CompVis/stable-diffusion-v1-4"," CompVis/stable-diffusion-v1-2"], interp="sigmoid", alpha=0.4) -#Incompatible checkpoints in model_index.json but merge might be possible. Use force = True to ignore model_index.json compatibility -merged_pipe_1 = pipe.merge(["CompVis/stable-diffusion-v1-4","hakurei/waifu-diffusion"], force = True, interp = "sigmoid", alpha = 0.4) +# Incompatible checkpoints in model_index.json but merge might be possible. Use force=True to ignore model_index.json compatibility +merged_pipe_1 = pipe.merge(["CompVis/stable-diffusion-v1-4", "hakurei/waifu-diffusion"], force=True, interp="sigmoid", alpha=0.4) -#Three checkpoint merging. Only "add_difference" method actually works on all three checkpoints. Using any other options will ignore the 3rd checkpoint. -merged_pipe_2 = pipe.merge(["CompVis/stable-diffusion-v1-4","hakurei/waifu-diffusion","prompthero/openjourney"], force = True, interp = "add_difference", alpha = 0.4) +# Three checkpoint merging. Only "add_difference" method actually works on all three checkpoints. Using any other options will ignore the 3rd checkpoint. +merged_pipe_2 = pipe.merge(["CompVis/stable-diffusion-v1-4", "hakurei/waifu-diffusion", "prompthero/openjourney"], force=True, interp="add_difference", alpha=0.4) prompt = "An astronaut riding a horse on Mars" image = merged_pipe(prompt).images[0] - ``` Some examples along with the merge details: @@ -1132,7 +1119,7 @@ Some examples along with the merge details: 2. "hakurei/waifu-diffusion" + "prompthero/openjourney" ; Inverse Sigmoid interpolation; alpha = 0.8 -![Stable plus Waifu Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/waifu_openjourney_inv_sig_0.8.png) +![Waifu plus openjourney Sigmoid 0.8](https://huggingface.co/datasets/NagaSaiAbhinay/CheckpointMergerSamples/resolve/main/waifu_openjourney_inv_sig_0.8.png) 3. 
"CompVis/stable-diffusion-v1-4" + "hakurei/waifu-diffusion" + "prompthero/openjourney"; Add Difference interpolation; alpha = 0.5 @@ -1197,16 +1184,16 @@ from PIL import Image pipe = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", custom_pipeline="magic_mix", - scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"), + scheduler=DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"), ).to('cuda') img = Image.open('phone.jpg') mix_img = pipe( img, - prompt = 'bed', - kmin = 0.3, - kmax = 0.5, - mix_factor = 0.5, + prompt='bed', + kmin=0.3, + kmax=0.5, + mix_factor=0.5, ) mix_img.save('phone_bed_mix.jpg') ``` @@ -1227,8 +1214,8 @@ For more example generations check out this [demo notebook](https://github.com/d ### Stable UnCLIP -UnCLIPPipeline("kakaobrain/karlo-v1-alpha") provide a prior model that can generate clip image embedding from text. -StableDiffusionImageVariationPipeline("lambdalabs/sd-image-variations-diffusers") provide a decoder model than can generate images from clip image embedding. +UnCLIPPipeline("kakaobrain/karlo-v1-alpha") provides a prior model that can generate clip image embedding from text. +StableDiffusionImageVariationPipeline("lambdalabs/sd-image-variations-diffusers") provides a decoder model than can generate images from clip image embedding. ```python import torch @@ -1269,7 +1256,7 @@ image.save("./shiba-inu.jpg") print(pipeline.decoder_pipe.__class__) # -# this pipeline only use prior module in "kakaobrain/karlo-v1-alpha" +# this pipeline only uses prior module in "kakaobrain/karlo-v1-alpha" # It is used to convert clip text embedding to clip image embedding. print(pipeline) # StableUnCLIPPipeline { @@ -1329,10 +1316,10 @@ pipe.to(device) start_prompt = "A photograph of an adult lion" end_prompt = "A photograph of a lion cub" -#For best results keep the prompts close in length to each other. Of course, feel free to try out with differing lengths. +# For best results keep the prompts close in length to each other. Of course, feel free to try out with differing lengths. generator = torch.Generator(device=device).manual_seed(42) -output = pipe(start_prompt, end_prompt, steps = 6, generator = generator, enable_sequential_cpu_offload=False) +output = pipe(start_prompt, end_prompt, steps=6, generator=generator, enable_sequential_cpu_offload=False) for i,image in enumerate(output.images): img.save('result%s.jpg' % i) @@ -1367,10 +1354,10 @@ pipe = DiffusionPipeline.from_pretrained( pipe.to(device) images = [Image.open('./starry_night.jpg'), Image.open('./flowers.jpg')] -#For best results keep the prompts close in length to each other. Of course, feel free to try out with differing lengths. +# For best results keep the prompts close in length to each other. Of course, feel free to try out with differing lengths. 
generator = torch.Generator(device=device).manual_seed(42) -output = pipe(image = images ,steps = 6, generator = generator) +output = pipe(image=images, steps=6, generator=generator) for i,image in enumerate(output.images): image.save('starry_to_flowers_%s.jpg' % i) @@ -1392,7 +1379,7 @@ The resulting images in order:- ### DDIM Noise Comparative Analysis Pipeline -#### **Researchย question: What visual concepts do the diffusion models learn from each noise level during training?** +#### **Research question: What visual concepts do the diffusion models learn from each noise level during training?** The [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227) paper proposed an approach to answer the above question, which is their second contribution. The approach consists of the following steps: @@ -1409,7 +1396,7 @@ import torch from PIL import Image import numpy as np -image_path = "path/to/your/image" # images from CelebA-HQ might be better +image_path = "path/to/your/image" # images from CelebA-HQ might be better image_pil = Image.open(image_path) image_name = image_path.split("/")[-1].split(".")[0] @@ -1448,6 +1435,7 @@ import torch from diffusers import DiffusionPipeline from PIL import Image from transformers import CLIPFeatureExtractor, CLIPModel + feature_extractor = CLIPFeatureExtractor.from_pretrained( "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" ) @@ -1622,6 +1610,7 @@ import requests import torch from io import BytesIO from diffusers import StableDiffusionPipeline, RePaintScheduler + def download_image(url): response = requests.get(url) return PIL.Image.open(BytesIO(response.content)).convert("RGB") @@ -1679,7 +1668,7 @@ image.save('tensorrt_img2img_new_zealand_hills.png') ``` ### Stable Diffusion BoxDiff -BoxDiff is a training-free method for controlled generation with bounding box coordinates. It shoud work with any Stable Diffusion model. Below shows an example with `stable-diffusion-2-1-base`. +BoxDiff is a training-free method for controlled generation with bounding box coordinates. It should work with any Stable Diffusion model. Below shows an example with `stable-diffusion-2-1-base`. ```py import torch from PIL import Image, ImageDraw @@ -1839,13 +1828,13 @@ Output Image ### Stable Diffusion on IPEX -This diffusion pipeline aims to accelarate the inference of Stable-Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch). +This diffusion pipeline aims to accelerate the inference of Stable-Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch). To use this pipeline, you need to: 1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch) -**Note:** For each PyTorch release, there is a corresponding release of the IPEX. Here is the mapping relationship. It is recommended to install Pytorch/IPEX2.0 to get the best performance. +**Note:** For each PyTorch release, there is a corresponding release of the IPEX. Here is the mapping relationship. It is recommended to install PyTorch/IPEX2.0 to get the best performance. |PyTorch Version|IPEX Version| |--|--| @@ -1864,26 +1853,26 @@ python -m pip install intel_extension_for_pytorch python -m pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu ``` -2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX accelaration. Supported inference datatypes are Float32 and BFloat16. +2. 
After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16. **Note:** The setting of generated image height/width for `prepare_for_ipex()` should be same as the setting of pipeline inference. ```python pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex") # For Float32 -pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) #value of image height/width should be consistent with the pipeline inference +pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) # value of image height/width should be consistent with the pipeline inference # For BFloat16 -pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) #value of image height/width should be consistent with the pipeline inference +pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) # value of image height/width should be consistent with the pipeline inference ``` Then you can use the ipex pipeline in a similar way to the default stable diffusion pipeline. ```python # For Float32 -image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()' +image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] # value of image height/width should be consistent with 'prepare_for_ipex()' # For BFloat16 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16): - image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()' + image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] # value of image height/width should be consistent with 'prepare_for_ipex()' ``` The following code compares the performance of the original stable diffusion pipeline with the ipex-optimized pipeline. @@ -1901,7 +1890,7 @@ def elapsed_time(pipeline, nb_pass=3, num_inference_steps=20): # warmup for _ in range(2): images = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images - #time evaluation + # time evaluation start = time.time() for _ in range(nb_pass): pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512) @@ -1922,7 +1911,7 @@ with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16): latency = elapsed_time(pipe) print("Latency of StableDiffusionIPEXPipeline--bf16", latency) latency = elapsed_time(pipe2) - print("Latency of StableDiffusionPipeline--bf16",latency) + print("Latency of StableDiffusionPipeline--bf16", latency) ############## fp32 inference performance ############### @@ -1937,13 +1926,12 @@ pipe4 = StableDiffusionPipeline.from_pretrained(model_id) latency = elapsed_time(pipe3) print("Latency of StableDiffusionIPEXPipeline--fp32", latency) latency = elapsed_time(pipe4) -print("Latency of StableDiffusionPipeline--fp32",latency) - +print("Latency of StableDiffusionPipeline--fp32", latency) ``` ### Stable Diffusion XL on IPEX -This diffusion pipeline aims to accelarate the inference of Stable-Diffusion XL on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch). +This diffusion pipeline aims to accelerate the inference of Stable-Diffusion XL on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch). 
To use this pipeline, you need to: @@ -1968,7 +1956,7 @@ python -m pip install intel_extension_for_pytorch python -m pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu ``` -2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX accelaration. Supported inference datatypes are Float32 and BFloat16. +2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16. **Note:** The values of `height` and `width` used during preparation with `prepare_for_ipex()` should be the same when running inference with the prepared pipeline. @@ -2011,7 +1999,7 @@ def elapsed_time(pipeline, nb_pass=3, num_inference_steps=1): # warmup for _ in range(2): images = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=0.0).images - #time evaluation + # time evaluation start = time.time() for _ in range(nb_pass): pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=0.0) @@ -2047,8 +2035,7 @@ pipe4 = StableDiffusionXLPipeline.from_pretrained(model_id, low_cpu_mem_usage=Tr latency = elapsed_time(pipe3, num_inference_steps=steps) print("Latency of StableDiffusionXLPipelineIpex--fp32", latency, "s for total", steps, "steps") latency = elapsed_time(pipe4, num_inference_steps=steps) -print("Latency of StableDiffusionXLPipeline--fp32",latency, "s for total", steps, "steps") - +print("Latency of StableDiffusionXLPipeline--fp32", latency, "s for total", steps, "steps") ``` ### CLIP Guided Images Mixing With Stable Diffusion @@ -2061,7 +2048,7 @@ This approach is using (optional) CoCa model to avoid writing image description. ### Stable Diffusion XL Long Weighted Prompt Pipeline -This SDXL pipeline support unlimited length prompt and negative prompt, compatible with A1111 prompt weighted style. +This SDXL pipeline supports unlimited length prompt and negative prompt, compatible with A1111 prompt weighted style. You can provide both `prompt` and `prompt_2`. If only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline. 
@@ -2089,31 +2076,31 @@ pipe.to("cuda") t2i_images = pipe( prompt=prompt, negative_prompt=neg_prompt, -).images # alternatively, you can call the .text2img() function +).images # alternatively, you can call the .text2img() function # img2img -input_image = load_image("/path/to/local/image.png") # or URL to your input image +input_image = load_image("/path/to/local/image.png") # or URL to your input image i2i_images = pipe.img2img( prompt=prompt, negative_prompt=neg_prompt, image=input_image, - strength=0.8, # higher strength will result in more variation compared to original image + strength=0.8, # higher strength will result in more variation compared to original image ).images # inpaint -input_mask = load_image("/path/to/local/mask.png") # or URL to your input inpainting mask +input_mask = load_image("/path/to/local/mask.png") # or URL to your input inpainting mask inpaint_images = pipe.inpaint( prompt="photo of a cute (black) cat running on the grass" * 20, negative_prompt=neg_prompt, image=input_image, mask=input_mask, - strength=0.6, # higher strength will result in more variation compared to original image + strength=0.6, # higher strength will result in more variation compared to original image ).images pipe.to("cpu") torch.cuda.empty_cache() -from IPython.display import display # assuming you are using this code in a notebook +from IPython.display import display # assuming you are using this code in a notebook display(t2i_images[0]) display(i2i_images[0]) display(inpaint_images[0]) @@ -2153,9 +2140,9 @@ coca_model = open_clip.create_model('coca_ViT-L-14', pretrained='laion2B-s13B-b9 coca_model.dtype = torch.float16 coca_transform = open_clip.image_transform( coca_model.visual.image_size, - is_train = False, - mean = getattr(coca_model.visual, 'image_mean', None), - std = getattr(coca_model.visual, 'image_std', None), + is_train=False, + mean=getattr(coca_model.visual, 'image_mean', None), + std=getattr(coca_model.visual, 'image_std', None), ) coca_tokenizer = SimpleTokenizer() @@ -2207,7 +2194,7 @@ This pipeline uses the Mixture. 
Refer to the [Mixture](https://arxiv.org/abs/230 ```python from diffusers import LMSDiscreteScheduler, DiffusionPipeline -# Creater scheduler and model (similar to StableDiffusionPipeline) +# Create scheduler and model (similar to StableDiffusionPipeline) scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_tiling") pipeline.to("cuda") @@ -2248,7 +2235,6 @@ from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline # Use the PNDMScheduler scheduler here instead scheduler = PNDMScheduler.from_pretrained("stabilityai/stable-diffusion-2-inpainting", subfolder="scheduler") - pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting", custom_pipeline="stable_diffusion_tensorrt_inpaint", variant='fp16', @@ -2287,7 +2273,7 @@ from diffusers.pipelines.pipeline_utils import Image2ImageRegion, Text2ImageRegi # Load and preprocess guide image iic_image = preprocess_image(Image.open("input_image.png").convert("RGB")) -# Creater scheduler and model (similar to StableDiffusionPipeline) +# Create scheduler and model (similar to StableDiffusionPipeline) scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler).to("cuda:0", custom_pipeline="mixture_canvas") pipeline.to("cuda") @@ -2298,7 +2284,7 @@ output = pipeline( canvas_width=352, regions=[ Text2ImageRegion(0, 800, 0, 352, guidance_scale=8, - prompt=f"best quality, masterpiece, WLOP, sakimichan, art contest winner on pixiv, 8K, intricate details, wet effects, rain drops, ethereal, mysterious, futuristic, UHD, HDR, cinematic lighting, in a beautiful forest, rainy day, award winning, trending on artstation, beautiful confident cheerful young woman, wearing a futuristic sleeveless dress, ultra beautiful detailed eyes, hyper-detailed face, complex, perfect, model,ย  textured, chiaroscuro, professional make-up, realistic, figure in frame, "), + prompt=f"best quality, masterpiece, WLOP, sakimichan, art contest winner on pixiv, 8K, intricate details, wet effects, rain drops, ethereal, mysterious, futuristic, UHD, HDR, cinematic lighting, in a beautiful forest, rainy day, award winning, trending on artstation, beautiful confident cheerful young woman, wearing a futuristic sleeveless dress, ultra beautiful detailed eyes, hyper-detailed face, complex, perfect, model, textured, chiaroscuro, professional make-up, realistic, figure in frame, "), Image2ImageRegion(352-800, 352, 0, 352, reference_image=iic_image, strength=1.0), ], num_inference_steps=100, @@ -2317,22 +2303,19 @@ It is a simple and minimalist diffusion model. The following code shows how to use the IADB pipeline to generate images using a pretrained celebahq-256 model. 
```python - pipeline_iadb = DiffusionPipeline.from_pretrained("thomasc4/iadb-celebahq-256", custom_pipeline='iadb') pipeline_iadb = pipeline_iadb.to('cuda') -output = pipeline_iadb(batch_size=4,num_inference_steps=128) +output = pipeline_iadb(batch_size=4, num_inference_steps=128) for i in range(len(output[0])): plt.imshow(output[0][i]) plt.show() - ``` Sampling with the IADB formulation is easy, and can be done in a few lines (the pipeline already implements it): ```python - def sample_iadb(model, x0, nb_step): x_alpha = x0 for t in range(nb_step): @@ -2343,13 +2326,11 @@ def sample_iadb(model, x0, nb_step): x_alpha = x_alpha + (alpha_next-alpha)*d return x_alpha - ``` The training loop is also straightforward: ```python - # Training loop while True: x0 = sample_noise() @@ -2380,7 +2361,7 @@ import torch from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline from diffusers.utils import load_image -model_id = "kxic/zero123-165000" # zero123-105000, zero123-165000, zero123-xl +model_id = "kxic/zero123-165000" # zero123-105000, zero123-165000, zero123-xl pipe = Zero1to3StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) @@ -2401,9 +2382,9 @@ query_pose3 = [-55.0, 90.0, 0.0] # H, W = (256, 256) # H, W = (512, 512) # zero123 training is 256,256 # for batch input -input_image1 = load_image("./demo/4_blackarm.png") #load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/4_blackarm.png") -input_image2 = load_image("./demo/8_motor.png") #load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/8_motor.png") -input_image3 = load_image("./demo/7_london.png") #load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/7_london.png") +input_image1 = load_image("./demo/4_blackarm.png") # load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/4_blackarm.png") +input_image2 = load_image("./demo/8_motor.png") # load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/8_motor.png") +input_image3 = load_image("./demo/7_london.png") # load_image("https://cvlab-zero123-live.hf.space/file=/home/user/app/configs/7_london.png") input_images = [input_image1, input_image2, input_image3] query_poses = [query_pose1, query_pose2, query_pose3] @@ -2434,7 +2415,6 @@ input_images = pre_images images = pipe(input_imgs=input_images, prompt_imgs=input_images, poses=query_poses, height=H, width=W, guidance_scale=3.0, num_images_per_prompt=num_images_per_prompt, num_inference_steps=50).images - # save imgs log_dir = "logs" os.makedirs(log_dir, exist_ok=True) @@ -2444,12 +2424,11 @@ for obj in range(bs): for idx in range(num_images_per_prompt): images[i].save(os.path.join(log_dir,f"obj{obj}_{idx}.jpg")) i += 1 - ``` ### Stable Diffusion XL Reference -This pipeline uses the Reference . Refer to the [stable_diffusion_reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference). +This pipeline uses the Reference. Refer to the [stable_diffusion_reference](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#stable-diffusion-reference). 
```py import torch @@ -2457,6 +2436,7 @@ from PIL import Image from diffusers.utils import load_image from diffusers import DiffusionPipeline from diffusers.schedulers import UniPCMultistepScheduler + input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png") # pipe = DiffusionPipeline.from_pretrained( @@ -2529,7 +2509,7 @@ from diffusers import DiffusionPipeline # load the pipeline # make sure you're logged in with `huggingface-cli login` model_id_or_path = "runwayml/stable-diffusion-v1-5" -#can also be used with dreamlike-art/dreamlike-photoreal-2.0 +# can also be used with dreamlike-art/dreamlike-photoreal-2.0 pipe = DiffusionPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16, custom_pipeline="pipeline_fabric").to("cuda") # let's specify a prompt @@ -2560,7 +2540,7 @@ torch.manual_seed(0) image = pipe( prompt=prompt, negative_prompt=negative_prompt, - liked = liked, + liked=liked, num_inference_steps=20, ).images[0] @@ -2730,7 +2710,7 @@ pipe.to(torch_device="cuda", torch_dtype=torch.float32) ```py prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" -# Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps. +# Can be set to 1~50 steps. LCM supports fast inference even <= 4 steps. Recommend: 1~8 steps. num_inference_steps = 4 images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images @@ -2762,9 +2742,9 @@ prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" input_image=Image.open("myimg.png") -strength = 0.5 #strength =0 (no change) strength=1 (completely overwrite image) +strength = 0.5 # strength =0 (no change) strength=1 (completely overwrite image) -# Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps. +# Can be set to 1~50 steps. LCM supports fast inference even <= 4 steps. Recommend: 1~8 steps. num_inference_steps = 4 images = pipe(prompt=prompt, image=input_image, strength=strength, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images @@ -2808,7 +2788,7 @@ images = pipe( guidance_scale=8.0, embedding_interpolation_type="lerp", latent_interpolation_type="slerp", - process_batch_size=4, # Make it higher or lower based on your GPU memory + process_batch_size=4, # Make it higher or lower based on your GPU memory generator=torch.Generator(seed), ) @@ -2827,7 +2807,7 @@ Two checkpoints are available for use: - [ldm3d-pano](https://huggingface.co/Intel/ldm3d-pano). This checkpoint enables the generation of panoramic images and requires the StableDiffusionLDM3DPipeline pipeline to be used. - [ldm3d-sr](https://huggingface.co/Intel/ldm3d-sr). This checkpoint enables the upscaling of RGB and depth images. Can be used in cascade after the original LDM3D pipeline using the StableDiffusionUpscaleLDM3DPipeline pipeline. 
-'''py +```py from PIL import Image import os import torch @@ -2838,11 +2818,11 @@ from diffusers import StableDiffusionLDM3DPipeline, DiffusionPipeline pipe_ldm3d = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c") pipe_ldm3d.to("cuda") -prompt =f"A picture of some lemons on a table" +prompt = "A picture of some lemons on a table" output = pipe_ldm3d(prompt) rgb_image, depth_image = output.rgb, output.depth -rgb_image[0].save(f"lemons_ldm3d_rgb.jpg") -depth_image[0].save(f"lemons_ldm3d_depth.png") +rgb_image[0].save("lemons_ldm3d_rgb.jpg") +depth_image[0].save("lemons_ldm3d_depth.png") # Upscale the previous output to a resolution of (1024, 1024) @@ -2850,19 +2830,19 @@ pipe_ldm3d_upscale = DiffusionPipeline.from_pretrained("Intel/ldm3d-sr", custom_ pipe_ldm3d_upscale.to("cuda") -low_res_img = Image.open(f"lemons_ldm3d_rgb.jpg").convert("RGB") -low_res_depth = Image.open(f"lemons_ldm3d_depth.png").convert("L") +low_res_img = Image.open("lemons_ldm3d_rgb.jpg").convert("RGB") +low_res_depth = Image.open("lemons_ldm3d_depth.png").convert("L") outputs = pipe_ldm3d_upscale(prompt="high quality high resolution uhd 4k image", rgb=low_res_img, depth=low_res_depth, num_inference_steps=50, target_res=[1024, 1024]) -upscaled_rgb, upscaled_depth =outputs.rgb[0], outputs.depth[0] -upscaled_rgb.save(f"upscaled_lemons_rgb.png") -upscaled_depth.save(f"upscaled_lemons_depth.png") -''' +upscaled_rgb, upscaled_depth = outputs.rgb[0], outputs.depth[0] +upscaled_rgb.save("upscaled_lemons_rgb.png") +upscaled_depth.save("upscaled_lemons_depth.png") +``` ### ControlNet + T2I Adapter Pipeline -This pipelines combines both ControlNet and T2IAdapter into a single pipeline, where the forward pass is executed once. -It receives `control_image` and `adapter_image`, as well as `controlnet_conditioning_scale` and `adapter_conditioning_scale`, for the ControlNet and Adapter modules, respectively. Whenever `adapter_conditioning_scale = 0` or `controlnet_conditioning_scale = 0`, it will act as a full ControlNet module or as a full T2IAdapter module, respectively. +This pipeline combines both ControlNet and T2IAdapter into a single pipeline, where the forward pass is executed once. +It receives `control_image` and `adapter_image`, as well as `controlnet_conditioning_scale` and `adapter_conditioning_scale`, for the ControlNet and Adapter modules, respectively. Whenever `adapter_conditioning_scale=0` or `controlnet_conditioning_scale=0`, it will act as a full ControlNet module or as a full T2IAdapter module, respectively. ```py import cv2 @@ -2925,7 +2905,6 @@ images = pipe( adapter_conditioning_scale=strength, ).images images[0].save("controlnet_and_adapter.png") - ``` ### ControlNet + T2I Adapter + Inpainting Pipeline @@ -2996,12 +2975,11 @@ images = pipe( strength=0.7, ).images images[0].save("controlnet_and_adapter_inpaint.png") - ``` ### Regional Prompting Pipeline -This pipeline is a port of the [Regional Prompter extension](https://github.com/hako-mikan/sd-webui-regional-prompter) for [Stable Diffusion web UI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) to diffusers. +This pipeline is a port of the [Regional Prompter extension](https://github.com/hako-mikan/sd-webui-regional-prompter) for [Stable Diffusion web UI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) to `diffusers`. This code implements a pipeline for the Stable Diffusion model, enabling the division of the canvas into multiple regions, with different prompts applicable to each region. 
Users can specify regions in two ways: using `Cols` and `Rows` modes for grid-like divisions, or the `Prompt` mode for regions calculated based on prompts. ![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline1.png) @@ -3012,6 +2990,7 @@ This code implements a pipeline for the Stable Diffusion model, enabling the div ```py from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline + pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae=vae) rp_args = { @@ -3019,7 +2998,7 @@ rp_args = { "div": "1;1;1" } -prompt =""" +prompt = """ green hair twintail BREAK red blouse BREAK blue skirt @@ -3029,12 +3008,12 @@ images = pipe( prompt=prompt, negative_prompt=negative_prompt, guidance_scale=7.5, - height = 768, - width = 512, - num_inference_steps =20, - num_images_per_prompt = 1, - rp_args = rp_args - ).images + height=768, + width=512, + num_inference_steps=20, + num_images_per_prompt=1, + rp_args=rp_args + ).images time = time.strftime(r"%Y%m%d%H%M%S") i = 1 @@ -3059,19 +3038,19 @@ blue skirt ### 2-Dimentional division -The prompt consists of instructions separated by the term `BREAK` and is assigned to different regions of a two-dimensional space. The image is initially split in the main splitting direction, which in this case is rows, due to the presence of a single semicolon`;`, dividing the space into an upper and a lower section. Additional sub-splitting is then applied, indicated by commas. The upper row is split into ratios of `2:1:1`, while the lower row is split into a ratio of `4:6`. Rows themselves are split in a `1:2` ratio. According to the reference image, the blue sky is designated as the first region, green hair as the second, the bookshelf as the third, and so on, in a sequence based on their position from the top left. The terrarium is placed on the desk in the fourth region, and the orange dress and sofa are in the fifth region, conforming to their respective splits. +The prompt consists of instructions separated by the term `BREAK` and is assigned to different regions of a two-dimensional space. The image is initially split in the main splitting direction, which in this case is rows, due to the presence of a single semicolon `;`, dividing the space into an upper and a lower section. Additional sub-splitting is then applied, indicated by commas. The upper row is split into ratios of `2:1:1`, while the lower row is split into a ratio of `4:6`. Rows themselves are split in a `1:2` ratio. According to the reference image, the blue sky is designated as the first region, green hair as the second, the bookshelf as the third, and so on, in a sequence based on their position from the top left. The terrarium is placed on the desk in the fourth region, and the orange dress and sofa are in the fifth region, conforming to their respective splits. -``` +```py rp_args = { "mode":"rows", "div": "1,2,1,1;2,4,6" } -prompt =""" +prompt = """ blue sky BREAK green hair BREAK book shelf BREAK -terrarium on desk BREAK +terrarium on the desk BREAK orange dress and sofa """ ``` @@ -3080,10 +3059,10 @@ orange dress and sofa ### Prompt Mode -There are limitations to methods of specifying regions in advance. This is because specifying regions can be a hindrance when designating complex shapes or dynamic compositions. In the region specified by the prompt, the regions is determined after the image generation has begun. This allows us to accommodate compositions and complex regions. 
+There are limitations to methods of specifying regions in advance. This is because specifying regions can be a hindrance when designating complex shapes or dynamic compositions. In the region specified by the prompt, the region is determined after the image generation has begun. This allows us to accommodate compositions and complex regions. For further infomagen, see [here](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/main/prompt_en.md). -### syntax +### Syntax ``` baseprompt target1 target2 BREAK @@ -3105,14 +3084,14 @@ is also effective. In this example, masks are calculated for shirt, tie, skirt, and color prompts are specified only for those regions. -``` +```py rp_args = { - "mode":"prompt-ex", - "save_mask":True, + "mode": "prompt-ex", + "save_mask": True, "th": "0.4,0.6,0.6", } -prompt =""" +prompt = """ a girl in street with shirt, tie, skirt BREAK red, shirt BREAK green, tie BREAK @@ -3122,7 +3101,7 @@ blue , skirt ![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline3.png) -### threshold +### Threshold The threshold used to determine the mask created by the prompt. This can be set as many times as there are masks, as the range varies widely depending on the target prompt. If multiple regions are used, enter them separated by commas. For example, hair tends to be ambiguous and requires a small value, while face tends to be large and requires a small value. These should be ordered by BREAK. @@ -3141,7 +3120,7 @@ The difference is that in Prompt, duplicate regions are added, whereas in Prompt ### Accuracy -In the case of a 512 x 512 image, Attention mode reduces the size of the region to about 8 x 8 pixels deep in the U-Net, so that small regions get mixed up; Latent mode calculates 64*64, so that the region is exact. +In the case of a 512x512 image, Attention mode reduces the size of the region to about 8x8 pixels deep in the U-Net, so that small regions get mixed up; Latent mode calculates 64*64, so that the region is exact. ``` girl hair twintail frills,ribbons, dress, face BREAK @@ -3154,7 +3133,7 @@ When an image is generated, the generated mask is displayed. It is generated at ### Use common prompt -You can attach the prompt up to ADDCOMM to all prompts by separating it first with ADDCOMM. This is useful when you want to include elements common to all regions. For example, when generating pictures of three people with different appearances, it's necessary to include the instruction of 'three people' in all regions. It's also useful when inserting quality tags and other things."For example, if you write as follows: +You can attach the prompt up to ADDCOMM to all prompts by separating it first with ADDCOMM. This is useful when you want to include elements common to all regions. For example, when generating pictures of three people with different appearances, it's necessary to include the instruction of 'three people' in all regions. It's also useful when inserting quality tags and other things. "For example, if you write as follows: ``` best quality, 3persons in garden, ADDCOMM @@ -3177,24 +3156,24 @@ Negative prompts are equally effective across all regions, but it is possible to ### Parameters -To activate Regional Prompter, it is necessary to enter settings in rp_args. The items that can be set are as follows. rp_args is a dictionary type. +To activate Regional Prompter, it is necessary to enter settings in `rp_args`. The items that can be set are as follows. `rp_args` is a dictionary type. 
### Input Parameters Parameters are specified through the `rp_arg`(dictionary type). -``` +```py rp_args = { "mode":"rows", "div": "1;1;1" } -pipe(prompt =prompt, rp_args = rp_args) +pipe(prompt=prompt, rp_args=rp_args) ``` ### Required Parameters -- `mode`: Specifies the method for defining regions. Choose from `Cols`, `Rows`, `Prompt` or `Prompt-Ex`. This parameter is case-insensitive. +- `mode`: Specifies the method for defining regions. Choose from `Cols`, `Rows`, `Prompt`, or `Prompt-Ex`. This parameter is case-insensitive. - `divide`: Used in `Cols` and `Rows` modes. Details on how to specify this are provided under the respective `Cols` and `Rows` sections. - `th`: Used in `Prompt` mode. The method of specification is detailed under the `Prompt` section. @@ -3208,7 +3187,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur - Reference paper - ``` + ```bibtex @article{chung2022diffusion, title={Diffusion posterior sampling for general noisy inverse problems}, author={Chung, Hyungjin and Kim, Jeongsol and Mccann, Michael T and Klasky, Marc L and Ye, Jong Chul}, @@ -3220,7 +3199,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur - This pipeline allows zero-shot conditional sampling from the posterior distribution $p(x|y)$, given observation on $y$, unconditional generative model $p(x)$ and differentiable operator $y=f(x)$. - For example, $f(.)$ can be downsample operator, then $y$ is a downsampled image, and the pipeline becomes a super-resolution pipeline. -- To use this pipeline, you need to know your operator $f(.)$ and corrupted image $y$, and pass them during the call. For example, as in the main function of dps_pipeline.py, you need to first define the Gaussian blurring operator $f(.)$. The operator should be a callable nn.Module, with all the parameter gradient disabled: +- To use this pipeline, you need to know your operator $f(.)$ and corrupted image $y$, and pass them during the call. For example, as in the main function of `dps_pipeline.py`, you need to first define the Gaussian blurring operator $f(.)$. The operator should be a callable `nn.Module`, with all the parameter gradient disabled: ```python import torch.nn.functional as F @@ -3250,7 +3229,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur def weights_init(self): if self.blur_type == "gaussian": n = np.zeros((self.kernel_size, self.kernel_size)) - n[self.kernel_size // 2,self.kernel_size // 2] = 1 + n[self.kernel_size // 2, self.kernel_size // 2] = 1 k = scipy.ndimage.gaussian_filter(n, sigma=self.std) k = torch.from_numpy(k) self.k = k @@ -3280,7 +3259,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur self.conv.update_weights(self.kernel.type(torch.float32)) for param in self.parameters(): - param.requires_grad=False + param.requires_grad = False def forward(self, data, **kwargs): return self.conv(data) @@ -3317,7 +3296,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur - ![sample](https://github.com/tongdaxu/Images/assets/22267548/4d2a1216-08d1-4aeb-9ce3-7a2d87561d65) - Gaussian blurred image: - ![ddpm_generated_image](https://github.com/tongdaxu/Images/assets/22267548/65076258-344b-4ed8-b704-a04edaade8ae) - - You can download those image to run the example on your own. + - You can download those images to run the example on your own. - Next, we need to define a loss function used for diffusion posterior sample. 
For most of the cases, the RMSE is fine: @@ -3326,7 +3305,7 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur return torch.sqrt(torch.sum((yhat-y)**2)) ``` -- And next, as any other diffusion models, we need the score estimator and scheduler. As we are working with $256x256$ face images, we use ddmp-celebahq-256: +- And next, as any other diffusion models, we need the score estimator and scheduler. As we are working with $256x256$ face images, we use ddpm-celebahq-256: ```python # set up scheduler @@ -3343,20 +3322,20 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur # finally, the pipeline dpspipe = DPSPipeline(model, scheduler) image = dpspipe( - measurement = measurement, - operator = operator, - loss_fn = RMSELoss, - zeta = 1.0, + measurement=measurement, + operator=operator, + loss_fn=RMSELoss, + zeta=1.0, ).images[0] image.save("dps_generated_image.png") ``` -- The zeta is a hyperparameter that is in range of $[0,1]$. It need to be tuned for best effect. By setting zeta=1, you should be able to have the reconstructed result: +- The `zeta` is a hyperparameter that is in range of $[0,1]$. It needs to be tuned for best effect. By setting `zeta=1`, you should be able to have the reconstructed result: - Reconstructed image: - ![sample](https://github.com/tongdaxu/Images/assets/22267548/0ceb5575-d42e-4f0b-99c0-50e69c982209) - The reconstruction is perceptually similar to the source image, but different in details. -- In dps_pipeline.py, we also provide a super-resolution example, which should produce: +- In `dps_pipeline.py`, we also provide a super-resolution example, which should produce: - Downsampled image: - ![dps_mea](https://github.com/tongdaxu/Images/assets/22267548/ff6a33d6-26f0-42aa-88ce-f8a76ba45a13) - Reconstructed image: @@ -3368,9 +3347,8 @@ This pipeline combines AnimateDiff and ControlNet. Enjoy precise motion control ```py import torch -from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter -from diffusers.pipelines import DiffusionPipeline -from diffusers.schedulers import DPMSolverMultistepScheduler +from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter, DiffusionPipeline, DPMSolverMultistepScheduler +from diffusers.utils import export_to_gif from PIL import Image motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" @@ -3385,7 +3363,8 @@ pipe = DiffusionPipeline.from_pretrained( controlnet=controlnet, vae=vae, custom_pipeline="pipeline_animatediff_controlnet", -).to(device="cuda", dtype=torch.float16) + torch_dtype=torch.float16, +).to(device="cuda") pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( model_id, subfolder="scheduler", beta_schedule="linear", clip_sample=False, timestep_spacing="linspace", steps_offset=1 ) @@ -3406,7 +3385,6 @@ result = pipe( num_inference_steps=20, ).frames[0] -from diffusers.utils import export_to_gif export_to_gif(result.frames[0], "result.gif") ``` @@ -3431,9 +3409,8 @@ You can also use multiple controlnets at once! 
```python import torch -from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter -from diffusers.pipelines import DiffusionPipeline -from diffusers.schedulers import DPMSolverMultistepScheduler +from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter, DiffusionPipeline, DPMSolverMultistepScheduler +from diffusers.utils import export_to_gif from PIL import Image motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" @@ -3449,7 +3426,8 @@ pipe = DiffusionPipeline.from_pretrained( controlnet=[controlnet1, controlnet2], vae=vae, custom_pipeline="pipeline_animatediff_controlnet", -).to(device="cuda", dtype=torch.float16) + torch_dtype=torch.float16, +).to(device="cuda") pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1, beta_schedule="linear", ) @@ -3496,7 +3474,6 @@ result = pipe( num_inference_steps=20, ) -from diffusers.utils import export_to_gif export_to_gif(result.frames[0], "result.gif") ``` @@ -3625,7 +3602,6 @@ pipe.train_lora(prompt, image) output = pipe(prompt, image, mask_image, source_points, target_points) output_image = PIL.Image.fromarray(output) output_image.save("./output.png") - ``` ### Instaflow Pipeline @@ -3674,19 +3650,19 @@ This pipeline provides null-text inversion for editing real images. It enables n - Reference paper - ```@article{hertz2022prompt, - title={Prompt-to-prompt image editing with cross attention control}, - author={Hertz, Amir and Mokady, Ron and Tenenbaum, Jay and Aberman, Kfir and Pritch, Yael and Cohen-Or, Daniel}, - booktitle={arXiv preprint arXiv:2208.01626}, - year={2022} + ```bibtex + @article{hertz2022prompt, + title={Prompt-to-prompt image editing with cross attention control}, + author={Hertz, Amir and Mokady, Ron and Tenenbaum, Jay and Aberman, Kfir and Pritch, Yael and Cohen-Or, Daniel}, + booktitle={arXiv preprint arXiv:2208.01626}, + year={2022} ```} ```py -from diffusers.schedulers import DDIMScheduler +from diffusers import DDIMScheduler from examples.community.pipeline_null_text_inversion import NullTextPipeline import torch -# Load the pipeline device = "cuda" # Provide invert_prompt and the image for null-text optimization. invert_prompt = "A lying cat" @@ -3698,13 +3674,13 @@ prompt = "A lying cat" # or different if editing. 
prompt = "A lying dog" -#Float32 is essential to a well optimization +# Float32 is essential to a well optimization model_path = "runwayml/stable-diffusion-v1-5" scheduler = DDIMScheduler(num_train_timesteps=1000, beta_start=0.00085, beta_end=0.0120, beta_schedule="scaled_linear") -pipeline = NullTextPipeline.from_pretrained(model_path, scheduler = scheduler, torch_dtype=torch.float32).to(device) +pipeline = NullTextPipeline.from_pretrained(model_path, scheduler=scheduler, torch_dtype=torch.float32).to(device) -#Saves the inverted_latent to save time -inverted_latent, uncond = pipeline.invert(input_image, invert_prompt, num_inner_steps=10, early_stop_epsilon= 1e-5, num_inference_steps = steps) +# Saves the inverted_latent to save time +inverted_latent, uncond = pipeline.invert(input_image, invert_prompt, num_inner_steps=10, early_stop_epsilon=1e-5, num_inference_steps=steps) pipeline(prompt, uncond, inverted_latent, guidance_scale=7.5, num_inference_steps=steps).images[0].save(input_image+".output.jpg") ``` @@ -3761,7 +3737,7 @@ for frame in frames: controlnet = ControlNetModel.from_pretrained( "lllyasviel/sd-controlnet-canny").to('cuda') -# You can use any fintuned SD here +# You can use any finetuned SD here pipe = DiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", controlnet=controlnet, custom_pipeline='rerender_a_video').to('cuda') @@ -3803,7 +3779,7 @@ This pipeline is the implementation of [Style Aligned Image Generation via Share from typing import List import torch -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers import DiffusionPipeline from PIL import Image model_id = "a-r-r-o-w/dreamshaper-xl-turbo" @@ -3872,7 +3848,7 @@ output = pipe( image=image, prompt="A snail moving on the ground", strength=0.8, - latent_interpolation_method="slerp", # can be lerp, slerp, or your own callback + latent_interpolation_method="slerp", # can be lerp, slerp, or your own callback ) frames = output.frames[0] export_to_gif(frames, "animation.gif") @@ -3882,11 +3858,10 @@ export_to_gif(frames, "animation.gif") IP Adapter FaceID is an experimental IP Adapter model that uses image embeddings generated by `insightface`, so no image encoder needs to be loaded. You need to install `insightface` and all its requirements to use this model. -You must pass the image embedding tensor as `image_embeds` to the StableDiffusionPipeline instead of `ip_adapter_image`. +You must pass the image embedding tensor as `image_embeds` to the `DiffusionPipeline` instead of `ip_adapter_image`. You can find more results [here](https://github.com/huggingface/diffusers/pull/6276). ```py -import diffusers import torch from diffusers.utils import load_image import cv2 @@ -3916,7 +3891,7 @@ pipeline.load_ip_adapter_face_id("h94/IP-Adapter-FaceID", "ip-adapter-faceid_sd1 pipeline.to("cuda") generator = torch.Generator(device="cpu").manual_seed(42) -num_images=2 +num_images = 2 image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png") @@ -3939,13 +3914,13 @@ for i in range(num_images): ### InstantID Pipeline -InstantID is a new state-of-the-art tuning-free method to achieve ID-Preserving generation with only single image, supporting various downstream tasks. For any usgae question, please refer to the [official implementation](https://github.com/InstantID/InstantID). +InstantID is a new state-of-the-art tuning-free method to achieve ID-Preserving generation with only single image, supporting various downstream tasks. 
For any usage question, please refer to the [official implementation](https://github.com/InstantID/InstantID). ```py -# !pip install opencv-python transformers accelerate insightface +# !pip install diffusers opencv-python transformers accelerate insightface import diffusers from diffusers.utils import load_image -from diffusers.models import ControlNetModel +from diffusers import ControlNetModel import cv2 import torch @@ -3963,12 +3938,13 @@ app.prepare(ctx_id=0, det_size=(640, 640)) # prepare models under ./checkpoints # https://huggingface.co/InstantX/InstantID from huggingface_hub import hf_hub_download + hf_hub_download(repo_id="InstantX/InstantID", filename="ControlNetModel/config.json", local_dir="./checkpoints") hf_hub_download(repo_id="InstantX/InstantID", filename="ControlNetModel/diffusion_pytorch_model.safetensors", local_dir="./checkpoints") hf_hub_download(repo_id="InstantX/InstantID", filename="ip-adapter.bin", local_dir="./checkpoints") -face_adapter = f'./checkpoints/ip-adapter.bin' -controlnet_path = f'./checkpoints/ControlNetModel' +face_adapter = './checkpoints/ip-adapter.bin' +controlnet_path = './checkpoints/ControlNetModel' # load IdentityNet controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16) @@ -3979,7 +3955,7 @@ pipe = StableDiffusionXLInstantIDPipeline.from_pretrained( controlnet=controlnet, torch_dtype=torch.float16 ) -pipe.cuda() +pipe.to("cuda") # load adapter pipe.load_ip_adapter_instantid(face_adapter) @@ -4046,8 +4022,9 @@ import cv2 import torch import numpy as np -from diffusers import ControlNetModel,DDIMScheduler, DiffusionPipeline +from diffusers import ControlNetModel, DDIMScheduler, DiffusionPipeline import sys + gmflow_dir = "/path/to/gmflow" sys.path.insert(0, gmflow_dir) @@ -4075,7 +4052,7 @@ def video_to_frame(video_path: str, interval: int): input_video_path = 'https://github.com/williamyang1991/FRESCO/raw/main/data/car-turn.mp4' output_video_path = 'car.gif' -# You can use any fintuned SD here +# You can use any finetuned SD here model_path = 'SG161222/Realistic_Vision_V2.0' prompt = 'a red car turns in the winter' @@ -4120,14 +4097,13 @@ output_frames = pipe( output_frames[0].save(output_video_path, save_all=True, append_images=output_frames[1:], duration=100, loop=0) - ``` # Perturbed-Attention Guidance [Project](https://ku-cvlab.github.io/Perturbed-Attention-Guidance/) / [arXiv](https://arxiv.org/abs/2403.17377) / [GitHub](https://github.com/KU-CVLAB/Perturbed-Attention-Guidance) -This implementation is based on [Diffusers](https://huggingface.co/docs/diffusers/index). StableDiffusionPAGPipeline is a modification of StableDiffusionPipeline to support Perturbed-Attention Guidance (PAG). +This implementation is based on [Diffusers](https://huggingface.co/docs/diffusers/index). `StableDiffusionPAGPipeline` is a modification of `StableDiffusionPipeline` to support Perturbed-Attention Guidance (PAG). 
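Before the usage example, it may help to see what the perturbation itself does. In the selected self-attention layers (`pag_applied_layers_index`, e.g. `'m0'`), PAG replaces the softmax attention map with an identity map, so each query token attends only to itself, and then guides the sample away from that degraded prediction. The snippet below is only a conceptual sketch in plain PyTorch, not the attention processor this pipeline actually installs:

```py
import torch
import torch.nn.functional as F

def self_attention(q, k, v):
    # Standard scaled dot-product self-attention: softmax(QK^T / sqrt(d)) V
    scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5
    return F.softmax(scores, dim=-1) @ v

def perturbed_self_attention(q, k, v):
    # PAG's perturbation: the attention map becomes the identity matrix,
    # so every query token only "sees" its own value vector.
    return v

def pag_guidance(eps, eps_perturbed, pag_scale=5.0):
    # Push the prediction away from the perturbed branch:
    # eps_guided = eps + pag_scale * (eps - eps_perturbed)
    return eps + pag_scale * (eps - eps_perturbed)
```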
## Example Usage @@ -4147,14 +4123,14 @@ pipe = StableDiffusionPipeline.from_pretrained( torch_dtype=torch.float16 ) -device="cuda" +device = "cuda" pipe = pipe.to(device) pag_scale = 5.0 pag_applied_layers_index = ['m0'] batch_size = 4 -seed=10 +seed = 10 base_dir = "./results/" grid_dir = base_dir + "/pag" + str(pag_scale) + "/" @@ -4164,7 +4140,7 @@ if not os.path.exists(grid_dir): set_seed(seed) -latent_input = randn_tensor(shape=(batch_size,4,64,64),generator=None, device=device, dtype=torch.float16) +latent_input = randn_tensor(shape=(batch_size,4,64,64), generator=None, device=device, dtype=torch.float16) output_baseline = pipe( "", @@ -4196,6 +4172,6 @@ grid_image.save(grid_dir + "sample.png") ## PAG Parameters -pag_scale : gudiance scale of PAG (ex: 5.0) +`pag_scale` : guidance scale of PAG (ex: 5.0) -pag_applied_layers_index : index of the layer to apply perturbation (ex: ['m0']) +`pag_applied_layers_index` : index of the layer to apply perturbation (ex: ['m0']) \ No newline at end of file diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index bc8309d480..f1711f4efd 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -2,7 +2,7 @@ # A SDXL pipeline can take unlimited weighted prompt # # Author: Andrew Zhu -# Github: https://github.com/xhinker +# GitHub: https://github.com/xhinker # Medium: https://medium.com/@xhinker ## ----------------------------------------------------------- diff --git a/examples/dreambooth/README_sdxl.md b/examples/dreambooth/README_sdxl.md index 5929a02b85..7a42bf8fff 100644 --- a/examples/dreambooth/README_sdxl.md +++ b/examples/dreambooth/README_sdxl.md @@ -261,7 +261,7 @@ The authors found that by using DoRA, both the learning capacity and training st **Usage** 1. To use DoRA you need to upgrade the installation of `peft`: ```bash -pip install-U peft +pip install -U peft ``` 2. Enable DoRA training by adding this flag ```bash diff --git a/examples/research_projects/instructpix2pix_lora/README.md b/examples/research_projects/instructpix2pix_lora/README.md index 2fda1cff3f..cfcd98926c 100644 --- a/examples/research_projects/instructpix2pix_lora/README.md +++ b/examples/research_projects/instructpix2pix_lora/README.md @@ -30,7 +30,7 @@ accelerate launch finetune_instruct_pix2pix.py \ ## Inference After training the model and the lora weight of the model is stored in the ```$OUTPUT_DIR```. -```bash +```py # load the base model pipeline pipe_lora = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix") diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md index 1f3030f8d2..b2f75b3b0e 100644 --- a/examples/research_projects/intel_opts/README.md +++ b/examples/research_projects/intel_opts/README.md @@ -6,7 +6,7 @@ This aims to provide diffusers examples with Intel optimizations such as Bfloat1 ## Accelerating the fine-tuning for textual inversion -We accelereate the fine-tuning for textual inversion with Intel Extension for PyTorch. The [examples](textual_inversion) enable both single node and multi-node distributed training with Bfloat16 support on Intel Xeon Scalable Processor. +We accelerate the fine-tuning for textual inversion with Intel Extension for PyTorch. The [examples](textual_inversion) enable both single node and multi-node distributed training with Bfloat16 support on Intel Xeon Scalable Processor. 
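As a rough sketch of what that IPEX + Bfloat16 training path looks like (the real flags and training loop live in the linked textual_inversion example; the tiny model and optimizer below are placeholders, not the actual text encoder):

```py
import torch
import intel_extension_for_pytorch as ipex

# Placeholder module/optimizer standing in for the model being fine-tuned;
# see the textual_inversion example for the real setup.
model = torch.nn.Linear(768, 768)
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)

# IPEX rewrites the model (and optimizer state) with BFloat16-friendly kernels
# for Intel Xeon Scalable Processors.
model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16)

x = torch.randn(16, 768)
target = torch.randn(16, 768)

# Run the forward pass under CPU autocast in BFloat16, then compute the loss in FP32.
with torch.cpu.amp.autocast(dtype=torch.bfloat16):
    out = model(x)
loss = torch.nn.functional.mse_loss(out.float(), target)
loss.backward()
optimizer.step()
optimizer.zero_grad()
```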
## Accelerating the inference for Stable Diffusion using Bfloat16

diff --git a/examples/research_projects/multi_subject_dreambooth/README.md b/examples/research_projects/multi_subject_dreambooth/README.md
index cc6557ee9f..7c2e6f4009 100644
--- a/examples/research_projects/multi_subject_dreambooth/README.md
+++ b/examples/research_projects/multi_subject_dreambooth/README.md
@@ -323,7 +323,7 @@ accelerate launch train_dreambooth.py \
 
 ### Using DreamBooth for other pipelines than Stable Diffusion
 
-Altdiffusion also support dreambooth now, the runing comman is basically the same as above, all you need to do is replace the `MODEL_NAME` like this:
+AltDiffusion also supports DreamBooth now. The running command is basically the same as above; all you need to do is replace the `MODEL_NAME` like this:
 One can now simply change the `pretrained_model_name_or_path` to another architecture such as [`AltDiffusion`](https://huggingface.co/docs/diffusers/api/pipelines/alt_diffusion).
 
 ```
diff --git a/examples/vqgan/README.md b/examples/vqgan/README.md
index 0b0f3589ba..056bbcaf67 100644
--- a/examples/vqgan/README.md
+++ b/examples/vqgan/README.md
@@ -45,7 +45,7 @@ accelerate launch train_vqgan.py \
 ```
 An example training run is [here](https://wandb.ai/sayakpaul/vqgan-training/runs/0m5kzdfp) by @sayakpaul and a lower scale one [here](https://wandb.ai/dsbuddy27/vqgan-training/runs/eqd6xi4n?nw=nwuserisamu). The validation images can be obtained from [here](https://huggingface.co/datasets/diffusers/docs-images/tree/main/vqgan_validation_images).
-The simplest way to improve the quality of a VQGAN model is to maximize the amount of information present in the bottleneck. The easiest way to do this is increasing the image resolution. However, other ways include, but not limited to, lowering compression by downsampling fewer times or increasing the vocaburary size which at most can be around 16384. How to do this is shown below.
+The simplest way to improve the quality of a VQGAN model is to maximize the amount of information present in the bottleneck. The easiest way to do this is to increase the image resolution. However, other ways include, but are not limited to, lowering compression by downsampling fewer times or increasing the vocabulary size, which can be at most around 16384. How to do this is shown below.
 
 # Modifying the architecture
 
@@ -118,10 +118,10 @@ To lower the amount of layers in a VQGan, you can remove layers by modifying the
 "vq_embed_dim": 4
 }
 ```
-For increasing the size of the vocaburaries you can increase num_vq_embeddings. However, [some research](https://magvit.cs.cmu.edu/v2/) shows that the representation of VQGANs start degrading after 2^14~16384 vq embeddings so it's not recommended to go past that.
+To increase the size of the vocabularies, you can increase `num_vq_embeddings`. However, [some research](https://magvit.cs.cmu.edu/v2/) shows that the representations of VQGANs start degrading after 2^14~16384 vq embeddings, so it's not recommended to go past that.
 
 ## Extra training tips/ideas
 During logging take care to make sure data_time is low. data_time is the amount spent loading the data and where the GPU is not active. So essentially, it's the time wasted. The easiest way to lower data time is to increase the --dataloader_num_workers to a higher number like 4. Due to a bug in Pytorch, this only works on linux based systems.
For more details check [here](https://github.com/huggingface/diffusers/issues/7646) Secondly, training should seem to be done when both the discriminator and the generator loss converges. Thirdly, another low hanging fruit is just using ema using the --use_ema parameter. This tends to make the output images smoother. This has a con where you have to lower your batch size by 1 but it may be worth it. -Another more experimental low hanging fruit is changing from the vgg19 to different models for the lpips loss using the --timm_model_backend. If you do this, I recommend also changing the timm_model_layers parameter to the layer in your model which you think is best for representation. However, becareful with the feature map norms since this can easily overdominate the loss. \ No newline at end of file +Another more experimental low hanging fruit is changing from the vgg19 to different models for the lpips loss using the --timm_model_backend. If you do this, I recommend also changing the timm_model_layers parameter to the layer in your model which you think is best for representation. However, be careful with the feature map norms since this can easily overdominate the loss. \ No newline at end of file diff --git a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py index b0c11362ff..d2f67a6989 100644 --- a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py +++ b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py @@ -52,7 +52,7 @@ EXAMPLE_DOC_STRING = """ >>> image.save("cd_imagenet64_l2_onestep_sample_penguin.png") >>> # Multistep sampling, class-conditional image generation - >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original Github repo: + >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original GitHub repo: >>> # https://github.com/openai/consistency_models/blob/main/scripts/launch.sh#L77 >>> image = pipe(num_inference_steps=None, timesteps=[22, 0], class_labels=145).images[0] >>> image.save("cd_imagenet64_l2_multistep_sample_penguin.png") diff --git a/tests/schedulers/test_scheduler_edm_euler.py b/tests/schedulers/test_scheduler_edm_euler.py index 6cc76cc0bb..a2f6fd9bad 100644 --- a/tests/schedulers/test_scheduler_edm_euler.py +++ b/tests/schedulers/test_scheduler_edm_euler.py @@ -80,7 +80,7 @@ class EDMEulerSchedulerTest(SchedulerCommonTest): assert abs(result_sum.item() - 34.1855) < 1e-3 assert abs(result_mean.item() - 0.044) < 1e-3 - # Override test_from_save_pretrined to use EDMEulerScheduler-specific logic + # Override test_from_save_pretrained to use EDMEulerScheduler-specific logic def test_from_save_pretrained(self): kwargs = dict(self.forward_default_kwargs) num_inference_steps = kwargs.pop("num_inference_steps", None) @@ -118,7 +118,7 @@ class EDMEulerSchedulerTest(SchedulerCommonTest): assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - # Override test_from_save_pretrined to use EDMEulerScheduler-specific logic + # Override test_from_save_pretrained to use EDMEulerScheduler-specific logic def test_step_shape(self): num_inference_steps = 10