{
  "DreamShaper SD 1.5 v8": {
    "path": "dreamshaper_8.safetensors@https://civitai.com/api/download/models/128713",
    "desc": "Showcase finetuned model based on Stable Diffusion 1.5",
    "preview": "dreamshaper_8.jpg",
    "original": true
  },
  "DreamShaper SD XL Turbo": {
    "path": "dreamshaperXL_turboDpmppSDE.safetensors@https://civitai.com/api/download/models/251662",
    "desc": "Showcase finetuned model based on Stable Diffusion XL",
    "preview": "dreamshaperXL_turboDpmppSDE.jpg"
  },
  "Juggernaut Reborn": {
    "path": "juggernaut_reborn.safetensors@https://civitai.com/api/download/models/274039",
    "desc": "Showcase finetuned model based on Stable Diffusion 1.5",
    "preview": "juggernaut_reborn.jpg",
    "original": true
  },
  "Juggernaut XL v7 RunDiffusion": {
    "path": "juggernautXL_v7Rundiffusion.safetensors@https://civitai.com/api/download/models/240840",
    "desc": "Showcase finetuned model based on Stable Diffusion XL",
    "preview": "juggernautXL_v7Rundiffusion.jpg"
  },
  "RunwayML SD 1.5": {
    "path": "runwayml/stable-diffusion-v1-5",
    "alt": "v1-5-pruned-emaonly.safetensors@https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors?download=true",
    "desc": "Stable Diffusion 1.5 is the base model all other 1.5 checkpoints were trained from. It's a latent text-to-image diffusion model capable of generating photo-realistic images given any text input. The Stable-Diffusion-v1-5 checkpoint was initialized with the weights of the Stable-Diffusion-v1-2 checkpoint and subsequently fine-tuned for 595k steps at resolution 512x512.",
    "preview": "runwayml--stable-diffusion-v1-5.jpg",
    "original": true
  },
  "StabilityAI SD 2.1 EMA": {
    "path": "stabilityai/stable-diffusion-2-1-base",
    "alt": "v2-1_512-ema-pruned.safetensors@https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.safetensors?download=true",
    "desc": "This stable-diffusion-2-1-base model fine-tunes stable-diffusion-2-base (512-base-ema.ckpt) with 220k extra steps taken",
    "preview": "stabilityai--stable-diffusion-2.1-base.jpg",
    "original": true
  },
  "StabilityAI SD 2.1 V": {
    "path": "stabilityai/stable-diffusion-2-1",
    "alt": "v2-1_768-ema-pruned.safetensors@https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.safetensors?download=true",
    "desc": "This stable-diffusion-2 model is resumed from stable-diffusion-2-base (512-base-ema.ckpt) and trained for 150k steps using a v-objective on the same dataset. Resumed for another 140k steps on 768x768 images",
    "preview": "stabilityai--stable-diffusion-2.1-base.jpg",
    "original": true
  },
  "StabilityAI SD-XL 1.0 Base": {
    "path": "stabilityai/stable-diffusion-xl-base-1.0",
    "variant": "fp16",
    "desc": "Stable Diffusion XL (SDXL) is the latest AI image generation model that is tailored towards more photorealistic outputs with more detailed imagery and composition compared to previous SD models, including SD 2.1. It can make realistic faces, legible text within the images, and better image composition, all while using shorter and simpler prompts at a greatly increased base resolution of 1024x1024. Just like its predecessors, SDXL has the ability to generate image variations using image-to-image prompting, inpainting (reimagining of the selected parts of an image), and outpainting (creating new parts that lie outside the image borders).",
    "preview": "stabilityai--stable-diffusion-xl-base-1.0.jpg"
  },
  "StabilityAI SD 2.1 Turbo": {
    "_path": "stabilityai/sd-turbo",
    "path": "sd_turbo.safetensors@https://huggingface.co/stabilityai/sd-turbo/resolve/main/sd_turbo.safetensors?download=true",
    "variant": "fp16",
    "desc": "SD-Turbo is a distilled version of Stable Diffusion 2.1, trained for real-time synthesis. SD-Turbo is based on a novel training method called Adversarial Diffusion Distillation (ADD) (see the technical report), which allows sampling large-scale foundational image diffusion models in 1 to 4 steps at high image quality. This approach uses score distillation to leverage large-scale off-the-shelf image diffusion models as a teacher signal and combines this with an adversarial loss to ensure high image fidelity even in the low-step regime of one or two sampling steps.",
    "preview": "stabilityai--sd-turbo.jpg",
    "original": true
  },
  "StabilityAI SD-XL Turbo": {
    "_path": "stabilityai/sdxl-turbo",
    "path": "sdxl_turbo.safetensors@https://huggingface.co/stabilityai/sdxl-turbo/resolve/main/sd_xl_turbo_1.0_fp16.safetensors?download=true",
    "variant": "fp16",
    "desc": "SDXL-Turbo is a distilled version of SDXL 1.0, trained for real-time synthesis. SDXL-Turbo is based on a novel training method called Adversarial Diffusion Distillation (ADD) (see the technical report), which allows sampling large-scale foundational image diffusion models in 1 to 4 steps at high image quality. This approach uses score distillation to leverage large-scale off-the-shelf image diffusion models as a teacher signal and combines this with an adversarial loss to ensure high image fidelity even in the low-step regime of one or two sampling steps.",
    "preview": "stabilityai--sdxl-turbo.jpg"
  },
  "StabilityAI Stable Video Diffusion": {
    "path": "stabilityai/stable-video-diffusion-img2vid",
    "desc": "(SVD) Image-to-Video is a latent diffusion model trained to generate short video clips from an image conditioning. This model was trained to generate 14 frames at resolution 576x1024 given a context frame of the same size. We also finetune the widely used f8-decoder for temporal consistency.",
    "preview": "stabilityai--stable-video-diffusion-img2vid.jpg"
  },
  "StabilityAI Stable Video Diffusion XT": {
    "path": "stabilityai/stable-video-diffusion-img2vid-xt",
    "desc": "(SVD) Image-to-Video is a latent diffusion model trained to generate short video clips from an image conditioning. This model was trained to generate 25 frames at resolution 576x1024 given a context frame of the same size, finetuned from SVD Image-to-Video [14 frames]. We also finetune the widely used f8-decoder for temporal consistency.",
    "preview": "stabilityai--stable-video-diffusion-img2vid-xt.jpg"
  },
  "Segmind Vega": {
    "path": "segmind/Segmind-Vega",
    "desc": "The Segmind-Vega Model is a distilled version of the Stable Diffusion XL (SDXL), offering a remarkable 70% reduction in size and an impressive 100% speedup while retaining high-quality text-to-image generation capabilities. Trained on diverse datasets, including Grit and Midjourney scrape data, it excels at creating a wide range of visual content based on textual prompts. Employing a knowledge distillation strategy, Segmind-Vega leverages the teachings of several expert models, including SDXL, ZavyChromaXL, and JuggernautXL, to combine their strengths and produce compelling visual outputs.",
    "preview": "segmind--Segmind-Vega.jpg"
  },
  "Segmind SSD-1B": {
    "path": "segmind/SSD-1B",
    "desc": "The Segmind Stable Diffusion Model (SSD-1B) offers a compact, efficient, and distilled version of the SDXL model. At 50% smaller and 60% faster than Stable Diffusion XL (SDXL), it provides quick and seamless performance without sacrificing image quality.",
    "preview": "segmind--SSD-1B.jpg"
  },
  "Segmind Tiny": {
    "path": "segmind/tiny-sd",
    "desc": "Segmind's Tiny-SD offers a compact, efficient, and distilled version of Realistic Vision 4.0 and is up to 80% faster than SD1.5",
    "preview": "segmind--tiny-sd.jpg"
  },
  "Segmind SegMoE SD 4x2": {
    "path": "segmind/SegMoE-SD-4x2-v0",
    "desc": "SegMoE-SD-4x2-v0 is an untrained Segmind Mixture of Diffusion Experts Model generated using segmoe from 4 Expert SD1.5 models. SegMoE is a powerful framework for dynamically combining Stable Diffusion Models into a Mixture of Experts within minutes without training",
    "preview": "segmind--SegMoE-SD-4x2-v0.jpg"
  },
  "Segmind SegMoE XL 2x1": {
    "path": "segmind/SegMoE-2x1-v0",
    "desc": "SegMoE-2x1-v0 is an untrained Segmind Mixture of Diffusion Experts Model generated using segmoe from 2 Expert SDXL models. SegMoE is a powerful framework for dynamically combining Stable Diffusion Models into a Mixture of Experts within minutes without training",
    "preview": "segmind--SegMoE-2x1-v0.jpg"
  },
  "Segmind SegMoE XL 4x2": {
    "path": "segmind/SegMoE-4x2-v0",
    "desc": "SegMoE-4x2-v0 is an untrained Segmind Mixture of Diffusion Experts Model generated using segmoe from 4 Expert SDXL models. SegMoE is a powerful framework for dynamically combining Stable Diffusion Models into a Mixture of Experts within minutes without training",
    "preview": "segmind--SegMoE-4x2-v0.jpg"
  },
  "LCM SD-1.5 Dreamshaper 7": {
    "path": "SimianLuo/LCM_Dreamshaper_v7",
    "desc": "Latent Consistency Models enable swift inference with minimal steps on any pre-trained LDMs, including Stable Diffusion. By distilling classifier-free guidance into the model's input, LCM can generate high-quality images in very short inference time. LCM can generate quality images in as few as 3-4 steps, making it blazingly fast.",
    "preview": "SimianLuo--LCM_Dreamshaper_v7.jpg"
  },
  "Pixart-α XL 2 Medium 512": {
    "path": "PixArt-alpha/PixArt-XL-2-512x512",
    "desc": "PixArt-α is a Transformer-based T2I diffusion model whose image generation quality is competitive with state-of-the-art image generators (e.g., Imagen, SDXL, and even Midjourney), and the training speed markedly surpasses existing large-scale T2I models. Extensive experiments demonstrate that PIXART-α excels in image quality, artistry, and semantic control. It can directly generate 512px images from text prompts within a single sampling process.",
    "preview": "PixArt-alpha--PixArt-XL-2-512x512.jpg"
  },
  "Pixart-α XL 2 Large 1024": {
    "path": "PixArt-alpha/PixArt-XL-2-1024-MS",
    "desc": "PixArt-α is a Transformer-based T2I diffusion model whose image generation quality is competitive with state-of-the-art image generators (e.g., Imagen, SDXL, and even Midjourney), and the training speed markedly surpasses existing large-scale T2I models. Extensive experiments demonstrate that PIXART-α excels in image quality, artistry, and semantic control. It can directly generate 1024px images from text prompts within a single sampling process.",
    "preview": "PixArt-alpha--PixArt-XL-2-1024-MS.jpg"
  },
  "Pixart-α XL 2 Large LCM": {
    "path": "PixArt-alpha/PixArt-LCM-XL-2-1024-MS",
    "desc": "Pixart-α consists of pure transformer blocks for latent diffusion: it can directly generate 1024px images from text prompts within a single sampling process. LCM is a diffusion distillation method which predicts the PF-ODE solution directly in latent space, achieving super fast inference with few steps. PixArt-LCM combines the two, delivering fast generation speeds on a wide range of hardware and making image generation easy to explore.",
    "preview": "PixArt-alpha--PixArt-XL-2-1024-MS.jpg"
  },
  "Warp Wuerstchen": {
    "path": "warp-ai/wuerstchen",
    "desc": "Würstchen is a diffusion model whose text-conditional model works in a highly compressed latent space of images. Why is this important? Compressing data can reduce computational costs for both training and inference by magnitudes. Training on 1024x1024 images is way more expensive than training at 32x32. Usually, other works make use of a relatively small compression, in the range of 4x - 8x spatial compression. Würstchen takes this to an extreme. Through its novel design, we achieve a 42x spatial compression. Würstchen employs a two-stage compression, which we call Stage A and Stage B. Stage A is a VQGAN, and Stage B is a Diffusion Autoencoder (more details can be found in the paper). A third model, Stage C, is learned in that highly compressed latent space. This training requires fractions of the compute used for current top-performing models, allowing also cheaper and faster inference.",
    "preview": "warp-ai--wuerstchen.jpg"
  },
  "Kandinsky 2.1": {
    "path": "kandinsky-community/kandinsky-2-1",
    "desc": "Kandinsky 2.1 is a text-conditional diffusion model based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.1 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
    "preview": "kandinsky-community--kandinsky-2-1.jpg"
  },
  "Kandinsky 2.2": {
    "path": "kandinsky-community/kandinsky-2-2-decoder",
    "desc": "Kandinsky 2.2 is a text-conditional diffusion model based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.2 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
    "preview": "kandinsky-community--kandinsky-2-2-decoder.jpg"
  },
  "Kandinsky 3": {
    "path": "kandinsky-community/kandinsky-3",
    "desc": "Kandinsky 3.0 is an open-source text-to-image diffusion model built upon the Kandinsky2-x model family. In comparison to its predecessors, Kandinsky 3.0 incorporates more data, specifically related to Russian culture, which allows it to generate pictures related to Russian culture. Furthermore, enhancements have been made to the text understanding and visual quality of the model, achieved by increasing the size of the text encoder and Diffusion U-Net models, respectively.",
    "preview": "kandinsky-community--kandinsky-3.jpg"
  },
  "Playground v1": {
    "path": "playgroundai/playground-v1",
    "desc": "Playground v1 is a latent diffusion model that improves the overall HDR quality to get more stunning images.",
    "preview": "playgroundai--playground-v1.jpg"
  },
  "Playground v2 256": {
    "path": "playgroundai/playground-v2-256px-base",
    "desc": "Playground v2 is a diffusion-based text-to-image generative model. The model was trained from scratch by the research team at Playground. Images generated by Playground v2 are favored 2.5 times more than those produced by Stable Diffusion XL, according to Playground’s user study.",
    "preview": "playgroundai--playground-v2-256px-base.jpg"
  },
  "Playground v2 512": {
    "path": "playgroundai/playground-v2-512px-base",
    "desc": "Playground v2 is a diffusion-based text-to-image generative model. The model was trained from scratch by the research team at Playground. Images generated by Playground v2 are favored 2.5 times more than those produced by Stable Diffusion XL, according to Playground’s user study.",
    "preview": "playgroundai--playground-v2-512px-base.jpg"
  },
  "Playground v2 1024": {
    "path": "playgroundai/playground-v2-1024px-aesthetic",
    "desc": "Playground v2 is a diffusion-based text-to-image generative model. The model was trained from scratch by the research team at Playground. Images generated by Playground v2 are favored 2.5 times more than those produced by Stable Diffusion XL, according to Playground’s user study.",
    "preview": "playgroundai--playground-v2-1024px-aesthetic.jpg"
  },
  "DeepFloyd IF Medium": {
    "path": "DeepFloyd/IF-I-M-v1.0",
    "desc": "DeepFloyd-IF is a pixel-based text-to-image triple-cascaded diffusion model that can generate pictures with a new state-of-the-art degree of photorealism and language understanding. The result is a highly efficient model that outperforms current state-of-the-art models, achieving a zero-shot FID-30K score of 6.66 on the COCO dataset. It is modular and composed of a frozen text encoder and three pixel cascaded diffusion modules, each designed to generate images of increasing resolution: 64x64, 256x256, and 1024x1024.",
    "preview": "DeepFloyd--IF-I-M-v1.0.jpg"
  },
  "aMUSEd 256": {
    "path": "amused/amused-256",
    "desc": "Amused is a lightweight text-to-image model based on the Muse architecture. Amused is particularly useful in applications that require a lightweight and fast model, such as generating many images quickly at once.",
    "preview": "amused--amused-256.jpg"
  },
  "aMUSEd 512": {
    "path": "amused/amused-512",
    "desc": "Amused is a lightweight text-to-image model based on the Muse architecture. Amused is particularly useful in applications that require a lightweight and fast model, such as generating many images quickly at once.",
    "preview": "amused--amused-512.jpg"
  },
  "Tsinghua UniDiffuser": {
    "path": "thu-ml/unidiffuser-v1",
    "desc": "UniDiffuser is a unified diffusion framework to fit all distributions relevant to a set of multi-modal data in one transformer. UniDiffuser is able to perform image, text, text-to-image, image-to-text, and image-text pair generation by setting proper timesteps without additional overhead.\nSpecifically, UniDiffuser employs a variation of transformer, called U-ViT, which parameterizes the joint noise prediction network. Other components perform as encoders and decoders of different modalities, including a pretrained image autoencoder from Stable Diffusion, a pretrained image ViT-B/32 CLIP encoder, a pretrained text ViT-L CLIP encoder, and a GPT-2 text decoder finetuned by ourselves.",
    "preview": "thu-ml--unidiffuser-v1.jpg"
  },
  "SalesForce BLIP-Diffusion": {
    "path": "salesforce/blipdiffusion",
    "desc": "BLIP-Diffusion is a subject-driven image generation model that supports multimodal control, consuming subject images and text prompts as input. Unlike other subject-driven generation models, BLIP-Diffusion introduces a new multimodal encoder which is pre-trained to provide subject representation.",
    "preview": "salesforce--blipdiffusion.jpg"
  },
  "InstaFlow 0.9B": {
    "path": "XCLiu/instaflow_0_9B_from_sd_1_5",
    "desc": "InstaFlow is an ultra-fast, one-step image generator that achieves image quality close to Stable Diffusion. This efficiency is made possible through a recent Rectified Flow technique, which trains probability flows with straight trajectories, hence inherently requiring only a single step for fast inference.",
    "preview": "XCLiu--instaflow_0_9B_from_sd_1_5.jpg"
  }
}