fix timestep shift
@@ -36,14 +36,16 @@ Both **fine-tuned** and **non-fine-tuned** versions are available:
- **Fine-tuned models**, trained on the [Alchemist dataset](https://huggingface.co/datasets/yandex/alchemist), enhance the **aesthetic quality** of the base models—especially when prompts are **less detailed**.
| Model | Recommended dtype | Resolution | Fine-tuned |
|:-----:|:-----------------:|:----------:|:----------:|
| [`Photoroom/photon-256-t2i`](https://huggingface.co/Photoroom/photon-256-t2i) | `torch.bfloat16` | 256x256 | No |
| [`Photoroom/photon-256-t2i-sft`](https://huggingface.co/Photoroom/photon-256-t2i-sft) | `torch.bfloat16` | 256x256 | Yes |
| [`Photoroom/photon-512-t2i`](https://huggingface.co/Photoroom/photon-512-t2i) | `torch.bfloat16` | 512x512 | No |
| [`Photoroom/photon-512-t2i-sft`](https://huggingface.co/Photoroom/photon-512-t2i-sft) | `torch.bfloat16` | 512x512 | Yes |
| [`Photoroom/photon-512-t2i-dc-ae`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae) | `torch.bfloat16` | 512x512 | No |
| [`Photoroom/photon-512-t2i-dc-ae-sft`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft) | `torch.bfloat16` | 512x512 | Yes |
| Model | Resolution | Fine-tuned | Distilled | Description | Suggested prompts | Suggested parameters | Recommended dtype |
|:-----:|:----------:|:----------:|:---------:|:-----------:|:-----------------:|:--------------------:|:-----------------:|
| [`Photoroom/photon-256-t2i`](https://huggingface.co/Photoroom/photon-256-t2i) | 256 | No | No | Base model pre-trained at 256 with the Flux VAE | Works best with detailed prompts in natural language | 28 steps, cfg=5.0 | `torch.bfloat16` |
| [`Photoroom/photon-256-t2i-sft`](https://huggingface.co/Photoroom/photon-256-t2i-sft) | 256 | Yes | No | Fine-tuned on the [Alchemist](https://huggingface.co/datasets/yandex/alchemist) dataset with the Flux VAE | Can handle less detailed prompts | 28 steps, cfg=5.0 | `torch.bfloat16` |
| [`Photoroom/photon-512-t2i`](https://huggingface.co/Photoroom/photon-512-t2i) | 512 | No | No | Base model pre-trained at 512 with the Flux VAE | Works best with detailed prompts in natural language | 28 steps, cfg=5.0 | `torch.bfloat16` |
| [`Photoroom/photon-512-t2i-sft`](https://huggingface.co/Photoroom/photon-512-t2i-sft) | 512 | Yes | No | Fine-tuned on the [Alchemist](https://huggingface.co/datasets/yandex/alchemist) dataset with the Flux VAE | Can handle less detailed prompts in natural language | 28 steps, cfg=5.0 | `torch.bfloat16` |
| [`Photoroom/photon-512-t2i-sft-distilled`](https://huggingface.co/Photoroom/photon-512-t2i-sft-distilled) | 512 | Yes | Yes | 8-step distilled model from [`Photoroom/photon-512-t2i-sft`](https://huggingface.co/Photoroom/photon-512-t2i-sft) | Can handle less detailed prompts in natural language | 8 steps, cfg=1.0 | `torch.bfloat16` |
| [`Photoroom/photon-512-t2i-dc-ae`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae) | 512 | No | No | Base model pre-trained at 512 with the [Deep Compression Autoencoder (DC-AE)](https://hanlab.mit.edu/projects/dc-ae) | Works best with detailed prompts in natural language | 28 steps, cfg=5.0 | `torch.bfloat16` |
| [`Photoroom/photon-512-t2i-dc-ae-sft`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft) | 512 | Yes | No | Fine-tuned on the [Alchemist](https://huggingface.co/datasets/yandex/alchemist) dataset with the [Deep Compression Autoencoder (DC-AE)](https://hanlab.mit.edu/projects/dc-ae) | Can handle less detailed prompts in natural language | 28 steps, cfg=5.0 | `torch.bfloat16` |
| [`Photoroom/photon-512-t2i-dc-ae-sft-distilled`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft-distilled) | 512 | Yes | Yes | 8-step distilled model from [`Photoroom/photon-512-t2i-dc-ae-sft`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft) | Can handle less detailed prompts in natural language | 8 steps, cfg=1.0 | `torch.bfloat16` |
Refer to [this](https://huggingface.co/collections/Photoroom/photon-models-68e66254c202ebfab99ad38e) collection for more information.
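
The suggested parameters in the table map directly onto pipeline arguments. As a quick illustration, here is a minimal sketch for a distilled checkpoint (same `PhotonPipeline` usage as the examples below; the checkpoint name and the 8 steps / cfg=1.0 settings are taken from the table above):

```python
import torch
from diffusers.pipelines.photon import PhotonPipeline

# Distilled checkpoint: 8 steps with guidance effectively disabled (cfg=1.0), per the table above.
pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i-sft-distilled", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = pipe(
    "A front-facing portrait of a lion in the golden savanna at sunset.",
    num_inference_steps=8,
    guidance_scale=1.0,
).images[0]
image.save("photon_distilled_output.png")
```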
@@ -56,8 +58,8 @@ from diffusers.pipelines.photon import PhotonPipeline
pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i-sft", torch_dtype=torch.bfloat16)
pipe.to("cuda")

prompt = "A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “PRX” in bright, sparkling light"
image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0]
prompt = "A front-facing portrait of a lion in the golden savanna at sunset."
image = pipe(prompt, num_inference_steps=28, guidance_scale=5.0).images[0]
image.save("photon_output.png")
```
@@ -75,12 +77,12 @@ from transformers import T5GemmaModel, GemmaTokenizerFast
# Load transformer
transformer = PhotonTransformer2DModel.from_pretrained(
    "Photoroom/photon-512-t2i", subfolder="transformer"
    "Photoroom/photon-512-t2i-sft", subfolder="transformer"
).to(dtype=torch.bfloat16)

# Load scheduler
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
    "Photoroom/photon-512-t2i", subfolder="scheduler"
    "Photoroom/photon-512-t2i-sft", subfolder="scheduler"
)

# Load T5Gemma text encoder
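# (Sketch with assumed argument names, not shown in this hunk: once the components are
# loaded, the pipeline would typically be assembled along the lines of
#     pipe = PhotonPipeline(transformer=transformer, scheduler=scheduler,
#                           text_encoder=text_encoder, tokenizer=tokenizer, vae=vae)
# check the PhotonPipeline signature for the exact parameter names.)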
@@ -136,7 +138,7 @@ import torch
from diffusers.pipelines.photon import PhotonPipeline
pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i-sft", torch_dtype=torch.bfloat16)
image = pipe(
    prompt="A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “PRX” in bright, sparkling light",
    prompt="A front-facing portrait of a lion in the golden savanna at sunset.",
    num_inference_steps=28,
    guidance_scale=4.0,
    height=512,
@@ -153,7 +155,7 @@ For memory-constrained environments:
import torch
from diffusers.pipelines.photon import PhotonPipeline

pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i", torch_dtype=torch.bfloat16)
pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i-sft", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # Offload components to CPU when not in use

# Or use sequential CPU offload for even lower memory
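# Sketch (this line is not part of the hunk shown here): the standard diffusers call is
# pipe.enable_sequential_cpu_offload()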
@@ -172,10 +172,10 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Ph
    return transformer


def create_scheduler_config(output_path: str):
def create_scheduler_config(output_path: str, shift: float):
    """Create FlowMatchEulerDiscreteScheduler config."""

    scheduler_config = {"_class_name": "FlowMatchEulerDiscreteScheduler", "num_train_timesteps": 1000, "shift": 1.0}
    scheduler_config = {"_class_name": "FlowMatchEulerDiscreteScheduler", "num_train_timesteps": 1000, "shift": shift}

    scheduler_path = os.path.join(output_path, "scheduler")
    os.makedirs(scheduler_path, exist_ok=True)
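# For reference (sketch, not part of this diff): when dynamic shifting is disabled,
# FlowMatchEulerDiscreteScheduler applies a static shift to its sigma schedule roughly as
#     sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
# so shift=1.0 keeps the linear schedule unchanged, while larger values such as the new
# default of 3.0 concentrate sampling steps at higher noise levels.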
@@ -207,6 +207,7 @@ def download_and_save_vae(vae_type: str, output_path: str):
def download_and_save_text_encoder(output_path: str):
    """Download and save T5Gemma text encoder and tokenizer."""
    from transformers import GemmaTokenizerFast
    from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel

    text_encoder_path = os.path.join(output_path, "text_encoder")
    tokenizer_path = os.path.join(output_path, "tokenizer")
@@ -214,14 +215,11 @@ def download_and_save_text_encoder(output_path: str):
    os.makedirs(tokenizer_path, exist_ok=True)

    print("Downloading T5Gemma model from google/t5gemma-2b-2b-ul2...")
    from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel

    t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2")

    # Save only the encoder
    encoder = t5gemma_model.encoder
    encoder.save_pretrained(text_encoder_path)

    # Extract and save only the encoder
    t5gemma_encoder = t5gemma_model.encoder
    t5gemma_encoder.save_pretrained(text_encoder_path)
    print(f"✓ Saved T5GemmaEncoder to {text_encoder_path}")

    print("Downloading tokenizer from google/t5gemma-2b-2b-ul2...")
@@ -284,7 +282,7 @@ def main(args):
    print(f"✓ Saved transformer to {transformer_path}")

    # Create scheduler config
    create_scheduler_config(args.output_path)
    create_scheduler_config(args.output_path, args.shift)

    download_and_save_vae(args.vae_type, args.output_path)
    download_and_save_text_encoder(args.output_path)
@@ -342,6 +340,13 @@ if __name__ == "__main__":
        default=DEFAULT_RESOLUTION,
        help="Target resolution for the model (256, 512, or 1024). Affects the transformer's sample_size.",
    )

    parser.add_argument(
        "--shift",
        type=float,
        default=3.0,
        help="Shift for the scheduler",
    )

    args = parser.parse_args()
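    # Example invocation (sketch; the script filename and the --output_path flag are assumptions
    # based on the arguments referenced above, only --shift is introduced by this change):
    #   python convert_photon_to_diffusers.py --output_path ./photon-512-diffusers --shift 3.0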