diff --git a/docs/source/en/api/pipelines/photon.md b/docs/source/en/api/pipelines/photon.md index a326c50ae7..2f0f6b428a 100644 --- a/docs/source/en/api/pipelines/photon.md +++ b/docs/source/en/api/pipelines/photon.md @@ -36,14 +36,16 @@ Both **fine-tuned** and **non-fine-tuned** versions are available: - **Fine-tuned models**, trained on the [Alchemist dataset](https://huggingface.co/datasets/yandex/alchemist), enhance the **aesthetic quality** of the base models—especially when prompts are **less detailed**. -| Model | Recommended dtype | Resolution | Fine-tuned | -|:-----:|:-----------------:|:----------:|:----------:| -| [`Photoroom/photon-256-t2i`](https://huggingface.co/Photoroom/photon-256-t2i) | `torch.bfloat16` | 256x256 | No | -| [`Photoroom/photon-256-t2i-sft`](https://huggingface.co/Photoroom/photon-256-t2i-sft) | `torch.bfloat16` | 256x256 | Yes | -| [`Photoroom/photon-512-t2i`](https://huggingface.co/Photoroom/photon-512-t2i) | `torch.bfloat16` | 512x512 | No | -| [`Photoroom/photon-512-t2i-sft`](hhttps://huggingface.co/Photoroom/photon-512-t2i-sft) | `torch.bfloat16` | 512x512 | Yes | -| [`Photoroom/photon-512-t2i-dc-ae`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae) | `torch.bfloat16` | 512x512 | No | -| [`Photoroom/photon-512-t2i-dc-ae-sft`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft) | `torch.bfloat16` | 512x512 | Yes | +| Model | Resolution | Fine-tuned | Distilled | Description | Suggested prompts | Suggested parameters | Recommended dtype | +|:-----:|:-----------------:|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:| +| [`Photoroom/photon-256-t2i`](https://huggingface.co/Photoroom/photon-256-t2i)| 256 | No | No | Base model pre-trained at 256 with Flux VAE|Works best with detailed prompts in natural language|28 steps, cfg=5.0| `torch.bfloat16` | +| [`Photoroom/photon-256-t2i-sft`](https://huggingface.co/Photoroom/photon-256-t2i-sft)| 256 | Yes | No | Fine-tuned on the [Alchemist 
dataset](https://huggingface.co/datasets/yandex/alchemist) with Flux VAE | Can handle less detailed prompts|28 steps, cfg=5.0| `torch.bfloat16` | +| [`Photoroom/photon-512-t2i`](https://huggingface.co/Photoroom/photon-512-t2i)| 512 | No | No | Base model pre-trained at 512 with Flux VAE |Works best with detailed prompts in natural language|28 steps, cfg=5.0| `torch.bfloat16` | +| [`Photoroom/photon-512-t2i-sft`](https://huggingface.co/Photoroom/photon-512-t2i-sft)| 512 | Yes | No | Fine-tuned on the [Alchemist dataset](https://huggingface.co/datasets/yandex/alchemist) with Flux VAE | Can handle less detailed prompts in natural language|28 steps, cfg=5.0| `torch.bfloat16` | +| [`Photoroom/photon-512-t2i-sft-distilled`](https://huggingface.co/Photoroom/photon-512-t2i-sft-distilled)| 512 | Yes | Yes | 8-step distilled model from [`Photoroom/photon-512-t2i-sft`](https://huggingface.co/Photoroom/photon-512-t2i-sft) | Can handle less detailed prompts in natural language|8 steps, cfg=1.0| `torch.bfloat16` | +| [`Photoroom/photon-512-t2i-dc-ae`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae)| 512 | No | No | Base model pre-trained at 512 with [Deep Compression Autoencoder (DC-AE)](https://hanlab.mit.edu/projects/dc-ae)|Works best with detailed prompts in natural language|28 steps, cfg=5.0| `torch.bfloat16` | +| [`Photoroom/photon-512-t2i-dc-ae-sft`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft)| 512 | Yes | No | Fine-tuned on the [Alchemist dataset](https://huggingface.co/datasets/yandex/alchemist) with [Deep Compression Autoencoder (DC-AE)](https://hanlab.mit.edu/projects/dc-ae) | Can handle less detailed prompts in natural language|28 steps, cfg=5.0| `torch.bfloat16` | +| [`Photoroom/photon-512-t2i-dc-ae-sft-distilled`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft-distilled)| 512 | Yes | Yes | 8-step distilled model from 
[`Photoroom/photon-512-t2i-dc-ae-sft`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft) | Can handle less detailed prompts in natural language|8 steps, cfg=1.0| `torch.bfloat16` | Refer to [this](https://huggingface.co/collections/Photoroom/photon-models-68e66254c202ebfab99ad38e) collection for more information. @@ -56,8 +58,8 @@ from diffusers.pipelines.photon import PhotonPipeline pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i-sft", torch_dtype=torch.bfloat16) pipe.to("cuda") -prompt = "A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “PRX” in bright, sparkling light" -image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] +prompt = "A front-facing portrait of a lion in the golden savanna at sunset." +image = pipe(prompt, num_inference_steps=28, guidance_scale=5.0).images[0] image.save("photon_output.png") ``` @@ -75,12 +77,12 @@ from transformers import T5GemmaModel, GemmaTokenizerFast # Load transformer transformer = PhotonTransformer2DModel.from_pretrained( - "Photoroom/photon-512-t2i", subfolder="transformer" + "Photoroom/photon-512-t2i-sft", subfolder="transformer" ).to(dtype=torch.bfloat16) # Load scheduler scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( - "Photoroom/photon-512-t2i", subfolder="scheduler" + "Photoroom/photon-512-t2i-sft", subfolder="scheduler" ) # Load T5Gemma text encoder @@ -136,7 +138,7 @@ import torch from diffusers.pipelines.photon import PhotonPipeline pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i-sft", torch_dtype=torch.bfloat16) pipe = pipe( - prompt = "A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “PRX” in bright, sparkling light" + prompt = "A front-facing portrait of a lion in the golden savanna at sunset.", 
num_inference_steps=28, guidance_scale=4.0, height=512, @@ -153,7 +155,7 @@ For memory-constrained environments: import torch from diffusers.pipelines.photon import PhotonPipeline -pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i", torch_dtype=torch.bfloat16) +pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i-sft", torch_dtype=torch.bfloat16) pipe.enable_model_cpu_offload() # Offload components to CPU when not in use # Or use sequential CPU offload for even lower memory diff --git a/scripts/convert_photon_to_diffusers.py b/scripts/convert_photon_to_diffusers.py index 2f060fd3cd..c9c07f191f 100644 --- a/scripts/convert_photon_to_diffusers.py +++ b/scripts/convert_photon_to_diffusers.py @@ -172,10 +172,10 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Ph return transformer -def create_scheduler_config(output_path: str): +def create_scheduler_config(output_path: str, shift: float): """Create FlowMatchEulerDiscreteScheduler config.""" - scheduler_config = {"_class_name": "FlowMatchEulerDiscreteScheduler", "num_train_timesteps": 1000, "shift": 1.0} + scheduler_config = {"_class_name": "FlowMatchEulerDiscreteScheduler", "num_train_timesteps": 1000, "shift": shift} scheduler_path = os.path.join(output_path, "scheduler") os.makedirs(scheduler_path, exist_ok=True) @@ -207,6 +207,7 @@ def download_and_save_vae(vae_type: str, output_path: str): def download_and_save_text_encoder(output_path: str): """Download and save T5Gemma text encoder and tokenizer.""" from transformers import GemmaTokenizerFast + from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel text_encoder_path = os.path.join(output_path, "text_encoder") tokenizer_path = os.path.join(output_path, "tokenizer") @@ -214,14 +215,11 @@ def download_and_save_text_encoder(output_path: str): os.makedirs(tokenizer_path, exist_ok=True) print("Downloading T5Gemma model from google/t5gemma-2b-2b-ul2...") - from 
transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel - t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") - # Save only the encoder - encoder = t5gemma_model.encoder - encoder.save_pretrained(text_encoder_path) - + # Extract and save only the encoder + t5gemma_encoder = t5gemma_model.encoder + t5gemma_encoder.save_pretrained(text_encoder_path) print(f"✓ Saved T5GemmaEncoder to {text_encoder_path}") print("Downloading tokenizer from google/t5gemma-2b-2b-ul2...") @@ -284,7 +282,7 @@ def main(args): print(f"✓ Saved transformer to {transformer_path}") # Create scheduler config - create_scheduler_config(args.output_path) + create_scheduler_config(args.output_path, args.shift) download_and_save_vae(args.vae_type, args.output_path) download_and_save_text_encoder(args.output_path) @@ -342,6 +340,13 @@ if __name__ == "__main__": default=DEFAULT_RESOLUTION, help="Target resolution for the model (256, 512, or 1024). Affects the transformer's sample_size.", ) + + parser.add_argument( + "--shift", + type=float, + default=3.0, + help="Shift for the scheduler", + ) args = parser.parse_args()