mirror of https://github.com/huggingface/diffusers.git synced 2026-01-29 07:22:12 +03:00

Merge branch 'main' into migrate-lora-pytest

Author: Sayak Paul
Date: 2025-10-17 07:55:31 +05:30
Committed by: GitHub

104 changed files with 2358 additions and 1467 deletions

View File

@@ -38,9 +38,8 @@ jobs:
run: |
apt update
apt install -y libpq-dev postgresql-client
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install -r benchmarks/requirements.txt
uv pip install -e ".[quality]"
uv pip install -r benchmarks/requirements.txt
- name: Environment
run: |
python utils/print_env.py

View File

@@ -72,7 +72,6 @@ jobs:
image-name:
- diffusers-pytorch-cpu
- diffusers-pytorch-cuda
- diffusers-pytorch-cuda
- diffusers-pytorch-xformers-cuda
- diffusers-pytorch-minimum-cuda
- diffusers-doc-builder

View File

@@ -12,7 +12,33 @@ concurrency:
cancel-in-progress: true
jobs:
check-links:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install uv
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Install doc-builder
run: |
uv pip install --system git+https://github.com/huggingface/doc-builder.git@main
- name: Check documentation links
run: |
uv run doc-builder check-links docs/source/en
build:
needs: check-links
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}

View File

@@ -74,7 +74,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install --upgrade huggingface_hub
# Check secret is set

View File

@@ -71,10 +71,9 @@ jobs:
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
uv pip install -e ".[quality]"
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
@@ -84,7 +83,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
--report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
@@ -124,11 +123,10 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install pytest-reportlog
- name: Environment
run: python utils/print_env.py
@@ -139,7 +137,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
--report-log=tests_torch_${{ matrix.module }}_cuda.log \
@@ -152,7 +150,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v --make-reports=examples_torch_cuda \
--report-log=examples_torch_cuda.log \
examples/
@@ -191,8 +189,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -201,7 +198,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -232,11 +229,10 @@ jobs:
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
@@ -247,7 +243,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
BIG_GPU_MEMORY: 40
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-m "big_accelerator" \
--make-reports=tests_big_gpu_torch_cuda \
--report-log=tests_big_gpu_torch_cuda.log \
@@ -282,10 +278,9 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
@@ -297,7 +292,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_minimum_version_cuda \
tests/models/test_modeling_common.py \
@@ -357,13 +352,12 @@ jobs:
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install -U ${{ matrix.config.backend }}
uv pip install -e ".[quality]"
uv pip install -U ${{ matrix.config.backend }}
if [ "${{ join(matrix.config.additional_deps, ' ') }}" != "" ]; then
python -m uv pip install ${{ join(matrix.config.additional_deps, ' ') }}
uv pip install ${{ join(matrix.config.additional_deps, ' ') }}
fi
python -m uv pip install pytest-reportlog
uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
@@ -374,7 +368,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
BIG_GPU_MEMORY: 40
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_${{ matrix.config.backend }}_torch_cuda \
--report-log=tests_${{ matrix.config.backend }}_torch_cuda.log \
tests/quantization/${{ matrix.config.test_location }}
@@ -409,10 +403,9 @@ jobs:
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install -U bitsandbytes optimum_quanto
python -m uv pip install pytest-reportlog
uv pip install -e ".[quality]"
uv pip install -U bitsandbytes optimum_quanto
uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
@@ -423,7 +416,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
BIG_GPU_MEMORY: 40
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_pipeline_level_quant_torch_cuda \
--report-log=tests_pipeline_level_quant_torch_cuda.log \
tests/quantization/test_pipeline_level_quantization.py
@@ -523,11 +516,11 @@ jobs:
# - name: Install dependencies
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python -m pip install --upgrade pip uv
# ${CONDA_RUN} python -m uv pip install -e [quality,test]
# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} python -m uv pip install pytest-reportlog
# ${CONDA_RUN} pip install --upgrade pip uv
# ${CONDA_RUN} uv pip install -e ".[quality]"
# ${CONDA_RUN} uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} uv pip install pytest-reportlog
# - name: Environment
# shell: arch -arch arm64 bash {0}
# run: |
@@ -538,7 +531,7 @@ jobs:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# run: |
# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports
@@ -579,11 +572,11 @@ jobs:
# - name: Install dependencies
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python -m pip install --upgrade pip uv
# ${CONDA_RUN} python -m uv pip install -e [quality,test]
# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} python -m uv pip install pytest-reportlog
# ${CONDA_RUN} pip install --upgrade pip uv
# ${CONDA_RUN} uv pip install -e ".[quality]"
# ${CONDA_RUN} uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} uv pip install pytest-reportlog
# - name: Environment
# shell: arch -arch arm64 bash {0}
# run: |
@@ -594,7 +587,7 @@ jobs:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# run: |
# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports

View File

@@ -25,11 +25,8 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install --upgrade pip uv
python -m uv pip install -e .
python -m uv pip install pytest
pip install -e .
pip install pytest
- name: Check for soft dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
pytest tests/others/test_dependencies.py
pytest tests/others/test_dependencies.py

View File

@@ -42,7 +42,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check quality
run: make quality
@@ -62,7 +62,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check repo consistency
run: |
@@ -108,21 +108,18 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
uv pip install -e ".[quality]"
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run fast PyTorch Pipeline CPU tests
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/modular_pipelines

View File

@@ -33,8 +33,7 @@ jobs:
fetch-depth: 0
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python utils/print_env.py
@@ -90,19 +89,16 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install -e [quality,test]
python -m pip install accelerate
uv pip install -e ".[quality]"
uv pip install accelerate
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run all selected tests on CPU
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }}
pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }}
- name: Failure short reports
if: ${{ failure() }}
@@ -148,19 +144,16 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install -e [quality,test]
pip install -e [quality]
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run Hub tests for models, schedulers, and pipelines on a staging env
if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
HUGGINGFACE_CO_STAGING=true python -m pytest \
HUGGINGFACE_CO_STAGING=true pytest \
-m "is_staging_test" \
--make-reports=tests_${{ matrix.config.report }} \
tests

View File

@@ -38,7 +38,7 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check quality
run: make quality
@@ -58,7 +58,7 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check repo consistency
run: |
@@ -114,21 +114,18 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
uv pip install -e ".[quality]"
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run fast PyTorch Pipeline CPU tests
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/pipelines
@@ -136,8 +133,7 @@ jobs:
- name: Run fast PyTorch Model Scheduler CPU tests
if: ${{ matrix.config.framework == 'pytorch_models' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx and not Dependency" \
--make-reports=tests_${{ matrix.config.report }} \
tests/models tests/schedulers tests/others
@@ -145,9 +141,8 @@ jobs:
- name: Run example PyTorch CPU tests
if: ${{ matrix.config.framework == 'pytorch_examples' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install peft timm
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
uv pip install ".[training]"
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_${{ matrix.config.report }} \
examples
@@ -195,19 +190,16 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run Hub tests for models, schedulers, and pipelines on a staging env
if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
HUGGINGFACE_CO_STAGING=true python -m pytest \
HUGGINGFACE_CO_STAGING=true pytest \
-m "is_staging_test" \
--make-reports=tests_${{ matrix.config.report }} \
tests
@@ -249,27 +241,24 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
# TODO (sayakpaul, DN6): revisit `--no-deps`
python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
python -m uv pip install -U tokenizers
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
uv pip install -U tokenizers
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run fast PyTorch LoRA tests with PEFT
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v \
--make-reports=tests_peft_main \
tests/lora/
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v \
--make-reports=tests_models_lora_peft_main \
tests/models/ -k "lora"

View File

@@ -39,7 +39,7 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check quality
run: make quality
@@ -59,7 +59,7 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check repo consistency
run: |
@@ -88,8 +88,7 @@ jobs:
fetch-depth: 2
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python utils/print_env.py
@@ -130,10 +129,9 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip install -e ".[quality]"
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
- name: Environment
run: |
@@ -152,13 +150,13 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
if [ "${{ matrix.module }}" = "ip_adapters" ]; then
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
else
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx and $pattern" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
@@ -200,11 +198,10 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
- name: Environment
run: |
@@ -225,10 +222,10 @@ jobs:
run: |
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
if [ -z "$pattern" ]; then
python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
--make-reports=tests_torch_cuda_${{ matrix.module }}
else
python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
--make-reports=tests_torch_cuda_${{ matrix.module }}
fi
@@ -265,22 +262,19 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
python -m uv pip install -e [quality,test,training]
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run example tests on GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install timm
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}

View File

@@ -25,12 +25,8 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install --upgrade pip uv
python -m uv pip install -e .
python -m uv pip install torch torchvision torchaudio
python -m uv pip install pytest
pip install -e .
pip install torch torchvision torchaudio pytest
- name: Check for soft dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
pytest tests/others/test_dependencies.py
pytest tests/others/test_dependencies.py

View File

@@ -34,8 +34,7 @@ jobs:
fetch-depth: 2
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python utils/print_env.py
@@ -75,9 +74,8 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
python utils/print_env.py
@@ -87,7 +85,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
@@ -126,10 +124,9 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
@@ -141,7 +138,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_cuda_${{ matrix.module }} \
tests/${{ matrix.module }}
@@ -180,8 +177,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -190,7 +186,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -223,8 +219,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -232,7 +227,7 @@ jobs:
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
@@ -264,21 +259,18 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run example tests on GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install timm
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}

View File

@@ -60,19 +60,16 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run fast PyTorch CPU tests
if: ${{ matrix.config.framework == 'pytorch' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/
@@ -80,9 +77,8 @@ jobs:
- name: Run example PyTorch CPU tests
if: ${{ matrix.config.framework == 'pytorch_examples' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install peft timm
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
uv pip install ".[training]"
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_${{ matrix.config.report }} \
examples

View File

@@ -32,8 +32,7 @@ jobs:
fetch-depth: 2
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python utils/print_env.py
@@ -73,9 +72,8 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
python utils/print_env.py
@@ -85,7 +83,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
@@ -124,10 +122,9 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
@@ -139,7 +136,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
tests/${{ matrix.module }}
@@ -175,10 +172,9 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
@@ -190,7 +186,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_minimum_cuda \
tests/models/test_modeling_common.py \
@@ -235,8 +231,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -245,7 +240,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -278,8 +273,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -287,7 +281,7 @@ jobs:
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
@@ -321,21 +315,18 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run example tests on GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install timm
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}

View File

@@ -63,9 +63,8 @@ jobs:
- name: Install pytest
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft
uv pip install -e ".[quality]"
uv pip install peft
- name: Run tests
env:

.gitignore
View File

@@ -125,6 +125,9 @@ dmypy.json
.vs
.vscode
# Cursor
.cursor
# Pycharm
.idea

View File

@@ -1,56 +1,45 @@
FROM ubuntu:20.04
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
RUN apt-get -y update && apt-get install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1 \
zip \
wget
RUN apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
python3.10 \
python3-pip \
libgl1 \
zip \
wget \
python3.10-venv && \
rm -rf /var/lib/apt/lists
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
ENV UV_PYTHON=/usr/local/bin/python
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark \
--extra-index-url https://download.pytorch.org/whl/cpu && \
python3.10 -m uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
matplotlib \
setuptools==69.5.1 \
bitsandbytes \
torchao \
gguf \
optimum-quanto
RUN pip install uv
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
--extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
numpy==1.26.4 \
hf_transfer \
setuptools==69.5.1 \
bitsandbytes \
torchao \
gguf \
optimum-quanto
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
CMD ["/bin/bash"]

View File

@@ -1,50 +1,38 @@
FROM ubuntu:20.04
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
RUN apt-get -y update && apt-get install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1
RUN apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
python3.10 \
python3.10-dev \
python3-pip \
libgl1 \
python3.10-venv && \
rm -rf /var/lib/apt/lists
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
ENV UV_PYTHON=/usr/local/bin/python
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark \
--extra-index-url https://download.pytorch.org/whl/cpu && \
python3.10 -m uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers matplotlib \
hf_transfer
RUN pip install uv
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
--extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
numpy==1.26.4 \
hf_transfer
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
CMD ["/bin/bash"]

View File

@@ -2,11 +2,13 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
&& add-apt-repository ppa:deadsnakes/ppa && \
apt-get update
RUN apt install -y bash \
build-essential \
@@ -14,38 +16,34 @@ RUN apt install -y bash \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1 \
python3.10 \
python3.10-dev \
python3 \
python3-pip \
python3.10-venv && \
rm -rf /var/lib/apt/lists
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark && \
python3.10 -m pip install --no-cache-dir \
torchaudio
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
hf_transfer \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
pytorch-lightning \
pytorch-lightning \
hf_transfer
CMD ["/bin/bash"]

View File

@@ -2,6 +2,7 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive
ENV MINIMUM_SUPPORTED_TORCH_VERSION="2.1.0"
ENV MINIMUM_SUPPORTED_TORCHVISION_VERSION="0.16.0"
@@ -9,7 +10,8 @@ ENV MINIMUM_SUPPORTED_TORCHAUDIO_VERSION="2.1.0"
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
&& add-apt-repository ppa:deadsnakes/ppa && \
apt-get update
RUN apt install -y bash \
build-essential \
@@ -17,37 +19,34 @@ RUN apt install -y bash \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1 \
python3.10 \
python3.10-dev \
python3 \
python3-pip \
python3.10-venv && \
rm -rf /var/lib/apt/lists
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
RUN uv pip install --no-cache-dir \
torch==$MINIMUM_SUPPORTED_TORCH_VERSION \
torchvision==$MINIMUM_SUPPORTED_TORCHVISION_VERSION \
torchaudio==$MINIMUM_SUPPORTED_TORCHAUDIO_VERSION \
invisible_watermark && \
python3.10 -m pip install --no-cache-dir \
torchaudio==$MINIMUM_SUPPORTED_TORCHAUDIO_VERSION
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
hf_transfer \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
pytorch-lightning \
hf_transfer
CMD ["/bin/bash"]

View File

@@ -2,50 +2,49 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
&& add-apt-repository ppa:deadsnakes/ppa && \
apt-get update
RUN apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.10 \
python3.10-dev \
python3-pip \
python3.10-venv && \
rm -rf /var/lib/apt/lists
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1 \
python3 \
python3-pip \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark && \
python3.10 -m uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
hf_transfer \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
xformers \
hf_transfer
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
numpy==1.26.4 \
pytorch-lightning \
hf_transfer \
xformers
CMD ["/bin/bash"]

View File

@@ -49,7 +49,7 @@
isExpanded: false
sections:
- local: using-diffusers/weighted_prompts
title: Prompt techniques
title: Prompting
- local: using-diffusers/create_a_server
title: Create a server
- local: using-diffusers/batched_inference

View File

@@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License.
[IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder.
> [!TIP]
> Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter [loading](../../using-diffusers/loading_adapters#ip-adapter) guide, and you can see how to use it in the [usage](../../using-diffusers/ip_adapter) guide.
> Learn how to load and use an IP-Adapter checkpoint and image in the [IP-Adapter](../../using-diffusers/ip_adapter) guide.
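To give the tip above concrete shape, here is a minimal sketch of the loading pattern it points to; the SDXL checkpoint, the `h94/IP-Adapter` weights, and the reference image URL are illustrative choices rather than code from this guide:

```python
import torch
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image

# Load a base pipeline, then attach IP-Adapter weights to its image cross-attention layers.
pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
pipeline.set_ip_adapter_scale(0.6)  # balance the image prompt against the text prompt

# Any RGB image can serve as the image prompt.
image_prompt = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"
)
image = pipeline(
    prompt="a polar bear sitting in a chair drinking a milkshake",
    ip_adapter_image=image_prompt,
    num_inference_steps=50,
).images[0]
```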
## IPAdapterMixin

View File

@@ -34,7 +34,7 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
- [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, and unload LoRAs, and more.
> [!TIP]
> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide.
> To learn more about how to load LoRA weights, see the [LoRA](../../tutorials/using_peft_for_inference) loading guide.
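As a quick illustration of that workflow, a minimal sketch assuming an SDXL base model and a publicly available LoRA (both placeholder choices):

```python
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Load LoRA weights and name the adapter so it can be re-weighted, disabled, or mixed later.
pipeline.load_lora_weights(
    "CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy"
)
pipeline.set_adapters(["toy"], adapter_weights=[0.8])

image = pipeline("a toy face of a hacker with a hoodie", num_inference_steps=30).images[0]

# Optionally merge the adapter into the base weights for faster inference, and undo it afterwards.
pipeline.fuse_lora()
pipeline.unfuse_lora()
```

Naming the adapter up front is what later makes multi-adapter calls such as `set_adapters` with several adapter names possible.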
## LoraBaseMixin

View File

@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
# PEFT
Diffusers supports loading adapters such as [LoRA](../../using-diffusers/loading_adapters) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`], [`SD3Transformer2DModel`] to operate with an adapter.
Diffusers supports loading adapters such as [LoRA](../../tutorials/using_peft_for_inference) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`], [`SD3Transformer2DModel`] to operate with an adapter.
> [!TIP]
> Refer to the [Inference with PEFT](../../tutorials/using_peft_for_inference.md) tutorial for an overview of how to use PEFT in Diffusers for inference.
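A minimal sketch of what [`~loaders.peft.PeftAdapterMixin`] enables at the model level (attaching a fresh LoRA adapter to a UNet), assuming the Stable Diffusion v1-5 checkpoint and illustrative LoRA hyperparameters:

```python
from diffusers import UNet2DConditionModel
from peft import LoraConfig

unet = UNet2DConditionModel.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet"
)

# Inject a small-rank LoRA into the attention projections of the UNet in place.
lora_config = LoraConfig(
    r=4,
    lora_alpha=4,
    init_lora_weights="gaussian",
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],
)
unet.add_adapter(lora_config)

# Adapters added this way can be switched off and on again without reloading the model.
unet.disable_adapters()
unet.enable_adapters()
```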

View File

@@ -17,7 +17,7 @@ Textual Inversion is a training method for personalizing models by learning new
[`TextualInversionLoaderMixin`] provides a function for loading Textual Inversion embeddings from Diffusers and Automatic1111 into the text encoder and loading a special token to activate the embeddings.
> [!TIP]
> To learn more about how to load Textual Inversion embeddings, see the [Textual Inversion](../../using-diffusers/loading_adapters#textual-inversion) loading guide.
> To learn more about how to load Textual Inversion embeddings, see the [Textual Inversion](../../using-diffusers/textual_inversion_inference) loading guide.
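For orientation, a minimal sketch of loading a Textual Inversion embedding and triggering it with its special token; the base checkpoint and the `sd-concepts-library/cat-toy` concept are placeholder choices:

```python
import torch
from diffusers import StableDiffusionPipeline

pipeline = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Loading the embedding registers its trigger token with the tokenizer and text encoder.
pipeline.load_textual_inversion("sd-concepts-library/cat-toy")

# The learned concept is activated by using its token in the prompt.
image = pipeline("a <cat-toy> sitting on a bookshelf", num_inference_steps=30).images[0]
```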
## TextualInversionLoaderMixin

View File

@@ -17,7 +17,7 @@ This class is useful when *only* loading weights into a [`SD3Transformer2DModel`
The [`SD3Transformer2DLoadersMixin`] class currently only loads IP-Adapter weights, but will be used in the future to save weights and load LoRAs.
> [!TIP]
> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide.
> To learn more about how to load LoRA weights, see the [LoRA](../../tutorials/using_peft_for_inference) loading guide.
## SD3Transformer2DLoadersMixin

View File

@@ -17,7 +17,7 @@ Some training methods - like LoRA and Custom Diffusion - typically target the UN
The [`UNet2DConditionLoadersMixin`] class provides functions for loading and saving weights, fusing and unfusing LoRAs, disabling and enabling LoRAs, and setting and deleting adapters.
> [!TIP]
> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide.
> To learn more about how to load LoRA weights, see the [LoRA](../../tutorials/using_peft_for_inference) guide.
## UNet2DConditionLoadersMixin

View File

@@ -418,7 +418,7 @@ When unloading the Control LoRA weights, call `pipe.unload_lora_weights(reset_to
## IP-Adapter
> [!TIP]
> Check out [IP-Adapter](../../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work.
> Check out [IP-Adapter](../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work.
An IP-Adapter lets you prompt Flux with images, in addition to the text prompt. This is especially useful when describing complex concepts that are difficult to articulate through text alone and you have reference images.
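A rough sketch of that image-prompting workflow; the adapter repository, weight file name, image encoder ID, and sampling settings below are assumptions based on the public XLabs Flux IP-Adapter release rather than code from this guide:

```python
import torch
from diffusers import FluxPipeline
from diffusers.utils import load_image

pipeline = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Attach an IP-Adapter so a reference image can steer generation alongside the text prompt.
pipeline.load_ip_adapter(
    "XLabs-AI/flux-ip-adapter",            # assumed adapter repository
    weight_name="ip_adapter.safetensors",  # assumed weight file name
    image_encoder_pretrained_model_name_or_path="openai/clip-vit-large-patch14",
)
pipeline.set_ip_adapter_scale(1.0)

reference = load_image("https://example.com/reference.png")  # placeholder URL, substitute a real image
image = pipeline(
    prompt="a cat wearing the same outfit as in the reference image",
    ip_adapter_image=reference,
    guidance_scale=3.5,
    num_inference_steps=28,
).images[0]
```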

View File

@@ -21,7 +21,7 @@
## Available models
The following models are available for the [`HiDreamImagePipeline`](text-to-image) pipeline:
The following models are available for the [`HiDreamImagePipeline`] pipeline:
| Model name | Description |
|:---|:---|

View File

@@ -254,8 +254,8 @@ export_to_video(video, "output.mp4", fps=24)
pipeline.vae.enable_tiling()
def round_to_nearest_resolution_acceptable_by_vae(height, width):
height = height - (height % pipeline.vae_temporal_compression_ratio)
width = width - (width % pipeline.vae_temporal_compression_ratio)
height = height - (height % pipeline.vae_spatial_compression_ratio)
width = width - (width % pipeline.vae_spatial_compression_ratio)
return height, width
prompt = """
@@ -325,6 +325,95 @@ export_to_video(video, "output.mp4", fps=24)
</details>
- LTX-Video 0.9.8 distilled model is similar to the 0.9.7 variant. It is guidance and timestep-distilled, and similar inference code can be used as above. An improvement of this version is that it supports generating very long videos. Additionally, it supports using tone mapping to improve the quality of the generated video using the `tone_map_compression_ratio` parameter. The default value of `0.6` is recommended.
<details>
<summary>Show example code</summary>
```python
import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.pipelines.ltx.modeling_latent_upsampler import LTXLatentUpsamplerModel
from diffusers.utils import export_to_video, load_video
pipeline = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.8-13B-distilled", torch_dtype=torch.bfloat16)
# TODO: Update the checkpoint here once updated in LTX org
upsampler = LTXLatentUpsamplerModel.from_pretrained("a-r-r-o-w/LTX-0.9.8-Latent-Upsampler", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline(vae=pipeline.vae, latent_upsampler=upsampler).to(torch.bfloat16)
pipeline.to("cuda")
pipe_upsample.to("cuda")
pipeline.vae.enable_tiling()
def round_to_nearest_resolution_acceptable_by_vae(height, width):
height = height - (height % pipeline.vae_spatial_compression_ratio)
width = width - (width % pipeline.vae_spatial_compression_ratio)
return height, width
prompt = """The camera pans over a snow-covered mountain range, revealing a vast expanse of snow-capped peaks and valleys.The mountains are covered in a thick layer of snow, with some areas appearing almost white while others have a slightly darker, almost grayish hue. The peaks are jagged and irregular, with some rising sharply into the sky while others are more rounded. The valleys are deep and narrow, with steep slopes that are also covered in snow. The trees in the foreground are mostly bare, with only a few leaves remaining on their branches. The sky is overcast, with thick clouds obscuring the sun. The overall impression is one of peace and tranquility, with the snow-covered mountains standing as a testament to the power and beauty of nature."""
# prompt = """A woman walks away from a white Jeep parked on a city street at night, then ascends a staircase and knocks on a door. The woman, wearing a dark jacket and jeans, walks away from the Jeep parked on the left side of the street, her back to the camera; she walks at a steady pace, her arms swinging slightly by her sides; the street is dimly lit, with streetlights casting pools of light on the wet pavement; a man in a dark jacket and jeans walks past the Jeep in the opposite direction; the camera follows the woman from behind as she walks up a set of stairs towards a building with a green door; she reaches the top of the stairs and turns left, continuing to walk towards the building; she reaches the door and knocks on it with her right hand; the camera remains stationary, focused on the doorway; the scene is captured in real-life footage."""
negative_prompt = "bright colors, symbols, graffiti, watermarks, worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 480, 832
downscale_factor = 2 / 3
# num_frames = 161
num_frames = 361
# 1. Generate video at smaller resolution
downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
latents = pipeline(
prompt=prompt,
negative_prompt=negative_prompt,
width=downscaled_width,
height=downscaled_height,
num_frames=num_frames,
timesteps=[1000, 993, 987, 981, 975, 909, 725, 0.03],
decode_timestep=0.05,
decode_noise_scale=0.025,
image_cond_noise_scale=0.0,
guidance_scale=1.0,
guidance_rescale=0.7,
generator=torch.Generator().manual_seed(0),
output_type="latent",
).frames
# 2. Upscale generated video using latent upsampler with fewer inference steps
# The available latent upsampler upscales the height/width by 2x
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
latents=latents,
adain_factor=1.0,
tone_map_compression_ratio=0.6,
output_type="latent"
).frames
# 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
video = pipeline(
prompt=prompt,
negative_prompt=negative_prompt,
width=upscaled_width,
height=upscaled_height,
num_frames=num_frames,
denoise_strength=0.999, # Effectively, 4 inference steps out of 5
timesteps=[1000, 909, 725, 421, 0],
latents=upscaled_latents,
decode_timestep=0.05,
decode_noise_scale=0.025,
image_cond_noise_scale=0.0,
guidance_scale=1.0,
guidance_rescale=0.7,
generator=torch.Generator().manual_seed(0),
output_type="pil",
).frames[0]
# 4. Downscale the video to the expected resolution
video = [frame.resize((expected_width, expected_height)) for frame in video]
export_to_video(video, "output.mp4", fps=24)
```
</details>
- LTX-Video supports LoRAs with [`~loaders.LTXVideoLoraLoaderMixin.load_lora_weights`].
<details>
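A minimal, self-contained sketch of the pattern. The base checkpoint is one of the public LTX-Video releases, and the adapter repository id is a placeholder, so substitute a LoRA actually trained for LTX-Video.

```py
import torch
from diffusers import LTXConditionPipeline
from diffusers.utils import export_to_video

pipeline = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipeline.to("cuda")

# Placeholder repository id; replace with a LoRA adapter trained for LTX-Video.
pipeline.load_lora_weights("username/ltxv-style-lora", adapter_name="style")
pipeline.set_adapters(["style"], adapter_weights=[0.9])

video = pipeline(
    prompt="A cute cat lounges on a leaf in a pool during a peaceful summer afternoon",
    negative_prompt="worst quality, inconsistent motion, blurry, jittery, distorted",
    width=768,
    height=512,
    num_frames=97,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
).frames[0]
export_to_video(video, "lora_output.mp4", fps=24)
```
</details>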

View File

@@ -75,7 +75,7 @@ The following is a summary of the recommended checkpoints, all of which produce
| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | Depth | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference. |
| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1. |
| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity. |
| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image &nbsp\\(I\\)&nbsp is comprised of Albedo &nbsp\\(A\\), Diffuse shading &nbsp\\(S\\), and Non-diffuse residual &nbsp\\(R\\): &nbsp\\(I = A*S+R\\). |
| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image $I$ is comprised of Albedo $A$, Diffuse shading $S$, and Non-diffuse residual $R$: $I = A*S+R$. |
> [!TIP]
> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff

View File

@@ -32,7 +32,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
| [Attend-and-Excite](attend_and_excite) | text2image |
| [AudioLDM](audioldm) | text2audio |
| [AudioLDM2](audioldm2) | text2audio |
| [AuraFlow](auraflow) | text2image |
| [AuraFlow](aura_flow) | text2image |
| [BLIP Diffusion](blip_diffusion) | text2image |
| [Bria 3.2](bria_3_2) | text2image |
| [CogVideoX](cogvideox) | text2video |

View File

@@ -109,7 +109,7 @@ image_1 = load_image("https://huggingface.co/datasets/huggingface/documentation-
image_2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peng.png")
image = pipe(
image=[image_1, image_2],
prompt="put the penguin and the cat at a game show called "Qwen Edit Plus Games"",
prompt='''put the penguin and the cat at a game show called "Qwen Edit Plus Games"''',
num_inference_steps=50
).images[0]
```

View File

@@ -271,7 +271,7 @@ Check out the full script [here](https://gist.github.com/sayakpaul/508d89d7aad4f
Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.
Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`StableDiffusion3Pipeline`] for inference with bitsandbytes.
Refer to the [Quantization](../../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`StableDiffusion3Pipeline`] for inference with bitsandbytes.
```py
import torch
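# Illustrative continuation (an assumption, not prescribed by this guide): quantize the
# SD3 transformer to 4-bit with bitsandbytes, then assemble the pipeline around it.
from diffusers import BitsAndBytesConfig, SD3Transformer2DModel, StableDiffusion3Pipeline

quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
transformer = SD3Transformer2DModel.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
pipeline = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
).to("cuda")
image = pipeline("a photo of an astronaut riding a horse on mars").images[0]
```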

View File

@@ -29,7 +29,7 @@ The abstract from the paper is:
Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
Check out the [Text or image-to-video](../../../using-diffusers/text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
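A minimal sketch of what that looks like, assuming the public `stabilityai/stable-video-diffusion-img2vid-xt` checkpoint:

```py
import torch
from diffusers import StableVideoDiffusionPipeline

pipeline = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

# Split the UNet's feedforward layers into chunks instead of running them all at once.
pipeline.unet.enable_forward_chunking()
```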
## StableVideoDiffusionPipeline

View File

@@ -172,7 +172,7 @@ Here are some sample outputs:
Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
Check out the [Text or image-to-video](../../using-diffusers/text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
> [!TIP]
> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

View File

@@ -26,6 +26,10 @@ Utility and helper functions for working with 🤗 Diffusers.
[[autodoc]] utils.load_image
## load_video
[[autodoc]] utils.load_video
## export_to_gif
[[autodoc]] utils.export_to_gif

View File

@@ -81,6 +81,45 @@ with attention_backend("_flash_3_hub"):
> [!TIP]
> Most attention backends support `torch.compile` without graph breaks and can be used to further speed up inference.
## Checks
The attention dispatcher includes debugging checks that catch common errors before they cause problems.
1. Device checks verify that query, key, and value tensors live on the same device.
2. Data type checks confirm tensors have matching dtypes and use either bfloat16 or float16.
3. Shape checks validate tensor dimensions and prevent mixing attention masks with causal flags.
Enable these checks by setting the `DIFFUSERS_ATTN_CHECKS` environment variable. Checks add overhead to every attention operation, so they're disabled by default.
```bash
export DIFFUSERS_ATTN_CHECKS=yes
```
The checks now run before every attention operation.
```py
import torch
from diffusers.models.attention_dispatch import attention_backend, dispatch_attention_fn
query = torch.randn(1, 10, 8, 64, dtype=torch.bfloat16, device="cuda")
key = torch.randn(1, 10, 8, 64, dtype=torch.bfloat16, device="cuda")
value = torch.randn(1, 10, 8, 64, dtype=torch.bfloat16, device="cuda")
try:
with attention_backend("flash"):
output = dispatch_attention_fn(query, key, value)
print("✓ Flash Attention works with checks enabled")
except Exception as e:
print(f"✗ Flash Attention failed: {e}")
```
You can also configure the registry directly.
```py
from diffusers.models.attention_dispatch import _AttentionBackendRegistry
_AttentionBackendRegistry._checks_enabled = True
```
## Available backends
Refer to the table below for a complete list of available attention backends and their variants.

View File

@@ -548,4 +548,4 @@ Training the DeepFloyd IF model can be challenging, but here are some tips that
Congratulations on training your DreamBooth model! To learn more about how to use your new model, the following guide may be helpful:
- Learn how to [load a DreamBooth](../using-diffusers/loading_adapters) model for inference if you trained your model with LoRA.
- Learn how to [load a DreamBooth](../using-diffusers/dreambooth) model for inference if you trained your model with LoRA.

View File

@@ -75,7 +75,7 @@ accelerate launch train_lcm_distill_sd_wds.py \
Most of the parameters are identical to the parameters in the [Text-to-image](text2image#script-parameters) training guide, so you'll focus on the parameters that are relevant to latent consistency distillation in this guide.
- `--pretrained_teacher_model`: the path to a pretrained latent diffusion model to use as the teacher model
- `--pretrained_vae_model_name_or_path`: path to a pretrained VAE; the SDXL VAE is known to suffer from numerical instability, so this parameter allows you to specify an alternative VAE (like this [VAE]((https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)) by madebyollin which works in fp16)
- `--pretrained_vae_model_name_or_path`: path to a pretrained VAE; the SDXL VAE is known to suffer from numerical instability, so this parameter allows you to specify an alternative VAE (like this [VAE](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)) by madebyollin which works in fp16)
- `--w_min` and `--w_max`: the minimum and maximum guidance scale values for guidance scale sampling
- `--num_ddim_timesteps`: the number of timesteps for DDIM sampling
- `--loss_type`: the type of loss (L2 or Huber) to calculate for latent consistency distillation; Huber loss is generally preferred because it's more robust to outliers
@@ -245,5 +245,5 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl
Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:
- Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
- Learn how to use [LCMs for inference](../using-diffusers/inference_with_lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.

View File

@@ -198,5 +198,5 @@ image = pipeline("A naruto with blue eyes").images[0]
Congratulations on training a new model with LoRA! To learn more about how to use your new model, the following guides may be helpful:
- Learn how to [load different LoRA formats](../using-diffusers/loading_adapters#LoRA) trained using community trainers like Kohya and TheLastBen.
- Learn how to [load different LoRA formats](../tutorials/using_peft_for_inference) trained using community trainers like Kohya and TheLastBen.
- Learn how to use and [combine multiple LoRA's](../tutorials/using_peft_for_inference) with PEFT for inference.

View File

@@ -178,5 +178,5 @@ image.save("yoda-naruto.png")
Congratulations on training your own text-to-image model! To learn more about how to use your new model, the following guides may be helpful:
- Learn how to [load LoRA weights](../using-diffusers/loading_adapters#LoRA) for inference if you trained your model with LoRA.
- Learn how to [load LoRA weights](../tutorials/using_peft_for_inference) for inference if you trained your model with LoRA.
- Learn more about how certain parameters like guidance scale or techniques such as prompt weighting can help you control inference in the [Text-to-image](../using-diffusers/conditional_image_generation) task guide.

View File

@@ -203,5 +203,4 @@ image.save("cat-train.png")
Congratulations on training your own Textual Inversion model! 🎉 To learn more about how to use your new model, the following guides may be helpful:
- Learn how to [load Textual Inversion embeddings](../using-diffusers/loading_adapters) and also use them as negative embeddings.
- Learn how to use [Textual Inversion](textual_inversion_inference) for inference with Stable Diffusion 1/2 and Stable Diffusion XL.
- Learn how to [load Textual Inversion embeddings](../using-diffusers/textual_inversion_inference) and also use them as negative embeddings.

View File

@@ -16,24 +16,24 @@ Batch inference processes multiple prompts at a time to increase throughput. It
The downside is increased latency because you must wait for the entire batch to complete, and more GPU memory is required for large batches.
<hfoptions id="usage">
<hfoption id="text-to-image">
For text-to-image, pass a list of prompts to the pipeline.
For text-to-image, pass a list of prompts to the pipeline and for image-to-image, pass a list of images and prompts to the pipeline. The example below demonstrates batched text-to-image inference.
```py
import torch
import matplotlib.pyplot as plt
from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
torch_dtype=torch.float16,
device_map="cuda"
)
prompts = [
"cinematic photo of A beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
"Cinematic shot of a cozy coffee shop interior, warm pastel light streaming through a window where a cat rests. Shallow depth of field, glowing cups in soft focus, dreamy lofi-inspired mood, nostalgic tones, framed like a quiet film scene.",
"Polaroid-style photograph of a cozy coffee shop interior, bathed in warm pastel light. A cat sits on the windowsill near steaming mugs. Soft, slightly faded tones and dreamy blur evoke nostalgia, a lofi mood, and the intimate, imperfect charm of instant film.",
"Soft watercolor illustration of a cozy coffee shop interior, pastel washes of color filling the space. A cat rests peacefully on the windowsill as warm light glows through. Gentle brushstrokes create a dreamy, lofi-inspired atmosphere with whimsical textures and nostalgic calm.",
"Isometric pixel-art illustration of a cozy coffee shop interior in detailed 8-bit style. Warm pastel light fills the space as a cat rests on the windowsill. Blocky furniture and tiny mugs add charm, low-res retro graphics enhance the nostalgic, lofi-inspired game aesthetic."
]
images = pipeline(
@@ -52,6 +52,10 @@ plt.tight_layout()
plt.show()
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/batch-inference.png"/>
</div>
To generate multiple variations of one prompt, use the `num_images_per_prompt` argument.
```py
@@ -61,11 +65,18 @@ from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
torch_dtype=torch.float16,
device_map="cuda"
)
prompt="""
Isometric pixel-art illustration of a cozy coffee shop interior in detailed 8-bit style. Warm pastel light fills the
space as a cat rests on the windowsill. Blocky furniture and tiny mugs add charm, low-res retro graphics enhance the
nostalgic, lofi-inspired game aesthetic.
"""
images = pipeline(
prompt="pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics",
prompt=prompt,
num_images_per_prompt=4
).images
@@ -81,6 +92,10 @@ plt.tight_layout()
plt.show()
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/batch-inference-2.png"/>
</div>
Combine both approaches to generate different variations of different prompts.
```py
@@ -89,7 +104,7 @@ images = pipeline(
num_images_per_prompt=2,
).images
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
fig, axes = plt.subplots(2, 4, figsize=(12, 12))
axes = axes.flatten()
for i, image in enumerate(images):
@@ -101,126 +116,18 @@ plt.tight_layout()
plt.show()
```
</hfoption>
<hfoption id="image-to-image">
For image-to-image, pass a list of input images and prompts to the pipeline.
```py
import torch
from diffusers.utils import load_image
from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
input_images = [
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"),
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"),
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
]
prompts = [
"cinematic photo of a beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
]
images = pipeline(
prompt=prompts,
image=input_images,
guidance_scale=8.0,
strength=0.5
).images
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()
for i, image in enumerate(images):
axes[i].imshow(image)
axes[i].set_title(f"Image {i+1}")
axes[i].axis('off')
plt.tight_layout()
plt.show()
```
To generate multiple variations of one prompt, use the `num_images_per_prompt` argument.
```py
import torch
import matplotlib.pyplot as plt
from diffusers.utils import load_image
from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
input_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
images = pipeline(
prompt="pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics",
image=input_image,
num_images_per_prompt=4
).images
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()
for i, image in enumerate(images):
axes[i].imshow(image)
axes[i].set_title(f"Image {i+1}")
axes[i].axis('off')
plt.tight_layout()
plt.show()
```
Combine both approaches to generate different variations of different prompts.
```py
input_images = [
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"),
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
]
prompts = [
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
]
images = pipeline(
prompt=prompts,
image=input_images,
num_images_per_prompt=2,
).images
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()
for i, image in enumerate(images):
axes[i].imshow(image)
axes[i].set_title(f"Image {i+1}")
axes[i].axis('off')
plt.tight_layout()
plt.show()
```
</hfoption>
</hfoptions>
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/batch-inference-3.png"/>
</div>
## Deterministic generation
Enable reproducible batch generation by passing a list of [Generators](https://pytorch.org/docs/stable/generated/torch.Generator.html) to the pipeline and tying each `Generator` to a seed so you can reuse it.
Use a list comprehension to iterate over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch.
> [!TIP]
> Refer to the [Reproducibility](./reusing_seeds) docs to learn more about deterministic algorithms and the `Generator` object.
Don't multiply the `Generator` by the batch size because that only creates one `Generator` object that is used sequentially for each image in the batch.
Use a list comprehension to iterate over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch. Don't multiply the `Generator` by the batch size because that only creates one `Generator` object that is used sequentially for each image in the batch.
```py
generator = [torch.Generator(device="cuda").manual_seed(0)] * 3
@@ -234,14 +141,16 @@ from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
torch_dtype=torch.float16,
device_map="cuda"
)
generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(3)]
prompts = [
"cinematic photo of A beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
"Cinematic shot of a cozy coffee shop interior, warm pastel light streaming through a window where a cat rests. Shallow depth of field, glowing cups in soft focus, dreamy lofi-inspired mood, nostalgic tones, framed like a quiet film scene.",
"Polaroid-style photograph of a cozy coffee shop interior, bathed in warm pastel light. A cat sits on the windowsill near steaming mugs. Soft, slightly faded tones and dreamy blur evoke nostalgia, a lofi mood, and the intimate, imperfect charm of instant film.",
"Soft watercolor illustration of a cozy coffee shop interior, pastel washes of color filling the space. A cat rests peacefully on the windowsill as warm light glows through. Gentle brushstrokes create a dreamy, lofi-inspired atmosphere with whimsical textures and nostalgic calm.",
"Isometric pixel-art illustration of a cozy coffee shop interior in detailed 8-bit style. Warm pastel light fills the space as a cat rests on the windowsill. Blocky furniture and tiny mugs add charm, low-res retro graphics enhance the nostalgic, lofi-inspired game aesthetic."
]
images = pipeline(
@@ -261,4 +170,4 @@ plt.tight_layout()
plt.show()
```
You can use this to iteratively select an image associated with a seed and then improve on it by crafting a more detailed prompt.
You can use this to select an image associated with a seed and iteratively improve on it by crafting a more detailed prompt.

View File

@@ -70,32 +70,6 @@ For convenience, we provide a table to denote which methods are inference-only a
[InstructPix2Pix](../api/pipelines/pix2pix) is fine-tuned from Stable Diffusion to support editing input images. It takes as inputs an image and a prompt describing an edit, and it outputs the edited image.
InstructPix2Pix has been explicitly trained to work well with [InstructGPT](https://openai.com/blog/instruction-following/)-like prompts.
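As a rough sketch of that interface (the checkpoint, image, and settings below are illustrative choices rather than values taken from this page):

```py
import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.utils import load_image

pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png")

# The prompt is the edit instruction; image_guidance_scale controls how closely
# the output sticks to the input image.
edited = pipeline(
    "turn the cat into a tiger",
    image=image,
    num_inference_steps=20,
    image_guidance_scale=1.5,
).images[0]
```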
## Pix2Pix Zero
[Paper](https://huggingface.co/papers/2302.03027)
[Pix2Pix Zero](../api/pipelines/pix2pix_zero) allows modifying an image so that one concept or subject is translated to another one while preserving general image semantics.
The denoising process is guided from one conceptual embedding towards another conceptual embedding. The intermediate latents are optimized during the denoising process to push the attention maps towards reference attention maps. The reference attention maps are from the denoising process of the input image and are used to encourage semantic preservation.
Pix2Pix Zero can be used both to edit synthetic images as well as real images.
- To edit synthetic images, one first generates an image given a caption.
Next, we generate image captions for the concept that shall be edited and for the new target concept. We can use a model like [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) for this purpose. Then, "mean" prompt embeddings for both the source and target concepts are created via the text encoder. Finally, the pix2pix-zero algorithm is used to edit the synthetic image.
- To edit a real image, one first generates an image caption using a model like [BLIP](https://huggingface.co/docs/transformers/model_doc/blip). Then one applies DDIM inversion on the prompt and image to generate "inverse" latents. Similar to before, "mean" prompt embeddings for both source and target concepts are created and finally the pix2pix-zero algorithm in combination with the "inverse" latents is used to edit the image.
> [!TIP]
> Pix2Pix Zero is the first model that allows "zero-shot" image editing. This means that the model
> can edit an image in less than a minute on a consumer GPU as shown [here](../api/pipelines/pix2pix_zero#usage-example).
As mentioned above, Pix2Pix Zero includes optimizing the latents (and not any of the UNet, VAE, or the text encoder) to steer the generation toward a specific concept. This means that the overall
pipeline might require more memory than a standard [StableDiffusionPipeline](../api/pipelines/stable_diffusion/text2img).
> [!TIP]
> An important distinction between methods like InstructPix2Pix and Pix2Pix Zero is that the former
> involves fine-tuning the pre-trained weights while the latter does not. This means that you can
> apply Pix2Pix Zero to any of the available Stable Diffusion models.
## Attend and Excite
[Paper](https://huggingface.co/papers/2301.13826)
@@ -178,14 +152,6 @@ multi-concept training by design. Like DreamBooth and Textual Inversion, Custom
teach a pre-trained text-to-image diffusion model about new concepts to generate outputs involving the
concept(s) of interest.
## Model Editing
[Paper](https://huggingface.co/papers/2303.08084)
The [text-to-image model editing pipeline](../api/pipelines/model_editing) helps you mitigate some of the incorrect implicit assumptions a pre-trained text-to-image
diffusion model might make about the subjects present in the input prompt. For example, if you prompt Stable Diffusion to generate images for "A pack of roses", the roses in the generated images
are more likely to be red. This pipeline helps you change that assumption.
## DiffEdit
[Paper](https://huggingface.co/papers/2210.11427)

View File

@@ -257,7 +257,7 @@ LCMs are compatible with adapters like LoRA, ControlNet, T2I-Adapter, and Animat
### LoRA
[LoRA](../using-diffusers/loading_adapters#lora) adapters can be rapidly finetuned to learn a new style from just a few images and plugged into a pretrained model to generate images in that style.
[LoRA](../tutorials/using_peft_for_inference) adapters can be rapidly finetuned to learn a new style from just a few images and plugged into a pretrained model to generate images in that style.
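A minimal sketch of that pattern with the publicly available `latent-consistency/lcm-lora-sdxl` adapter (the prompt and step count are illustrative):

```py
import torch
from diffusers import DiffusionPipeline, LCMScheduler

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16"
).to("cuda")
pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)

# Plug in the LCM-LoRA so the base model can generate in just a few steps.
pipeline.load_lora_weights("latent-consistency/lcm-lora-sdxl")

image = pipeline(
    "close-up photography of an old man standing in the rain at night, in a street lit by lamps",
    num_inference_steps=4,
    guidance_scale=1.0,
).images[0]
```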
<hfoptions id="lcm-lora">
<hfoption id="LCM">

View File

@@ -18,7 +18,7 @@ Trajectory Consistency Distillation (TCD) enables a model to generate higher qua
The major advantages of TCD are:
- Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of [DPM-Solver++(2S)](../../api/schedulers/multistep_dpm_solver) with Stable Diffusion XL (SDXL). There is no additional discriminator or LPIPS supervision included during TCD training.
- Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of [DPM-Solver++(2S)](../api/schedulers/multistep_dpm_solver) with Stable Diffusion XL (SDXL). There is no additional discriminator or LPIPS supervision included during TCD training.
- Flexible Inference Steps: The inference steps for TCD sampling can be freely adjusted without adversely affecting the image quality.
@@ -166,7 +166,7 @@ image = pipe(
TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
> [!TIP]
> Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
> Check out the [Merge LoRAs](../tutorials/using_peft_for_inference#merge) guide to learn more about efficient merging methods.
```python
import torch

View File

@@ -280,7 +280,7 @@ refiner = DiffusionPipeline.from_pretrained(
```
> [!WARNING]
> You can use SDXL refiner with a different base model. For example, you can use the [Hunyuan-DiT](../../api/pipelines/hunyuandit) or [PixArt-Sigma](../../api/pipelines/pixart_sigma) pipelines to generate images with better prompt adherence. Once you have generated an image, you can pass it to the SDXL refiner model to enhance final generation quality.
> You can use SDXL refiner with a different base model. For example, you can use the [Hunyuan-DiT](../api/pipelines/hunyuandit) or [PixArt-Sigma](../api/pipelines/pixart_sigma) pipelines to generate images with better prompt adherence. Once you have generated an image, you can pass it to the SDXL refiner model to enhance final generation quality.
Generate an image from the base model, and set the model output to **latent** space:
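A rough sketch of that handoff, assuming `base` and `refiner` are the SDXL pipelines set up earlier; the 40 steps and the 0.8 split point are illustrative values:

```py
prompt = "A majestic lion jumping from a big stone at night"

# Run the base model for the first portion of the schedule and keep the output as latents.
image = base(
    prompt=prompt,
    num_inference_steps=40,
    denoising_end=0.8,
    output_type="latent",
).images

# Hand the latents to the refiner, which finishes the remaining denoising steps.
image = refiner(
    prompt=prompt,
    num_inference_steps=40,
    denoising_start=0.8,
    image=image,
).images[0]
```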

View File

@@ -10,423 +10,96 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# Prompt techniques
[[open-in-colab]]
Prompts are important because they describe what you want a diffusion model to generate. The best prompts are detailed, specific, and well-structured to help the model realize your vision. But crafting a great prompt takes time and effort and sometimes it may not be enough because language and words can be imprecise. This is where you need to boost your prompt with other techniques, such as prompt enhancing and prompt weighting, to get the results you want.
# Prompting
This guide will show you how you can use these prompt techniques to generate high-quality images with lower effort and adjust the weight of certain keywords in a prompt.
Prompts describe what a model should generate. Good prompts are detailed, specific, and structured, and they produce better images and videos.
## Prompt engineering
This guide shows you how to write effective prompts and introduces techniques that make them stronger.
> [!TIP]
> This is not an exhaustive guide on prompt engineering, but it will help you understand the necessary parts of a good prompt. We encourage you to continue experimenting with different prompts and combine them in new ways to see what works best. As you write more prompts, you'll develop an intuition for what works and what doesn't!
## Writing good prompts
New diffusion models do a pretty good job of generating high-quality images from a basic prompt, but it is still important to create a well-written prompt to get the best results. Here are a few tips for writing a good prompt:
Every effective prompt needs three core elements.
1. What is the image *medium*? Is it a photo, a painting, a 3D illustration, or something else?
2. What is the image *subject*? Is it a person, animal, object, or scene?
3. What *details* would you like to see in the image? This is where you can get really creative and have a lot of fun experimenting with different words to bring your image to life. For example, what is the lighting like? What is the vibe and aesthetic? What kind of art or illustration style are you looking for? The more specific and precise words you use, the better the model will understand what you want to generate.
1. <span class="underline decoration-sky-500 decoration-2 underline-offset-4">Subject</span> - what you want to generate. Start your prompt here.
2. <span class="underline decoration-pink-500 decoration-2 underline-offset-4">Style</span> - the medium or aesthetic. How should it look?
3. <span class="underline decoration-green-500 decoration-2 underline-offset-4">Context</span> - details about actions, setting, and mood.
Use these elements as a structured narrative, not a keyword list. Modern models understand language better than keyword matching. Start simple, then add details.
Context is especially important for creating better prompts. Try adding lighting, artistic details, and mood.
<div class="flex gap-4">
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/plain-prompt.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"A photo of a banana-shaped couch in a living room"</figcaption>
<div class="flex-1 text-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ok-prompt.png" class="w-full h-auto object-cover rounded-lg">
<figcaption class="mt-2 text-sm text-gray-500">A <span class="underline decoration-sky-500 decoration-2 underline-offset-1">cute cat</span> <span class="underline decoration-pink-500 decoration-2 underline-offset-1">lounges on a leaf in a pool during a peaceful summer afternoon</span>, in <span class="underline decoration-green-500 decoration-2 underline-offset-1">lofi art style, illustration</span>.</figcaption>
</div>
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"A vibrant yellow banana-shaped couch sits in a cozy living room, its curve cradling a pile of colorful cushions. on the wooden floor, a patterned rug adds a touch of eclectic charm, and a potted plant sits in the corner, reaching towards the sunlight filtering through the windows"</figcaption>
<div class="flex-1 text-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/better-prompt.png" class="w-full h-auto object-cover rounded-lg"/>
<figcaption class="mt-2 text-sm text-gray-500">A cute cat lounges on a floating leaf in a sparkling pool during a peaceful summer afternoon. Clear reflections ripple across the water, with sunlight casting soft, smooth highlights. The illustration is detailed and polished, with elegant lines and harmonious colors, evoking a relaxing, serene, and whimsical lofi mood, anime-inspired and visually comforting.</figcaption>
</div>
</div>
## Prompt enhancing with GPT2
Prompt enhancing is a technique for quickly improving prompt quality without spending too much effort constructing one. It uses a model like GPT2 pretrained on Stable Diffusion text prompts to automatically enrich a prompt with additional important keywords to generate high-quality images.
The technique works by curating a list of specific keywords and forcing the model to generate those words to enhance the original prompt. This way, your prompt can be "a cat" and GPT2 can enhance the prompt to "cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic".
Be specific and add context. Use photography terms like lens type, focal length, camera angles, and depth of field.
> [!TIP]
> You should also use a [*offset noise*](https://www.crosslabs.org//blog/diffusion-with-offset-noise) LoRA to improve the contrast in bright and dark images and create better lighting overall. This [LoRA](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_offset_example-lora_1.0.safetensors) is available from [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0).
Start by defining certain styles and a list of words (you can check out a more comprehensive list of [words](https://hf.co/LykosAI/GPT-Prompt-Expansion-Fooocus-v2/blob/main/positive.txt) and [styles](https://github.com/lllyasviel/Fooocus/tree/main/sdxl_styles) used by Fooocus) to enhance a prompt with.
```py
import torch
from transformers import GenerationConfig, GPT2LMHeadModel, GPT2Tokenizer, LogitsProcessor, LogitsProcessorList
from diffusers import StableDiffusionXLPipeline
styles = {
"cinematic": "cinematic film still of {prompt}, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"anime": "anime artwork of {prompt}, anime style, key visual, vibrant, studio anime, highly detailed",
"photographic": "cinematic photo of {prompt}, 35mm photograph, film, professional, 4k, highly detailed",
"comic": "comic of {prompt}, graphic illustration, comic art, graphic novel art, vibrant, highly detailed",
"lineart": "line art drawing {prompt}, professional, sleek, modern, minimalist, graphic, line art, vector graphics",
"pixelart": " pixel-art {prompt}, low-res, blocky, pixel art style, 8-bit graphics",
}
words = [
"aesthetic", "astonishing", "beautiful", "breathtaking", "composition", "contrasted", "epic", "moody", "enhanced",
"exceptional", "fascinating", "flawless", "glamorous", "glorious", "illumination", "impressive", "improved",
"inspirational", "magnificent", "majestic", "hyperrealistic", "smooth", "sharp", "focus", "stunning", "detailed",
"intricate", "dramatic", "high", "quality", "perfect", "light", "ultra", "highly", "radiant", "satisfying",
"soothing", "sophisticated", "stylish", "sublime", "terrific", "touching", "timeless", "wonderful", "unbelievable",
"elegant", "awesome", "amazing", "dynamic", "trendy",
]
```
You may have noticed in the `words` list, there are certain words that can be paired together to create something more meaningful. For example, the words "high" and "quality" can be combined to create "high quality". Let's pair these words together and remove the words that can't be paired.
```py
word_pairs = ["highly detailed", "high quality", "enhanced quality", "perfect composition", "dynamic light"]
def find_and_order_pairs(s, pairs):
words = s.split()
found_pairs = []
for pair in pairs:
pair_words = pair.split()
if pair_words[0] in words and pair_words[1] in words:
found_pairs.append(pair)
words.remove(pair_words[0])
words.remove(pair_words[1])
for word in words[:]:
for pair in pairs:
if word in pair.split():
words.remove(word)
break
ordered_pairs = ", ".join(found_pairs)
remaining_s = ", ".join(words)
return ordered_pairs, remaining_s
```
Next, implement a custom [`~transformers.LogitsProcessor`] class that assigns tokens in the `words` list a value of 0 and assigns tokens not in the `words` list a negative value so they aren't picked during generation. This way, generation is biased towards words in the `words` list. After a word from the list is used, it is also assigned a negative value so it isn't picked again.
```py
class CustomLogitsProcessor(LogitsProcessor):
def __init__(self, bias):
super().__init__()
self.bias = bias
def __call__(self, input_ids, scores):
if len(input_ids.shape) == 2:
last_token_id = input_ids[0, -1]
self.bias[last_token_id] = -1e10
return scores + self.bias
word_ids = [tokenizer.encode(word, add_prefix_space=True)[0] for word in words]
bias = torch.full((tokenizer.vocab_size,), -float("Inf")).to("cuda")
bias[word_ids] = 0
processor = CustomLogitsProcessor(bias)
processor_list = LogitsProcessorList([processor])
```
Combine the prompt and the `cinematic` style prompt defined in the `styles` dictionary earlier.
```py
prompt = "a cat basking in the sun on a roof in Turkey"
style = "cinematic"
prompt = styles[style].format(prompt=prompt)
prompt
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"
```
Load a GPT2 tokenizer and model from the [Gustavosta/MagicPrompt-Stable-Diffusion](https://huggingface.co/Gustavosta/MagicPrompt-Stable-Diffusion) checkpoint (this specific checkpoint is trained to generate prompts) to enhance the prompt.
```py
tokenizer = GPT2Tokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
model = GPT2LMHeadModel.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion", torch_dtype=torch.float16).to(
"cuda"
)
model.eval()
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
token_count = inputs["input_ids"].shape[1]
max_new_tokens = 50 - token_count
generation_config = GenerationConfig(
penalty_alpha=0.7,
top_k=50,
eos_token_id=model.config.eos_token_id,
pad_token_id=model.config.eos_token_id,
pad_token=model.config.pad_token_id,
do_sample=True,
)
with torch.no_grad():
generated_ids = model.generate(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
max_new_tokens=max_new_tokens,
generation_config=generation_config,
logits_processor=processor_list,
)
```
Then you can combine the input prompt and the generated prompt. Feel free to take a look at what the generated prompt (`generated_part`) is, the word pairs that were found (`pairs`), and the remaining words (`words`). This is all packed together in the `enhanced_prompt`.
```py
output_tokens = [tokenizer.decode(generated_id, skip_special_tokens=True) for generated_id in generated_ids]
input_part, generated_part = output_tokens[0][: len(prompt)], output_tokens[0][len(prompt) :]
pairs, words = find_and_order_pairs(generated_part, word_pairs)
formatted_generated_part = pairs + ", " + words
enhanced_prompt = input_part + ", " + formatted_generated_part
enhanced_prompt
["cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic"]
```
Finally, load a pipeline and the offset noise LoRA with a *low weight* to generate an image with the enhanced prompt.
```py
pipeline = StableDiffusionXLPipeline.from_pretrained(
"RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, variant="fp16"
).to("cuda")
pipeline.load_lora_weights(
"stabilityai/stable-diffusion-xl-base-1.0",
weight_name="sd_xl_offset_example-lora_1.0.safetensors",
adapter_name="offset",
)
pipeline.set_adapters(["offset"], adapter_weights=[0.2])
image = pipeline(
enhanced_prompt,
width=1152,
height=896,
guidance_scale=7.5,
num_inference_steps=25,
).images[0]
image
```
<div class="flex gap-4">
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"a cat basking in the sun on a roof in Turkey"</figcaption>
</div>
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/enhanced-prompt.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"</figcaption>
</div>
</div>
> Try a [prompt enhancer](https://huggingface.co/models?sort=downloads&search=prompt+enhancer) to help improve your prompt structure.
## Prompt weighting
Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which gets turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).
Prompt weighting makes some words stronger and others weaker. It scales attention scores so you control how much influence each concept has.
Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt embeddings is to use [Stable Diffusion Long Prompt Weighted Embedding](https://github.com/xhinker/sd_embed) (sd_embed). Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [negative_prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
Diffusers handles this through `prompt_embeds` and `pooled_prompt_embeds` arguments which take scaled text embedding vectors. Use the [sd_embed](https://github.com/xhinker/sd_embed) library to generate these embeddings. It also supports longer prompts.
> [!TIP]
> If your favorite pipeline doesn't have a `prompt_embeds` parameter, please open an [issue](https://github.com/huggingface/diffusers/issues/new/choose) so we can add it!
This guide will show you how to weight your prompts with sd_embed.
Before you begin, make sure you have the latest version of sd_embed installed:
```bash
pip install git+https://github.com/xhinker/sd_embed.git@main
```
For this example, let's use [`StableDiffusionXLPipeline`].
> [!NOTE]
> The sd_embed library only supports Stable Diffusion, Stable Diffusion XL, Stable Diffusion 3, Stable Cascade, and Flux. Prompt weighting doesn't necessarily help for newer models like Flux which already has very good prompt adherence.
```py
from diffusers import StableDiffusionXLPipeline, UniPCMultistepScheduler
import torch
pipe = StableDiffusionXLPipeline.from_pretrained("Lykon/dreamshaper-xl-1-0", torch_dtype=torch.float16)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
!uv pip install git+https://github.com/xhinker/sd_embed.git@main
```
To upweight or downweight a concept, surround the text with parentheses. More parentheses apply a heavier weight to the text. You can also append a numerical multiplier to the text to indicate how much you want to increase or decrease its weight.
Format weighted text with numerical multipliers or parentheses. More parentheses mean stronger weighting.
| format | multiplier |
|---|---|
| `(hippo)` | increase by 1.1x |
| `((hippo))` | increase by 1.21x |
| `(hippo:1.5)` | increase by 1.5x |
| `(hippo:0.5)` | decrease by 4x |
| `(cat)` | increase by 1.1x |
| `((cat))` | increase by 1.21x |
| `(cat:1.5)` | increase by 1.5x |
| `(cat:0.5)` | decrease by 4x |
Create a prompt and use a combination of parentheses and numerical multipliers to upweight various text.
Create a weighted prompt and pass it to [get_weighted_text_embeddings_sdxl](https://github.com/xhinker/sd_embed/blob/4a47f71150a22942fa606fb741a1c971d95ba56f/src/sd_embed/embedding_funcs.py#L405) to generate embeddings.
> [!TIP]
> You could also pass negative prompts to `negative_prompt_embeds` and `negative_pooled_prompt_embeds`.
```py
import torch
from diffusers import DiffusionPipeline
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sdxl
prompt = """A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
```
Use the `get_weighted_text_embeddings_sdxl` function to generate the prompt embeddings and the negative prompt embeddings. It'll also generate the pooled and negative pooled prompt embeddings since you're using the SDXL model.
> [!TIP]
> You can safely ignore the error message below about the token index length exceeding the model's maximum sequence length. All your tokens will be used in the embedding process.
>
> ```
> Token indices sequence length is longer than the specified maximum sequence length for this model
> ```
```py
(
prompt_embeds,
prompt_neg_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds
) = get_weighted_text_embeddings_sdxl(
pipe,
prompt=prompt,
neg_prompt=neg_prompt
pipeline = DiffusionPipeline.from_pretrained(
"Lykon/dreamshaper-xl-1-0", torch_dtype=torch.bfloat16, device_map="cuda"
)
image = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=prompt_neg_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
num_inference_steps=30,
height=1024,
width=1024 + 512,
guidance_scale=4.0,
generator=torch.Generator("cuda").manual_seed(2)
).images[0]
image
prompt = """
A (cute cat:1.4) lounges on a (floating leaf:1.2) in a (sparkling pool:1.1) during a peaceful summer afternoon.
Gentle ripples reflect pastel skies, while (sunlight:1.1) casts soft highlights. The illustration is smooth and polished
with elegant, sketchy lines and subtle gradients, evoking a ((whimsical, nostalgic, dreamy lofi atmosphere:2.0)),
(anime-inspired:1.6), calming, comforting, and visually serene.
"""
prompt_embeds, _, pooled_prompt_embeds, *_ = get_weighted_text_embeddings_sdxl(pipeline, prompt=prompt)
```
Pass the embeddings to `prompt_embeds` and `pooled_prompt_embeds` to generate your image.
```py
image = pipeline(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds).images[0]
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_sdxl.png"/>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/prompt-embed-sdxl.png"/>
</div>
> [!TIP]
> Refer to the [sd_embed](https://github.com/xhinker/sd_embed) repository for additional details about long prompt weighting for FLUX.1, Stable Cascade, and Stable Diffusion 1.5.
### Textual inversion
[Textual inversion](../training/text_inversion) is a technique for learning a specific concept from some images which you can use to generate new images conditioned on that concept.
Create a pipeline and use the [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] function to load the textual inversion embeddings (feel free to browse the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer) for 100+ trained concepts):
```py
import torch
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5",
torch_dtype=torch.float16,
).to("cuda")
pipe.load_textual_inversion("sd-concepts-library/midjourney-style")
```
Add the `<midjourney-style>` text to the prompt to trigger the textual inversion.
```py
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
prompt = """<midjourney-style> A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
```
Use the `get_weighted_text_embeddings_sd15` function to generate the prompt embeddings and the negative prompt embeddings.
```py
(
prompt_embeds,
prompt_neg_embeds,
) = get_weighted_text_embeddings_sd15(
pipe,
prompt=prompt,
neg_prompt=neg_prompt
)
image = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=prompt_neg_embeds,
height=768,
width=896,
guidance_scale=4.0,
generator=torch.Generator("cuda").manual_seed(2)
).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_textual_inversion.png"/>
</div>
### DreamBooth
[DreamBooth](../training/dreambooth) is a technique for generating contextualized images of a subject given just a few images of the subject to train on. It is similar to textual inversion, but DreamBooth trains the full model whereas textual inversion only fine-tunes the text embeddings. This means you should use [`~DiffusionPipeline.from_pretrained`] to load the DreamBooth model (feel free to browse the [Stable Diffusion Dreambooth Concepts Library](https://huggingface.co/sd-dreambooth-library) for 100+ trained models):
```py
import torch
from diffusers import DiffusionPipeline, UniPCMultistepScheduler
pipe = DiffusionPipeline.from_pretrained("sd-dreambooth-library/dndcoverart-v1", torch_dtype=torch.float16).to("cuda")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
```
Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:
```py
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
prompt = """dndcoverart of A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
(
    prompt_embeds,
    prompt_neg_embeds,
) = get_weighted_text_embeddings_sd15(
    pipe,
    prompt=prompt,
    neg_prompt=neg_prompt,
)
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_dreambooth.png"/>
</div>
Prompt weighting works with [Textual inversion](./textual_inversion_inference) and [DreamBooth](./dreambooth) adapters too.

View File

@@ -280,5 +280,5 @@ This is really what 🧨 Diffusers is designed for: to make it intuitive and eas
For your next steps, feel free to:
* Learn how to [build and contribute a pipeline](../using-diffusers/contribute_pipeline) to 🧨 Diffusers. We can't wait and see what you'll come up with!
* Learn how to [build and contribute a pipeline](../conceptual/contribution) to 🧨 Diffusers. We can't wait and see what you'll come up with!
* Explore [existing pipelines](../api/pipelines/overview) in the library, and see if you can deconstruct and build a pipeline from scratch using the models and schedulers separately.

View File

@@ -14,51 +14,47 @@ specific language governing permissions and limitations under the License.
## Preamble [[preamble]]
[Diffusers](https://huggingface.co/docs/diffusers/index) provides pretrained diffusion models and serves as a modular toolbox for inference and training.
[Diffusers](https://huggingface.co/docs/diffusers/index) provides pretrained diffusion models, and serves as a modular toolbox for inference and training.
Considering the real-world applications of this technology and the negative impact it can have on society, we believe it is important to provide ethical guidelines for the development, user contributions, and use of the Diffusers library.
The risks of using this technology are still under review, but to name a few: copyright issues for artists; misuse of deepfakes; generation of sexual content in inappropriate contexts; impersonation without consent; and harmful social biases that perpetuate the oppression of minority groups.
We will continuously track risks and adapt the following guidelines based on the community's responses and valuable feedback.
Given the real-world use cases of this technology and the potential negative impact it can have on society, we believe it is important to provide ethical guidelines for the development, user contributions, and use of the Diffusers library.
The risks associated with the use of this technology are still under review, but to name a few: copyright issues for artists, misuse of deepfakes, generation of sexual content in inappropriate contexts, non-consensual impersonation, and harmful social biases that perpetuate the oppression of minority groups.
We will continuously track these risks and adjust the guidelines below based on the community's responses and valuable feedback.
## Scope [[scope]]
The Diffusers community will apply the following ethical guidelines to the project's development and help coordinate community contributions, especially on sensitive topics related to ethical concerns.
The Diffusers community applies the following ethical guidelines to the project's development, and will help coordinate community contributions, especially on sensitive topics related to ethical concerns.
## Ethical guidelines [[ethical-guidelines]]
The following ethical guidelines apply in general, but we will apply them with priority when making technical choices on ethically sensitive issues. Furthermore, we commit to adjusting these ethical principles as new risks emerge with the latest developments in the technology.
The following ethical guidelines apply in general, but they take priority in technical choices that touch on ethically sensitive issues. We also commit to continuously adjusting these ethical principles as new risks emerge with the latest developments in the technology.
- **Transparency**: we are committed to being transparent in managing PRs, explaining our choices to users, and making technical decisions.
- **Transparency**: we are committed to maintaining transparency in managing PRs, explaining the reasons behind our choices to users, and throughout our technical decision-making.
- **Consistency**: we are committed to guaranteeing users the same level of attention in project management and to keeping the project technically stable and consistent.
- **Consistency**: we are committed to guaranteeing all users the same level of attention in project management, and to keeping the project technically stable and consistent.
- **Simplicity**: to make the Diffusers library easy to use and leverage, we are committed to keeping the project's goals lean and coherent.
- **Simplicity**: so that the Diffusers library can be used and leveraged easily, we are committed to keeping the project's goals lean and coherent.
- **Accessibility**: the Diffusers project lowers the barrier to entry for contributors, who can take part in running the project even without technical expertise. Doing so makes research artifacts more accessible to the community.
- **Accessibility**: the Diffusers project lowers the barrier to entry so that anyone can contribute even without technical expertise. This makes research artifacts more accessible to the community.
- **Reproducibility**: we aim to be transparent about the reproducibility of upstream code, models, and datasets provided through the Diffusers library.
- **Responsibility**: together with the community and through teamwork, we hold a joint responsibility for anticipating and mitigating the potential risks and dangers of this technology.
- **Reproducibility**: we aim to be transparent about the reproducibility of the upstream code, models, and datasets made available through the Diffusers library.
- **Responsibility**: together with the community and through teamwork, we take joint responsibility for anticipating and mitigating the potential risks of this technology.
## Examples of implementations: safety features and mechanisms [[examples-of-implementations-safety-features-and-mechanisms]]
The team works to provide technical and non-technical tools to address the potential ethical and social risks associated with diffusion technology. In addition, the community's involvement is invaluable for implementing these features and raising awareness with us.
The team strives to provide technical and non-technical tools to address the potential ethical and social risks associated with diffusion technology. The community's involvement is also essential to implementing these features and raising awareness.
- [**Community tab**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): it enables the community to discuss a project and collaborate better.
- [**Community tab**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): it lets the community discuss projects and supports better collaboration.
- **Bias exploration and evaluation**: the Hugging Face team provides a [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer) to demonstrate the biases in Stable Diffusion models interactively. In this sense, we support and encourage bias exploration and evaluation.
- **Bias exploration and evaluation**: the Hugging Face team provides a [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer) to demonstrate the biases in Stable Diffusion models interactively. We support and encourage this kind of bias exploration and evaluation.
- **Encouraging safety in deployment**
- [**Safe Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_safe): it mitigates the problem that models like Stable Diffusion, trained on unfiltered web-crawled datasets, are prone to inappropriate degeneration. Related paper: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://huggingface.co/papers/2211.05105).
- [**Safe Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_safe): it mitigates the tendency of models like Stable Diffusion, trained on unfiltered web-crawled datasets, to degenerate inappropriately. Related paper: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://huggingface.co/papers/2211.05105).
- [**Safety checker**](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py): after an image is generated, it checks and compares, in embedding space, the probability that the image matches a set of hard-coded harmful concept classes. The harmful concepts are intentionally hidden to prevent reverse engineering.
- [**Safety checker**](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py): it checks and compares, in embedding space, the probability that a generated image matches a set of hard-coded harmful concept classes. The harmful concepts are intentionally hidden to prevent reverse engineering.
- **Staged release on the Hub**: in particularly sensitive situations, access to some repositories should be restricted. Staged releases are an intermediate step that gives repository authors more control over how their work is used.
- **Staged release on the Hub**: in particularly sensitive situations, access to some repositories can be restricted. Staged releases are an intermediate step that gives repository authors more control over how their work is used.
- **Licensing**: new types of licensing such as [OpenRAILs](https://huggingface.co/blog/open_rail) make it possible to guarantee free access while placing a set of restrictions for responsible use.
- **Licensing**: new types of licenses such as [OpenRAILs](https://huggingface.co/blog/open_rail) make it possible to guarantee free access while placing a set of restrictions for more responsible use.

View File

@@ -1338,7 +1338,7 @@ def main(args):
batch["pixel_values"] = batch["pixel_values"].to(
accelerator.device, non_blocking=True, dtype=vae.dtype
)
latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
if train_dataset.custom_instance_prompts:
with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
prompt_embeds, prompt_embeds_mask = compute_text_embeddings(

View File

@@ -5,4 +5,4 @@ datasets>=2.19.1
ftfy
tensorboard
Jinja2
peft==0.7.0
peft>=0.17.0

View File

@@ -5,4 +5,4 @@ ftfy
tensorboard
Jinja2
datasets
peft==0.7.0
peft>=0.17.0

View File

@@ -369,6 +369,15 @@ def get_spatial_latent_upsampler_config(version: str) -> Dict[str, Any]:
"spatial_upsample": True,
"temporal_upsample": False,
}
elif version == "0.9.8":
config = {
"in_channels": 128,
"mid_channels": 512,
"num_blocks_per_stage": 4,
"dims": 3,
"spatial_upsample": True,
"temporal_upsample": False,
}
else:
raise ValueError(f"Unsupported version: {version}")
return config
@@ -402,7 +411,7 @@ def get_args():
"--version",
type=str,
default="0.9.0",
choices=["0.9.0", "0.9.1", "0.9.5", "0.9.7"],
choices=["0.9.0", "0.9.1", "0.9.5", "0.9.7", "0.9.8"],
help="Version of the LTX model",
)
return parser.parse_args()

View File

@@ -145,6 +145,7 @@ _deps = [
"black",
"phonemizer",
"opencv-python",
"timm",
]
# this is a lookup table with items like:
@@ -218,7 +219,7 @@ class DepsTableUpdateCommand(Command):
extras = {}
extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder")
extras["docs"] = deps_list("hf-doc-builder")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft", "timm")
extras["test"] = deps_list(
"compel",
"GitPython",

View File

@@ -386,10 +386,14 @@ else:
_import_structure["modular_pipelines"].extend(
[
"FluxAutoBlocks",
"FluxKontextAutoBlocks",
"FluxKontextModularPipeline",
"FluxModularPipeline",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditModularPipeline",
"QwenImageEditPlusAutoBlocks",
"QwenImageEditPlusModularPipeline",
"QwenImageModularPipeline",
"StableDiffusionXLAutoBlocks",
"StableDiffusionXLModularPipeline",
@@ -1048,10 +1052,14 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
else:
from .modular_pipelines import (
FluxAutoBlocks,
FluxKontextAutoBlocks,
FluxKontextModularPipeline,
FluxModularPipeline,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageEditPlusAutoBlocks,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
StableDiffusionXLAutoBlocks,
StableDiffusionXLModularPipeline,

View File

@@ -52,4 +52,5 @@ deps = {
"black": "black",
"phonemizer": "phonemizer",
"opencv-python": "opencv-python",
"timm": "timm",
}

View File

@@ -17,7 +17,10 @@ from dataclasses import dataclass
from typing import Dict, List, Type, Union
import torch
import torch.distributed._functional_collectives as funcol
if torch.distributed.is_available():
import torch.distributed._functional_collectives as funcol
from ..models._modeling_parallel import (
ContextParallelConfig,

View File

@@ -18,7 +18,6 @@ import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import logging

View File

@@ -23,7 +23,6 @@ from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin

View File

@@ -17,7 +17,6 @@ from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin

View File

@@ -16,7 +16,6 @@ from math import gcd
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from ...configuration_utils import ConfigMixin, register_to_config

View File

@@ -18,7 +18,6 @@ from typing import Dict, Optional, Union
import numpy as np
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import logging

View File

@@ -353,7 +353,9 @@ class LTXVideoTransformerBlock(nn.Module):
norm_hidden_states = self.norm1(hidden_states)
num_ada_params = self.scale_shift_table.shape[0]
ada_values = self.scale_shift_table[None, None] + temb.reshape(batch_size, temb.size(1), num_ada_params, -1)
ada_values = self.scale_shift_table[None, None].to(temb.device) + temb.reshape(
batch_size, temb.size(1), num_ada_params, -1
)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa

View File

@@ -682,12 +682,12 @@ class WanTransformer3DModel(
# 5. Output norm, projection & unpatchify
if temb.ndim == 3:
# batch_size, seq_len, inner_dim (wan 2.2 ti2v)
shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
shift, scale = (self.scale_shift_table.unsqueeze(0).to(temb.device) + temb.unsqueeze(2)).chunk(2, dim=2)
shift = shift.squeeze(2)
scale = scale.squeeze(2)
else:
# batch_size, inner_dim
shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
# Move the shift and scale tensors to the same device as hidden_states.
# When using multi-GPU inference via accelerate these will be on the

View File

@@ -103,7 +103,7 @@ class WanVACETransformerBlock(nn.Module):
control_hidden_states = control_hidden_states + hidden_states
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
self.scale_shift_table + temb.float()
self.scale_shift_table.to(temb.device) + temb.float()
).chunk(6, dim=1)
# 1. Self-attention
@@ -361,7 +361,7 @@ class WanVACETransformer3DModel(
hidden_states = hidden_states + control_hint * scale
# 6. Output norm, projection & unpatchify
shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
# Move the shift and scale tensors to the same device as hidden_states.
# When using multi-GPU inference via accelerate these will be on the

View File

@@ -16,7 +16,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin

View File

@@ -18,7 +18,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import UNet2DConditionLoadersMixin

View File

@@ -16,7 +16,6 @@ from typing import Any, Dict, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import UNet2DConditionLoadersMixin

View File

@@ -16,7 +16,6 @@ from dataclasses import dataclass
from typing import Dict, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...configuration_utils import ConfigMixin, register_to_config

View File

@@ -18,7 +18,6 @@ from typing import Any, Dict, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, UNet2DConditionLoadersMixin

View File

@@ -46,12 +46,19 @@ else:
]
_import_structure["stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline"]
_import_structure["wan"] = ["WanAutoBlocks", "WanModularPipeline"]
_import_structure["flux"] = ["FluxAutoBlocks", "FluxModularPipeline"]
_import_structure["flux"] = [
"FluxAutoBlocks",
"FluxModularPipeline",
"FluxKontextAutoBlocks",
"FluxKontextModularPipeline",
]
_import_structure["qwenimage"] = [
"QwenImageAutoBlocks",
"QwenImageModularPipeline",
"QwenImageEditModularPipeline",
"QwenImageEditAutoBlocks",
"QwenImageEditPlusModularPipeline",
"QwenImageEditPlusAutoBlocks",
]
_import_structure["components_manager"] = ["ComponentsManager"]
@@ -63,7 +70,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from ..utils.dummy_pt_objects import * # noqa F403
else:
from .components_manager import ComponentsManager
from .flux import FluxAutoBlocks, FluxModularPipeline
from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
from .modular_pipeline import (
AutoPipelineBlocks,
BlockState,
@@ -78,6 +85,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageEditPlusAutoBlocks,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
)
from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline

View File

@@ -25,14 +25,18 @@ else:
_import_structure["modular_blocks"] = [
"ALL_BLOCKS",
"AUTO_BLOCKS",
"AUTO_BLOCKS_KONTEXT",
"FLUX_KONTEXT_BLOCKS",
"TEXT2IMAGE_BLOCKS",
"FluxAutoBeforeDenoiseStep",
"FluxAutoBlocks",
"FluxAutoBlocks",
"FluxAutoDecodeStep",
"FluxAutoDenoiseStep",
"FluxKontextAutoBlocks",
"FluxKontextAutoDenoiseStep",
"FluxKontextBeforeDenoiseStep",
]
_import_structure["modular_pipeline"] = ["FluxModularPipeline"]
_import_structure["modular_pipeline"] = ["FluxKontextModularPipeline", "FluxModularPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
@@ -45,13 +49,18 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from .modular_blocks import (
ALL_BLOCKS,
AUTO_BLOCKS,
AUTO_BLOCKS_KONTEXT,
FLUX_KONTEXT_BLOCKS,
TEXT2IMAGE_BLOCKS,
FluxAutoBeforeDenoiseStep,
FluxAutoBlocks,
FluxAutoDecodeStep,
FluxAutoDenoiseStep,
FluxKontextAutoBlocks,
FluxKontextAutoDenoiseStep,
FluxKontextBeforeDenoiseStep,
)
from .modular_pipeline import FluxModularPipeline
from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline
else:
import sys

View File

@@ -13,12 +13,12 @@
# limitations under the License.
import inspect
from typing import Any, List, Optional, Tuple, Union
from typing import List, Optional, Union
import numpy as np
import torch
from ...models import AutoencoderKL
from ...pipelines import FluxPipeline
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import logging
from ...utils.torch_utils import randn_tensor
@@ -104,48 +104,6 @@ def calculate_shift(
return mu
# Adapted from the original implementation.
def prepare_latents_img2img(
vae, scheduler, image, timestep, batch_size, num_channels_latents, height, width, dtype, device, generator
):
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
latent_channels = vae.config.latent_channels
# VAE applies 8x compression on images but we must also account for packing which requires
# latent height and width to be divisible by 2.
height = 2 * (int(height) // (vae_scale_factor * 2))
width = 2 * (int(width) // (vae_scale_factor * 2))
shape = (batch_size, num_channels_latents, height, width)
latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
image = image.to(device=device, dtype=dtype)
if image.shape[1] != latent_channels:
image_latents = _encode_vae_image(image=image, generator=generator)
else:
image_latents = image
if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
# expand init_latents for batch_size
additional_image_per_prompt = batch_size // image_latents.shape[0]
image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
)
else:
image_latents = torch.cat([image_latents], dim=0)
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
latents = scheduler.scale_noise(image_latents, timestep, noise)
latents = _pack_latents(latents, batch_size, num_channels_latents, height, width)
return latents, latent_image_ids
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -160,43 +118,6 @@ def retrieve_latents(
raise AttributeError("Could not access latents of provided encoder_output")
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
latents = latents.permute(0, 2, 4, 1, 3, 5)
latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
return latents
def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
latent_image_ids = torch.zeros(height, width, 3)
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
latent_image_ids = latent_image_ids.reshape(
latent_image_id_height * latent_image_id_width, latent_image_id_channels
)
return latent_image_ids.to(device=device, dtype=dtype)
# Cannot use "# Copied from" because it introduces weird indentation errors.
def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator):
if isinstance(generator, list):
image_latents = [
retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(vae.encode(image), generator=generator)
image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor
return image_latents
def _get_initial_timesteps_and_optionals(
transformer,
scheduler,
@@ -231,92 +152,6 @@ def _get_initial_timesteps_and_optionals(
return timesteps, num_inference_steps, sigmas, guidance
class FluxInputStep(ModularPipelineBlocks):
model_name = "flux"
@property
def description(self) -> str:
return (
"Input processing step that:\n"
" 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
" 2. Adjusts input tensor shapes based on `batch_size` (number of prompts) and `num_images_per_prompt`\n\n"
"All input tensors are expected to have either batch_size=1 or match the batch_size\n"
"of prompt_embeds. The tensors will be duplicated across the batch dimension to\n"
"have a final batch_size of batch_size * num_images_per_prompt."
)
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("num_images_per_prompt", default=1),
InputParam(
"prompt_embeds",
required=True,
type_hint=torch.Tensor,
description="Pre-generated text embeddings. Can be generated from text_encoder step.",
),
InputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.",
),
# TODO: support negative embeddings?
]
@property
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"batch_size",
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
),
OutputParam(
"dtype",
type_hint=torch.dtype,
description="Data type of model tensor inputs (determined by `prompt_embeds`)",
),
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
description="text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
description="pooled text embeddings used to guide the image generation",
),
# TODO: support negative embeddings?
]
def check_inputs(self, components, block_state):
if block_state.prompt_embeds is not None and block_state.pooled_prompt_embeds is not None:
if block_state.prompt_embeds.shape[0] != block_state.pooled_prompt_embeds.shape[0]:
raise ValueError(
"`prompt_embeds` and `pooled_prompt_embeds` must have the same batch size when passed directly, but"
f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `pooled_prompt_embeds`"
f" {block_state.pooled_prompt_embeds.shape}."
)
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
# TODO: consider adding negative embeddings?
block_state = self.get_block_state(state)
self.check_inputs(components, block_state)
block_state.batch_size = block_state.prompt_embeds.shape[0]
block_state.dtype = block_state.prompt_embeds.dtype
_, seq_len, _ = block_state.prompt_embeds.shape
block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1)
block_state.prompt_embeds = block_state.prompt_embeds.view(
block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
)
self.set_block_state(state, block_state)
return components, state
class FluxSetTimestepsStep(ModularPipelineBlocks):
model_name = "flux"
@@ -385,6 +220,10 @@ class FluxSetTimestepsStep(ModularPipelineBlocks):
block_state.sigmas = sigmas
block_state.guidance = guidance
# We set the index here to remove DtoH sync, helpful especially during compilation.
# Check out more details here: https://github.com/huggingface/diffusers/pull/11696
components.scheduler.set_begin_index(0)
self.set_block_state(state, block_state)
return components, state
@@ -428,11 +267,6 @@ class FluxImg2ImgSetTimestepsStep(ModularPipelineBlocks):
type_hint=int,
description="The number of denoising steps to perform at inference time",
),
OutputParam(
"latent_timestep",
type_hint=torch.Tensor,
description="The timestep that represents the initial noise level for image-to-image generation",
),
OutputParam("guidance", type_hint=torch.Tensor, description="Optional guidance to be used."),
]
@@ -480,8 +314,6 @@ class FluxImg2ImgSetTimestepsStep(ModularPipelineBlocks):
block_state.sigmas = sigmas
block_state.guidance = guidance
block_state.latent_timestep = timesteps[:1].repeat(batch_size)
self.set_block_state(state, block_state)
return components, state
@@ -520,11 +352,6 @@ class FluxPrepareLatentsStep(ModularPipelineBlocks):
OutputParam(
"latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
),
OutputParam(
"latent_image_ids",
type_hint=torch.Tensor,
description="IDs computed from the image sequence needed for RoPE",
),
]
@staticmethod
@@ -548,20 +375,13 @@ class FluxPrepareLatentsStep(ModularPipelineBlocks):
generator,
latents=None,
):
# Couldn't use the `prepare_latents` method directly from Flux because I decided to copy over
# the packing methods here. So, for example, `comp._pack_latents()` won't work if we were
# to go with the "# Copied from ..." approach. Or maybe there's a way?
# VAE applies 8x compression on images but we must also account for packing which requires
# latent height and width to be divisible by 2.
height = 2 * (int(height) // (comp.vae_scale_factor * 2))
width = 2 * (int(width) // (comp.vae_scale_factor * 2))
shape = (batch_size, num_channels_latents, height, width)
if latents is not None:
latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
return latents.to(device=device, dtype=dtype), latent_image_ids
return latents.to(device=device, dtype=dtype)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
@@ -569,26 +389,23 @@ class FluxPrepareLatentsStep(ModularPipelineBlocks):
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
# TODO: move packing latents code to a patchifier similar to Qwen
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
latents = _pack_latents(latents, batch_size, num_channels_latents, height, width)
latents = FluxPipeline._pack_latents(latents, batch_size, num_channels_latents, height, width)
latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
return latents, latent_image_ids
return latents
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.height = block_state.height or components.default_height
block_state.width = block_state.width or components.default_width
block_state.device = components._execution_device
block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this?
block_state.num_channels_latents = components.num_channels_latents
self.check_inputs(components, block_state)
batch_size = block_state.batch_size * block_state.num_images_per_prompt
block_state.latents, block_state.latent_image_ids = self.prepare_latents(
block_state.latents = self.prepare_latents(
components,
batch_size,
block_state.num_channels_latents,
@@ -608,82 +425,194 @@ class FluxPrepareLatentsStep(ModularPipelineBlocks):
class FluxImg2ImgPrepareLatentsStep(ModularPipelineBlocks):
model_name = "flux"
@property
def expected_components(self) -> List[ComponentSpec]:
return [ComponentSpec("vae", AutoencoderKL), ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
@property
def description(self) -> str:
return "Step that prepares the latents for the image-to-image generation process"
return "Step that adds noise to image latents for image-to-image. Should be run after `set_timesteps`,"
" `prepare_latents`. Both noise and image latents should already be patchified."
@property
def inputs(self) -> List[Tuple[str, Any]]:
def expected_components(self) -> List[ComponentSpec]:
return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("height", type_hint=int),
InputParam("width", type_hint=int),
InputParam("latents", type_hint=Optional[torch.Tensor]),
InputParam("num_images_per_prompt", type_hint=int, default=1),
InputParam("generator"),
InputParam(
"image_latents",
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents representing the reference image for image-to-image/inpainting generation. Can be generated in vae_encode step.",
description="The initial random noised, can be generated in prepare latent step.",
),
InputParam(
"latent_timestep",
name="image_latents",
required=True,
type_hint=torch.Tensor,
description="The timestep that represents the initial noise level for image-to-image/inpainting generation. Can be generated in set_timesteps step.",
description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.",
),
InputParam(
"batch_size",
name="timesteps",
required=True,
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam("dtype", required=True, type_hint=torch.dtype, description="The dtype of the model inputs"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
"latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
),
OutputParam(
"latent_image_ids",
name="initial_noise",
type_hint=torch.Tensor,
description="IDs computed from the image sequence needed for RoPE",
description="The initial random noised used for inpainting denoising.",
),
]
@staticmethod
def check_inputs(image_latents, latents):
if image_latents.shape[0] != latents.shape[0]:
raise ValueError(
f"`image_latents` must have have same batch size as `latents`, but got {image_latents.shape[0]} and {latents.shape[0]}"
)
if image_latents.ndim != 3:
raise ValueError(f"`image_latents` must have 3 dimensions (patchified), but got {image_latents.ndim}")
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.device = components._execution_device
block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this?
block_state.num_channels_latents = components.num_channels_latents
block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
block_state.device = components._execution_device
self.check_inputs(image_latents=block_state.image_latents, latents=block_state.latents)
# TODO: implement `check_inputs`
batch_size = block_state.batch_size * block_state.num_images_per_prompt
if block_state.latents is None:
block_state.latents, block_state.latent_image_ids = prepare_latents_img2img(
components.vae,
components.scheduler,
block_state.image_latents,
block_state.latent_timestep,
batch_size,
block_state.num_channels_latents,
block_state.height,
block_state.width,
block_state.dtype,
block_state.device,
block_state.generator,
)
# prepare latent timestep
latent_timestep = block_state.timesteps[:1].repeat(block_state.latents.shape[0])
# make copy of initial_noise
block_state.initial_noise = block_state.latents
# scale noise
block_state.latents = components.scheduler.scale_noise(
block_state.image_latents, latent_timestep, block_state.latents
)
self.set_block_state(state, block_state)
return components, state
class FluxRoPEInputsStep(ModularPipelineBlocks):
model_name = "flux"
@property
def description(self) -> str:
return "Step that prepares the RoPE inputs for the denoising process. Should be placed after text encoder and latent preparation steps."
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="txt_ids",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the prompt embeds, used for RoPE calculation.",
),
OutputParam(
name="img_ids",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the image latents, used for RoPE calculation.",
),
]
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
prompt_embeds = block_state.prompt_embeds
device, dtype = prompt_embeds.device, prompt_embeds.dtype
block_state.txt_ids = torch.zeros(prompt_embeds.shape[1], 3).to(
device=prompt_embeds.device, dtype=prompt_embeds.dtype
)
height = 2 * (int(block_state.height) // (components.vae_scale_factor * 2))
width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2))
block_state.img_ids = FluxPipeline._prepare_latent_image_ids(None, height // 2, width // 2, device, dtype)
self.set_block_state(state, block_state)
return components, state
class FluxKontextRoPEInputsStep(ModularPipelineBlocks):
model_name = "flux-kontext"
@property
def description(self) -> str:
return "Step that prepares the RoPE inputs for the denoising process of Flux Kontext. Should be placed after text encoder and latent preparation steps."
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="image_height"),
InputParam(name="image_width"),
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="prompt_embeds"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="txt_ids",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the prompt embeds, used for RoPE calculation.",
),
OutputParam(
name="img_ids",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the image latents, used for RoPE calculation.",
),
]
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
prompt_embeds = block_state.prompt_embeds
device, dtype = prompt_embeds.device, prompt_embeds.dtype
block_state.txt_ids = torch.zeros(prompt_embeds.shape[1], 3).to(
device=prompt_embeds.device, dtype=prompt_embeds.dtype
)
img_ids = None
if (
getattr(block_state, "image_height", None) is not None
and getattr(block_state, "image_width", None) is not None
):
image_latent_height = 2 * (int(block_state.image_height) // (components.vae_scale_factor * 2))
image_latent_width = 2 * (int(block_state.image_width) // (components.vae_scale_factor * 2))
img_ids = FluxPipeline._prepare_latent_image_ids(
None, image_latent_height // 2, image_latent_width // 2, device, dtype
)
# image ids are the same as latent ids with the first dimension set to 1 instead of 0
img_ids[..., 0] = 1
height = 2 * (int(block_state.height) // (components.vae_scale_factor * 2))
width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2))
latent_ids = FluxPipeline._prepare_latent_image_ids(None, height // 2, width // 2, device, dtype)
if img_ids is not None:
latent_ids = torch.cat([latent_ids, img_ids], dim=0)
block_state.img_ids = latent_ids
self.set_block_state(state, block_state)

View File

@@ -76,18 +76,17 @@ class FluxLoopDenoiser(ModularPipelineBlocks):
description="Pooled prompt embeddings",
),
InputParam(
"text_ids",
"txt_ids",
required=True,
type_hint=torch.Tensor,
description="IDs computed from text sequence needed for RoPE",
),
InputParam(
"latent_image_ids",
"img_ids",
required=True,
type_hint=torch.Tensor,
description="IDs computed from image sequence needed for RoPE",
),
# TODO: guidance
]
@torch.no_grad()
@@ -101,8 +100,8 @@ class FluxLoopDenoiser(ModularPipelineBlocks):
encoder_hidden_states=block_state.prompt_embeds,
pooled_projections=block_state.pooled_prompt_embeds,
joint_attention_kwargs=block_state.joint_attention_kwargs,
txt_ids=block_state.text_ids,
img_ids=block_state.latent_image_ids,
txt_ids=block_state.txt_ids,
img_ids=block_state.img_ids,
return_dict=False,
)[0]
block_state.noise_pred = noise_pred
@@ -110,6 +109,96 @@ class FluxLoopDenoiser(ModularPipelineBlocks):
return components, block_state
class FluxKontextLoopDenoiser(ModularPipelineBlocks):
model_name = "flux-kontext"
@property
def expected_components(self) -> List[ComponentSpec]:
return [ComponentSpec("transformer", FluxTransformer2DModel)]
@property
def description(self) -> str:
return (
"Step within the denoising loop that denoise the latents for Flux Kontext. "
"This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
"object (e.g. `FluxDenoiseLoopWrapper`)"
)
@property
def inputs(self) -> List[Tuple[str, Any]]:
return [
InputParam("joint_attention_kwargs"),
InputParam(
"latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
),
InputParam(
"image_latents",
type_hint=torch.Tensor,
description="Image latents to use for the denoising process. Can be generated in prepare_latent step.",
),
InputParam(
"guidance",
required=True,
type_hint=torch.Tensor,
description="Guidance scale as a tensor",
),
InputParam(
"prompt_embeds",
required=True,
type_hint=torch.Tensor,
description="Prompt embeddings",
),
InputParam(
"pooled_prompt_embeds",
required=True,
type_hint=torch.Tensor,
description="Pooled prompt embeddings",
),
InputParam(
"txt_ids",
required=True,
type_hint=torch.Tensor,
description="IDs computed from text sequence needed for RoPE",
),
InputParam(
"img_ids",
required=True,
type_hint=torch.Tensor,
description="IDs computed from latent sequence needed for RoPE",
),
]
@torch.no_grad()
def __call__(
self, components: FluxModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
) -> PipelineState:
latents = block_state.latents
latent_model_input = latents
image_latents = block_state.image_latents
if image_latents is not None:
latent_model_input = torch.cat([latent_model_input, image_latents], dim=1)
timestep = t.expand(latents.shape[0]).to(latents.dtype)
noise_pred = components.transformer(
hidden_states=latent_model_input,
timestep=timestep / 1000,
guidance=block_state.guidance,
encoder_hidden_states=block_state.prompt_embeds,
pooled_projections=block_state.pooled_prompt_embeds,
joint_attention_kwargs=block_state.joint_attention_kwargs,
txt_ids=block_state.txt_ids,
img_ids=block_state.img_ids,
return_dict=False,
)[0]
noise_pred = noise_pred[:, : latents.size(1)]
block_state.noise_pred = noise_pred
return components, block_state
class FluxLoopAfterDenoiser(ModularPipelineBlocks):
model_name = "flux"
@@ -195,9 +284,6 @@ class FluxDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
block_state.num_warmup_steps = max(
len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0
)
# We set the index here to remove DtoH sync, helpful especially during compilation.
# Check out more details here: https://github.com/huggingface/diffusers/pull/11696
components.scheduler.set_begin_index(0)
with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
for i, t in enumerate(block_state.timesteps):
components, block_state = self.loop_step(components, block_state, i=i, t=t)
@@ -225,3 +311,20 @@ class FluxDenoiseStep(FluxDenoiseLoopWrapper):
" - `FluxLoopAfterDenoiser`\n"
"This block supports both text2image and img2img tasks."
)
class FluxKontextDenoiseStep(FluxDenoiseLoopWrapper):
model_name = "flux-kontext"
block_classes = [FluxKontextLoopDenoiser, FluxLoopAfterDenoiser]
block_names = ["denoiser", "after_denoiser"]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoise the latents. \n"
"Its loop logic is defined in `FluxDenoiseLoopWrapper.__call__` method \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `FluxKontextLoopDenoiser`\n"
" - `FluxLoopAfterDenoiser`\n"
"This block supports both text2image and img2img tasks."
)

View File

@@ -20,12 +20,12 @@ import torch
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ...image_processor import VaeImageProcessor, is_valid_image, is_valid_image_imagelist
from ...loaders import FluxLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL
from ...utils import USE_PEFT_BACKEND, is_ftfy_available, logging, scale_lora_layers, unscale_lora_layers
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import FluxModularPipeline
@@ -67,89 +67,219 @@ def retrieve_latents(
raise AttributeError("Could not access latents of provided encoder_output")
class FluxVaeEncoderStep(ModularPipelineBlocks):
def encode_vae_image(vae: AutoencoderKL, image: torch.Tensor, generator: torch.Generator, sample_mode="sample"):
if isinstance(generator, list):
image_latents = [
retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode)
for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode)
image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor
return image_latents
class FluxProcessImagesInputStep(ModularPipelineBlocks):
model_name = "flux"
@property
def description(self) -> str:
return "Vae Encoder step that encode the input image into a latent representation"
return "Image Preprocess step."
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("vae", AutoencoderKL),
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 16}),
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(name="processed_image")]
@staticmethod
def check_inputs(height, width, vae_scale_factor):
if height is not None and height % (vae_scale_factor * 2) != 0:
raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")
if width is not None and width % (vae_scale_factor * 2) != 0:
raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
if block_state.resized_image is None and block_state.image is None:
raise ValueError("`resized_image` and `image` cannot be None at the same time")
if block_state.resized_image is None:
image = block_state.image
self.check_inputs(
height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
else:
width, height = block_state.resized_image[0].size
image = block_state.resized_image
block_state.processed_image = components.image_processor.preprocess(image=image, height=height, width=width)
self.set_block_state(state, block_state)
return components, state
class FluxKontextProcessImagesInputStep(ModularPipelineBlocks):
model_name = "flux-kontext"
def __init__(self, _auto_resize=True):
self._auto_resize = _auto_resize
super().__init__()
@property
def description(self) -> str:
return (
"Image preprocess step for Flux Kontext. The preprocessed image goes to the VAE.\n"
"Kontext works as a T2I model, too, in case no input image is provided."
)
@property
def expected_components(self) -> List[ComponentSpec]:
return [
InputParam("image", required=True),
InputParam("height"),
InputParam("width"),
InputParam("generator"),
InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
InputParam(
"preprocess_kwargs",
type_hint=Optional[dict],
description="A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under `self.image_processor` in [diffusers.image_processor.VaeImageProcessor]",
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [InputParam("image")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(name="processed_image")]
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState):
from ...pipelines.flux.pipeline_flux_kontext import PREFERRED_KONTEXT_RESOLUTIONS
block_state = self.get_block_state(state)
images = block_state.image
if images is None:
block_state.processed_image = None
else:
multiple_of = components.image_processor.config.vae_scale_factor
if not is_valid_image_imagelist(images):
raise ValueError(f"Images must be image or list of images but are {type(images)}")
if is_valid_image(images):
images = [images]
img = images[0]
image_height, image_width = components.image_processor.get_default_height_width(img)
aspect_ratio = image_width / image_height
if self._auto_resize:
# Kontext is trained on specific resolutions, using one of them is recommended
_, image_width, image_height = min(
(abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
)
image_width = image_width // multiple_of * multiple_of
image_height = image_height // multiple_of * multiple_of
images = components.image_processor.resize(images, image_height, image_width)
block_state.processed_image = components.image_processor.preprocess(images, image_height, image_width)
self.set_block_state(state, block_state)
return components, state
class FluxVaeEncoderDynamicStep(ModularPipelineBlocks):
model_name = "flux"
def __init__(
self, input_name: str = "processed_image", output_name: str = "image_latents", sample_mode: str = "sample"
):
"""Initialize a VAE encoder step for converting images to latent representations.
Both the input and output names are configurable so this block can be configured to process to different image
inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents").
Args:
input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
Examples: "processed_image" or "processed_control_image"
output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".
Examples: "image_latents" or "control_image_latents"
sample_mode (str, optional): Sampling mode to be used.
Examples:
# Basic usage with default settings (includes image processor):
FluxImageVaeEncoderDynamicStep()
# Custom input/output names for control image:
FluxImageVaeEncoderDynamicStep(
input_name="processed_control_image", output_name="control_image_latents"
)
"""
self._image_input_name = input_name
self._image_latents_output_name = output_name
self.sample_mode = sample_mode
super().__init__()
@property
def description(self) -> str:
return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
@property
def expected_components(self) -> List[ComponentSpec]:
components = [ComponentSpec("vae", AutoencoderKL)]
return components
@property
def inputs(self) -> List[InputParam]:
inputs = [InputParam(self._image_input_name), InputParam("generator")]
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
"image_latents",
self._image_latents_output_name,
type_hint=torch.Tensor,
description="The latents representing the reference image for image-to-image/inpainting generation",
description="The latents representing the reference image",
)
]
@staticmethod
# Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image with self.vae->vae
def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator):
if isinstance(generator, list):
image_latents = [
retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(vae.encode(image), generator=generator)
image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor
return image_latents
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.preprocess_kwargs = block_state.preprocess_kwargs or {}
block_state.device = components._execution_device
block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
image = getattr(block_state, self._image_input_name)
block_state.image = components.image_processor.preprocess(
block_state.image, height=block_state.height, width=block_state.width, **block_state.preprocess_kwargs
)
block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)
if image is None:
setattr(block_state, self._image_latents_output_name, None)
else:
device = components._execution_device
dtype = components.vae.dtype
image = image.to(device=device, dtype=dtype)
block_state.batch_size = block_state.image.shape[0]
# if generator is a list, make sure the length of it matches the length of images (both should be batch_size)
if isinstance(block_state.generator, list) and len(block_state.generator) != block_state.batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
f" size of {block_state.batch_size}. Make sure the batch size matches the length of the generators."
# Encode image into latents
image_latents = encode_vae_image(
image=image, vae=components.vae, generator=block_state.generator, sample_mode=self.sample_mode
)
block_state.image_latents = self._encode_vae_image(
components.vae, image=block_state.image, generator=block_state.generator
)
setattr(block_state, self._image_latents_output_name, image_latents)
self.set_block_state(state, block_state)
@@ -161,7 +291,7 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
@property
def description(self) -> str:
return "Text Encoder step that generate text_embeddings to guide the video generation"
return "Text Encoder step that generate text_embeddings to guide the image generation"
@property
def expected_components(self) -> List[ComponentSpec]:
@@ -172,15 +302,12 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
ComponentSpec("tokenizer_2", T5TokenizerFast),
]
@property
def expected_configs(self) -> List[ConfigSpec]:
return []
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("prompt"),
InputParam("prompt_2"),
InputParam("max_sequence_length", type_hint=int, default=512, required=False),
InputParam("joint_attention_kwargs"),
]
@@ -189,19 +316,16 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
return [
OutputParam(
"prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="pooled text embeddings used to guide the image generation",
),
OutputParam(
"text_ids",
type_hint=torch.Tensor,
description="ids from the text sequence for RoPE",
),
]
@staticmethod
@@ -212,16 +336,10 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
@staticmethod
def _get_t5_prompt_embeds(
components,
prompt: Union[str, List[str]],
num_images_per_prompt: int,
max_sequence_length: int,
device: torch.device,
components, prompt: Union[str, List[str]], max_sequence_length: int, device: torch.device
):
dtype = components.text_encoder_2.dtype
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
if isinstance(components, TextualInversionLoaderMixin):
prompt = components.maybe_convert_prompt(prompt, components.tokenizer_2)
@@ -247,23 +365,11 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
prompt_embeds = components.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
_, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
return prompt_embeds
@staticmethod
def _get_clip_prompt_embeds(
components,
prompt: Union[str, List[str]],
num_images_per_prompt: int,
device: torch.device,
):
def _get_clip_prompt_embeds(components, prompt: Union[str, List[str]], device: torch.device):
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
if isinstance(components, TextualInversionLoaderMixin):
prompt = components.maybe_convert_prompt(prompt, components.tokenizer)
@@ -293,10 +399,6 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
prompt_embeds = prompt_embeds.pooler_output
prompt_embeds = prompt_embeds.to(dtype=components.text_encoder.dtype, device=device)
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
return prompt_embeds
@staticmethod
@@ -305,34 +407,11 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
prompt: Union[str, List[str]],
prompt_2: Union[str, List[str]],
device: Optional[torch.device] = None,
num_images_per_prompt: int = 1,
prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
max_sequence_length: int = 512,
lora_scale: Optional[float] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt to be encoded
prompt_2 (`str` or `List[str]`, *optional*):
The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
used in all text-encoders
device: (`torch.device`):
torch device
num_images_per_prompt (`int`):
number of images that should be generated per prompt
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
If not provided, pooled text embeddings will be generated from `prompt` input argument.
lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
"""
device = device or components._execution_device
# set lora scale so that monkey patched LoRA
@@ -357,12 +436,10 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
components,
prompt=prompt,
device=device,
num_images_per_prompt=num_images_per_prompt,
)
prompt_embeds = FluxTextEncoderStep._get_t5_prompt_embeds(
components,
prompt=prompt_2,
num_images_per_prompt=num_images_per_prompt,
max_sequence_length=max_sequence_length,
device=device,
)
@@ -377,10 +454,7 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(components.text_encoder_2, lora_scale)
dtype = components.text_encoder.dtype if components.text_encoder is not None else torch.bfloat16
text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
return prompt_embeds, pooled_prompt_embeds, text_ids
return prompt_embeds, pooled_prompt_embeds
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
@@ -396,14 +470,14 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
if block_state.joint_attention_kwargs is not None
else None
)
(block_state.prompt_embeds, block_state.pooled_prompt_embeds, block_state.text_ids) = self.encode_prompt(
block_state.prompt_embeds, block_state.pooled_prompt_embeds = self.encode_prompt(
components,
prompt=block_state.prompt,
prompt_2=None,
prompt_embeds=None,
pooled_prompt_embeds=None,
device=block_state.device,
num_images_per_prompt=1, # TODO: hardcoded for now.
max_sequence_length=block_state.max_sequence_length,
lora_scale=block_state.text_encoder_lora_scale,
)

View File

@@ -0,0 +1,359 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import torch
from ...pipelines import FluxPipeline
from ...utils import logging
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import InputParam, OutputParam
# TODO: consider making these common utilities for modular if they are not pipeline-specific.
from ..qwenimage.inputs import calculate_dimension_from_latents, repeat_tensor_to_batch_size
from .modular_pipeline import FluxModularPipeline
logger = logging.get_logger(__name__)
class FluxTextInputStep(ModularPipelineBlocks):
model_name = "flux"
@property
def description(self) -> str:
return (
"Text input processing step that standardizes text embeddings for the pipeline.\n"
"This step:\n"
" 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
" 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)"
)
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("num_images_per_prompt", default=1),
InputParam(
"prompt_embeds",
required=True,
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="Pre-generated text embeddings. Can be generated from text_encoder step.",
),
InputParam(
"pooled_prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.",
),
# TODO: support negative embeddings?
]
@property
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"batch_size",
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
),
OutputParam(
"dtype",
type_hint=torch.dtype,
description="Data type of model tensor inputs (determined by `prompt_embeds`)",
),
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="denoiser_input_fields",
description="text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="denoiser_input_fields",
description="pooled text embeddings used to guide the image generation",
),
# TODO: support negative embeddings?
]
def check_inputs(self, components, block_state):
if block_state.prompt_embeds is not None and block_state.pooled_prompt_embeds is not None:
if block_state.prompt_embeds.shape[0] != block_state.pooled_prompt_embeds.shape[0]:
raise ValueError(
"`prompt_embeds` and `pooled_prompt_embeds` must have the same batch size when passed directly, but"
f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `pooled_prompt_embeds`"
f" {block_state.pooled_prompt_embeds.shape}."
)
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
# TODO: consider adding negative embeddings?
block_state = self.get_block_state(state)
self.check_inputs(components, block_state)
block_state.batch_size = block_state.prompt_embeds.shape[0]
block_state.dtype = block_state.prompt_embeds.dtype
_, seq_len, _ = block_state.prompt_embeds.shape
block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1)
block_state.prompt_embeds = block_state.prompt_embeds.view(
block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
)
self.set_block_state(state, block_state)
return components, state
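
A minimal sketch of the batch expansion performed above, with assumed toy shapes: each prompt embedding is repeated `num_images_per_prompt` times and the repeats are folded back into the batch dimension.

import torch

batch_size, seq_len, dim, num_images_per_prompt = 2, 512, 4096, 3
prompt_embeds = torch.randn(batch_size, seq_len, dim)
# repeat along the sequence axis, then fold the repeats back into the batch axis
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
assert prompt_embeds.shape == (batch_size * num_images_per_prompt, seq_len, dim)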
# Adapted from `QwenImageInputsDynamicStep`
class FluxInputsDynamicStep(ModularPipelineBlocks):
model_name = "flux"
def __init__(
self,
image_latent_inputs: List[str] = ["image_latents"],
additional_batch_inputs: List[str] = [],
):
if not isinstance(image_latent_inputs, list):
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
additional_batch_inputs = [additional_batch_inputs]
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
super().__init__()
@property
def description(self) -> str:
# Functionality section
summary_section = (
"Input processing step that:\n"
" 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
" 2. For additional batch inputs: Expands batch dimensions to match final batch size"
)
# Inputs info
inputs_info = ""
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
# Placement guidance
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
return summary_section + inputs_info + placement_section
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
]
# Add image latent inputs
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
# Add additional batch inputs
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="image_height", type_hint=int, description="The height of the image latents"),
OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
]
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
# 1. Calculate height/width from latents
height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
block_state.height = block_state.height or height
block_state.width = block_state.width or width
if not hasattr(block_state, "image_height"):
block_state.image_height = height
if not hasattr(block_state, "image_width"):
block_state.image_width = width
# 2. Patchify the image latent tensor
# TODO: Implement patchifier for Flux.
latent_height, latent_width = image_latent_tensor.shape[2:]
image_latent_tensor = FluxPipeline._pack_latents(
image_latent_tensor, block_state.batch_size, image_latent_tensor.shape[1], latent_height, latent_width
)
# 3. Expand batch size
image_latent_tensor = repeat_tensor_to_batch_size(
input_name=image_latent_input_name,
input_tensor=image_latent_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, image_latent_input_name, image_latent_tensor)
# Process additional batch inputs (only batch expansion)
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, input_name, input_tensor)
self.set_block_state(state, block_state)
return components, state
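
A rough sketch of the shape contract of the latent packing used above; the exact permutation lives in `FluxPipeline._pack_latents`, this only illustrates how a (B, C, H, W) latent becomes (B, H/2 * W/2, 4C).

import torch

b, c, h, w = 1, 16, 64, 64
latents = torch.randn(b, c, h, w)
packed = (
    latents.view(b, c, h // 2, 2, w // 2, 2)
    .permute(0, 2, 4, 1, 3, 5)
    .reshape(b, (h // 2) * (w // 2), c * 4)
)
print(packed.shape)  # torch.Size([1, 1024, 64])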
class FluxKontextInputsDynamicStep(FluxInputsDynamicStep):
model_name = "flux-kontext"
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
# 1. Calculate height/width from latents
# Unlike `FluxInputsDynamicStep`, we don't overwrite `block_state.height` and `block_state.width`
height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
if not hasattr(block_state, "image_height"):
block_state.image_height = height
if not hasattr(block_state, "image_width"):
block_state.image_width = width
# 2. Patchify the image latent tensor
# TODO: Implement patchifier for Flux.
latent_height, latent_width = image_latent_tensor.shape[2:]
image_latent_tensor = FluxPipeline._pack_latents(
image_latent_tensor, block_state.batch_size, image_latent_tensor.shape[1], latent_height, latent_width
)
# 3. Expand batch size
image_latent_tensor = repeat_tensor_to_batch_size(
input_name=image_latent_input_name,
input_tensor=image_latent_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, image_latent_input_name, image_latent_tensor)
# Process additional batch inputs (only batch expansion)
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, input_name, input_tensor)
self.set_block_state(state, block_state)
return components, state
class FluxKontextSetResolutionStep(ModularPipelineBlocks):
model_name = "flux-kontext"
@property
def description(self):
return (
"Determines the height and width to be used during the subsequent computations.\n"
"It should always be placed _before_ the latent preparation step."
)
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="max_area", type_hint=int, default=1024**2),
]
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="height", type_hint=int, description="The height of the initial noisy latents"),
OutputParam(name="width", type_hint=int, description="The width of the initial noisy latents"),
]
@staticmethod
def check_inputs(height, width, vae_scale_factor):
if height is not None and height % (vae_scale_factor * 2) != 0:
raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")
if width is not None and width % (vae_scale_factor * 2) != 0:
raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
self.check_inputs(height, width, components.vae_scale_factor)
original_height, original_width = height, width
max_area = block_state.max_area
aspect_ratio = width / height
width = round((max_area * aspect_ratio) ** 0.5)
height = round((max_area / aspect_ratio) ** 0.5)
multiple_of = components.vae_scale_factor * 2
width = width // multiple_of * multiple_of
height = height // multiple_of * multiple_of
if height != original_height or width != original_width:
logger.warning(
f"Generation `height` and `width` have been adjusted to {height} and {width} to fit the model requirements."
)
block_state.height = height
block_state.width = width
self.set_block_state(state, block_state)
return components, state
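
A standalone sketch of the resolution rule above; `vae_scale_factor=8` is an assumption here, so sizes are snapped to multiples of 16 while targeting roughly `max_area` pixels.

def kontext_resolution(height, width, max_area=1024**2, vae_scale_factor=8):
    aspect_ratio = width / height
    width = round((max_area * aspect_ratio) ** 0.5)
    height = round((max_area / aspect_ratio) ** 0.5)
    multiple_of = vae_scale_factor * 2
    return height // multiple_of * multiple_of, width // multiple_of * multiple_of

print(kontext_resolution(720, 1280))  # (768, 1360) under the assumed scale factor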

View File

@@ -18,21 +18,49 @@ from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
FluxImg2ImgPrepareLatentsStep,
FluxImg2ImgSetTimestepsStep,
FluxInputStep,
FluxKontextRoPEInputsStep,
FluxPrepareLatentsStep,
FluxRoPEInputsStep,
FluxSetTimestepsStep,
)
from .decoders import FluxDecodeStep
from .denoise import FluxDenoiseStep
from .encoders import FluxTextEncoderStep, FluxVaeEncoderStep
from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep
from .encoders import (
FluxKontextProcessImagesInputStep,
FluxProcessImagesInputStep,
FluxTextEncoderStep,
FluxVaeEncoderDynamicStep,
)
from .inputs import (
FluxInputsDynamicStep,
FluxKontextInputsDynamicStep,
FluxKontextSetResolutionStep,
FluxTextInputStep,
)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# vae encoder (run before before_denoise)
FluxImg2ImgVaeEncoderBlocks = InsertableDict(
[("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep())]
)
class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
model_name = "flux"
block_classes = FluxImg2ImgVaeEncoderBlocks.values()
block_names = FluxImg2ImgVaeEncoderBlocks.keys()
@property
def description(self) -> str:
return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [FluxVaeEncoderStep]
block_classes = [FluxImg2ImgVaeEncoderStep]
block_names = ["img2img"]
block_trigger_inputs = ["image"]
@@ -41,52 +69,89 @@ class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block that works for img2img tasks.\n"
+ " - `FluxVaeEncoderStep` (img2img) is used when only `image` is provided."
+ " - if `image` is provided, step will be skipped."
+ " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided."
+ " - if `image` is not provided, step will be skipped."
)
# before_denoise: text2img, img2img
class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
FluxInputStep,
FluxPrepareLatentsStep,
FluxSetTimestepsStep,
]
block_names = ["input", "prepare_latents", "set_timesteps"]
# Flux Kontext vae encoder (run before before_denoise)
FluxKontextVaeEncoderBlocks = InsertableDict(
[("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep(sample_mode="argmax"))]
)
class FluxKontextVaeEncoderStep(SequentialPipelineBlocks):
model_name = "flux-kontext"
block_classes = FluxKontextVaeEncoderBlocks.values()
block_names = FluxKontextVaeEncoderBlocks.keys()
@property
def description(self) -> str:
return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [FluxKontextVaeEncoderStep]
block_names = ["img2img"]
block_trigger_inputs = ["image"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `FluxInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `FluxPrepareLatentsStep` is used to prepare the latents\n"
+ " - `FluxSetTimestepsStep` is used to set the timesteps\n"
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block that works for img2img tasks.\n"
+ " - `FluxKontextVaeEncoderStep` (img2img) is used when only `image` is provided."
+ " - if `image` is not provided, step will be skipped."
)
# before_denoise: text2img
FluxBeforeDenoiseBlocks = InsertableDict(
[
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxSetTimestepsStep()),
("prepare_rope_inputs", FluxRoPEInputsStep()),
]
)
class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = FluxBeforeDenoiseBlocks.values()
block_names = FluxBeforeDenoiseBlocks.keys()
@property
def description(self):
return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation."
# before_denoise: img2img
FluxImg2ImgBeforeDenoiseBlocks = InsertableDict(
[
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxImg2ImgSetTimestepsStep()),
("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
("prepare_rope_inputs", FluxRoPEInputsStep()),
]
)
class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [FluxInputStep, FluxImg2ImgSetTimestepsStep, FluxImg2ImgPrepareLatentsStep]
block_names = ["input", "set_timesteps", "prepare_latents"]
block_classes = FluxImg2ImgBeforeDenoiseBlocks.values()
block_names = FluxImg2ImgBeforeDenoiseBlocks.keys()
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step for img2img task.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `FluxInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `FluxImg2ImgSetTimestepsStep` is used to set the timesteps\n"
+ " - `FluxImg2ImgPrepareLatentsStep` is used to prepare the latents\n"
)
return "Before denoise step that prepare the inputs for the denoise step for img2img task."
# before_denoise: all task (text2img, img2img)
class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
block_classes = [FluxBeforeDenoiseStep, FluxImg2ImgBeforeDenoiseStep]
block_names = ["text2image", "img2img"]
block_trigger_inputs = [None, "image_latents"]
model_name = "flux-kontext"
block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep]
block_names = ["img2img", "text2image"]
block_trigger_inputs = ["image_latents", None]
@property
def description(self):
@@ -98,6 +163,44 @@ class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
)
# before_denoise: FluxKontext
FluxKontextBeforeDenoiseBlocks = InsertableDict(
[
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxSetTimestepsStep()),
("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
]
)
class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = FluxKontextBeforeDenoiseBlocks.values()
block_names = FluxKontextBeforeDenoiseBlocks.keys()
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step\n"
"for img2img/text2img task for Flux Kontext."
)
class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks):
block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep]
block_names = ["img2img", "text2image"]
block_trigger_inputs = ["image_latents", None]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step.\n"
+ "This is an auto pipeline block that works for text2image.\n"
+ " - `FluxBeforeDenoiseStep` (text2image) is used.\n"
+ " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n"
)
# denoise: text2image
class FluxAutoDenoiseStep(AutoPipelineBlocks):
block_classes = [FluxDenoiseStep]
@@ -113,7 +216,24 @@ class FluxAutoDenoiseStep(AutoPipelineBlocks):
)
# decode: all task (text2img, img2img, inpainting)
# denoise: Flux Kontext
class FluxKontextAutoDenoiseStep(AutoPipelineBlocks):
block_classes = [FluxKontextDenoiseStep]
block_names = ["denoise"]
block_trigger_inputs = [None]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoise the latents for Flux Kontext. "
"This is a auto pipeline block that works for text2image and img2img tasks."
" - `FluxDenoiseStep` (denoise) for text2image and img2img tasks."
)
# decode: all task (text2img, img2img)
class FluxAutoDecodeStep(AutoPipelineBlocks):
block_classes = [FluxDecodeStep]
block_names = ["non-inpaint"]
@@ -124,16 +244,143 @@ class FluxAutoDecodeStep(AutoPipelineBlocks):
return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`"
# text2image
class FluxAutoBlocks(SequentialPipelineBlocks):
block_classes = [
FluxTextEncoderStep,
FluxAutoVaeEncoderStep,
FluxAutoBeforeDenoiseStep,
FluxAutoDenoiseStep,
FluxAutoDecodeStep,
# inputs: text2image/img2img
FluxImg2ImgBlocks = InsertableDict(
[("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())]
)
class FluxImg2ImgInputStep(SequentialPipelineBlocks):
model_name = "flux"
block_classes = FluxImg2ImgBlocks.values()
block_names = FluxImg2ImgBlocks.keys()
@property
def description(self):
return "Input step that prepares the inputs for the img2img denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
" - update height/width based `image_latents`, patchify `image_latents`."
class FluxAutoInputStep(AutoPipelineBlocks):
block_classes = [FluxImg2ImgInputStep, FluxTextInputStep]
block_names = ["img2img", "text2image"]
block_trigger_inputs = ["image_latents", None]
@property
def description(self):
return (
"Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
" This is an auto pipeline block that works for text2image/img2img tasks.\n"
+ " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
+ " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n"
)
# inputs: Flux Kontext
FluxKontextBlocks = InsertableDict(
[
("set_resolution", FluxKontextSetResolutionStep()),
("text_inputs", FluxTextInputStep()),
("additional_inputs", FluxKontextInputsDynamicStep()),
]
block_names = ["text_encoder", "image_encoder", "before_denoise", "denoise", "decoder"]
)
class FluxKontextInputStep(SequentialPipelineBlocks):
model_name = "flux-kontext"
block_classes = FluxKontextBlocks.values()
block_names = FluxKontextBlocks.keys()
@property
def description(self):
return (
"Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
" - update height/width based `image_latents`, patchify `image_latents`."
)
class FluxKontextAutoInputStep(AutoPipelineBlocks):
block_classes = [FluxKontextInputStep, FluxTextInputStep]
block_names = ["img2img", "text2img"]
block_trigger_inputs = ["image_latents", None]
@property
def description(self):
return (
"Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
" This is an auto pipeline block that works for text2image/img2img tasks.\n"
+ " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n"
+ " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present."
)
class FluxCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "flux"
block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
block_names = ["input", "before_denoise", "denoise"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
+ "This step supports text-to-image and image-to-image tasks for Flux:\n"
+ " - for image-to-image generation, you need to provide `image_latents`\n"
+ " - for text-to-image generation, all you need to provide is prompt embeddings."
)
class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "flux-kontext"
block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextAutoDenoiseStep]
block_names = ["input", "before_denoise", "denoise"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `FluxKontextAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
+ "This step supports text-to-image and image-to-image tasks for Flux:\n"
+ " - for image-to-image generation, you need to provide `image_latents`\n"
+ " - for text-to-image generation, all you need to provide is prompt embeddings."
)
# Auto blocks (text2image and img2img)
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep()),
("image_encoder", FluxAutoVaeEncoderStep()),
("denoise", FluxCoreDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
AUTO_BLOCKS_KONTEXT = InsertableDict(
[
("text_encoder", FluxTextEncoderStep()),
("image_encoder", FluxKontextAutoVaeEncoderStep()),
("denoise", FluxKontextCoreDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
class FluxAutoBlocks(SequentialPipelineBlocks):
model_name = "flux"
block_classes = AUTO_BLOCKS.values()
block_names = AUTO_BLOCKS.keys()
@property
def description(self):
@@ -144,38 +391,56 @@ class FluxAutoBlocks(SequentialPipelineBlocks):
)
class FluxKontextAutoBlocks(FluxAutoBlocks):
model_name = "flux-kontext"
block_classes = AUTO_BLOCKS_KONTEXT.values()
block_names = AUTO_BLOCKS_KONTEXT.keys()
TEXT2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep),
("input", FluxInputStep),
("prepare_latents", FluxPrepareLatentsStep),
("set_timesteps", FluxSetTimestepsStep),
("denoise", FluxDenoiseStep),
("decode", FluxDecodeStep),
("text_encoder", FluxTextEncoderStep()),
("input", FluxTextInputStep()),
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxSetTimestepsStep()),
("prepare_rope_inputs", FluxRoPEInputsStep()),
("denoise", FluxDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
IMAGE2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep),
("image_encoder", FluxVaeEncoderStep),
("input", FluxInputStep),
("set_timesteps", FluxImg2ImgSetTimestepsStep),
("prepare_latents", FluxImg2ImgPrepareLatentsStep),
("denoise", FluxDenoiseStep),
("decode", FluxDecodeStep),
("text_encoder", FluxTextEncoderStep()),
("vae_encoder", FluxVaeEncoderDynamicStep()),
("input", FluxImg2ImgInputStep()),
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxImg2ImgSetTimestepsStep()),
("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
("prepare_rope_inputs", FluxRoPEInputsStep()),
("denoise", FluxDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
AUTO_BLOCKS = InsertableDict(
FLUX_KONTEXT_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep),
("image_encoder", FluxAutoVaeEncoderStep),
("before_denoise", FluxAutoBeforeDenoiseStep),
("denoise", FluxAutoDenoiseStep),
("decode", FluxAutoDecodeStep),
("text_encoder", FluxTextEncoderStep()),
("vae_encoder", FluxVaeEncoderDynamicStep(sample_mode="argmax")),
("input", FluxKontextInputStep()),
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxSetTimestepsStep()),
("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
("denoise", FluxKontextDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
ALL_BLOCKS = {"text2image": TEXT2IMAGE_BLOCKS, "img2img": IMAGE2IMAGE_BLOCKS, "auto": AUTO_BLOCKS}
ALL_BLOCKS = {
"text2image": TEXT2IMAGE_BLOCKS,
"img2img": IMAGE2IMAGE_BLOCKS,
"auto": AUTO_BLOCKS,
"auto_kontext": AUTO_BLOCKS_KONTEXT,
"kontext": FLUX_KONTEXT_BLOCKS,
}
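
A small usage sketch, assumed to run in this module's namespace: a task name selects one of the presets above, an ordered `InsertableDict` of instantiated blocks whose order defines the execution order.

preset = ALL_BLOCKS["kontext"]   # ordered block preset defined above
print(list(preset.keys()))       # ['text_encoder', 'vae_encoder', 'input', 'prepare_latents', ...]
# Container classes in this file (e.g. FluxAutoBlocks) consume such presets via
# `.values()` / `.keys()` to define `block_classes` and `block_names`.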

View File

@@ -55,3 +55,13 @@ class FluxModularPipeline(ModularPipeline, FluxLoraLoaderMixin, TextualInversion
if getattr(self, "transformer", None):
num_channels_latents = self.transformer.config.in_channels // 4
return num_channels_latents
class FluxKontextModularPipeline(FluxModularPipeline):
"""
A ModularPipeline for Flux Kontext.
> [!WARNING]
> This is an experimental feature and is likely to change in the future.
"""
default_blocks_name = "FluxKontextAutoBlocks"

View File

@@ -57,8 +57,10 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
("stable-diffusion-xl", "StableDiffusionXLModularPipeline"),
("wan", "WanModularPipeline"),
("flux", "FluxModularPipeline"),
("flux-kontext", "FluxKontextModularPipeline"),
("qwenimage", "QwenImageModularPipeline"),
("qwenimage-edit", "QwenImageEditModularPipeline"),
("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"),
]
)
@@ -1628,7 +1630,8 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
blocks = ModularPipelineBlocks.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
)
except EnvironmentError:
except EnvironmentError as e:
logger.debug(f"EnvironmentError: {e}")
blocks = None
cache_dir = kwargs.pop("cache_dir", None)

View File

@@ -29,13 +29,20 @@ else:
"EDIT_AUTO_BLOCKS",
"EDIT_BLOCKS",
"EDIT_INPAINT_BLOCKS",
"EDIT_PLUS_AUTO_BLOCKS",
"EDIT_PLUS_BLOCKS",
"IMAGE2IMAGE_BLOCKS",
"INPAINT_BLOCKS",
"TEXT2IMAGE_BLOCKS",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditPlusAutoBlocks",
]
_import_structure["modular_pipeline"] = [
"QwenImageEditModularPipeline",
"QwenImageEditPlusModularPipeline",
"QwenImageModularPipeline",
]
_import_structure["modular_pipeline"] = ["QwenImageEditModularPipeline", "QwenImageModularPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
@@ -54,13 +61,20 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
EDIT_AUTO_BLOCKS,
EDIT_BLOCKS,
EDIT_INPAINT_BLOCKS,
EDIT_PLUS_AUTO_BLOCKS,
EDIT_PLUS_BLOCKS,
IMAGE2IMAGE_BLOCKS,
INPAINT_BLOCKS,
TEXT2IMAGE_BLOCKS,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditPlusAutoBlocks,
)
from .modular_pipeline import (
QwenImageEditModularPipeline,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
)
from .modular_pipeline import QwenImageEditModularPipeline, QwenImageModularPipeline
else:
import sys

View File

@@ -203,7 +203,6 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
block_state.latents = components.pachifier.pack_latents(block_state.latents)
self.set_block_state(state, block_state)
return components, state
@@ -571,7 +570,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
@property
def description(self) -> str:
return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be place after prepare_latents step"
return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents step"
@property
def inputs(self) -> List[InputParam]:

View File

@@ -128,6 +128,61 @@ def get_qwen_prompt_embeds_edit(
return prompt_embeds, encoder_attention_mask
def get_qwen_prompt_embeds_edit_plus(
text_encoder,
processor,
prompt: Union[str, List[str]] = None,
image: Optional[Union[torch.Tensor, List[PIL.Image.Image], PIL.Image.Image]] = None,
prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
img_template_encode: str = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>",
prompt_template_encode_start_idx: int = 64,
device: Optional[torch.device] = None,
):
prompt = [prompt] if isinstance(prompt, str) else prompt
if isinstance(image, list):
base_img_prompt = ""
for i, img in enumerate(image):
base_img_prompt += img_template_encode.format(i + 1)
elif image is not None:
base_img_prompt = img_template_encode.format(1)
else:
base_img_prompt = ""
template = prompt_template_encode
drop_idx = prompt_template_encode_start_idx
txt = [template.format(base_img_prompt + e) for e in prompt]
model_inputs = processor(
text=txt,
images=image,
padding=True,
return_tensors="pt",
).to(device)
outputs = text_encoder(
input_ids=model_inputs.input_ids,
attention_mask=model_inputs.attention_mask,
pixel_values=model_inputs.pixel_values,
image_grid_thw=model_inputs.image_grid_thw,
output_hidden_states=True,
)
hidden_states = outputs.hidden_states[-1]
split_hidden_states = _extract_masked_hidden(hidden_states, model_inputs.attention_mask)
split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
max_seq_len = max([e.size(0) for e in split_hidden_states])
prompt_embeds = torch.stack(
[torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
)
encoder_attention_mask = torch.stack(
[torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
)
prompt_embeds = prompt_embeds.to(device=device)
return prompt_embeds, encoder_attention_mask
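
A minimal sketch, with toy shapes, of the padding performed above: per-prompt hidden states of different lengths are right-padded with zeros to the longest sequence and stacked, together with a matching attention mask.

import torch

split_hidden_states = [torch.randn(5, 8), torch.randn(3, 8)]   # assumed toy (seq_len, dim) pairs
attn_mask_list = [torch.ones(e.size(0), dtype=torch.long) for e in split_hidden_states]
max_seq_len = max(e.size(0) for e in split_hidden_states)
prompt_embeds = torch.stack(
    [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
)
encoder_attention_mask = torch.stack(
    [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
)
assert prompt_embeds.shape == (2, 5, 8) and encoder_attention_mask.shape == (2, 5)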
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -266,6 +321,83 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
model_name = "qwenimage"
def __init__(
self,
input_name: str = "image",
output_name: str = "resized_image",
vae_image_output_name: str = "vae_image",
):
"""Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio.
This block resizes an input image or a list input images and exposes the resized result under configurable
input and output names. Use this when you need to wire the resize step to different image fields (e.g.,
"image", "control_image")
Args:
input_name (str, optional): Name of the image field to read from the
pipeline state. Defaults to "image".
output_name (str, optional): Name of the resized image field to write
back to the pipeline state. Defaults to "resized_image".
vae_image_output_name (str, optional): Name of the image field
to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus
processes the input image(s) differently for the VL and the VAE.
"""
if not isinstance(input_name, str) or not isinstance(output_name, str):
raise ValueError(
f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
)
self.condition_image_size = 384 * 384
self._image_input_name = input_name
self._resized_image_output_name = output_name
self._vae_image_output_name = vae_image_output_name
super().__init__()
@property
def intermediate_outputs(self) -> List[OutputParam]:
return super().intermediate_outputs + [
OutputParam(
name=self._vae_image_output_name,
type_hint=List[PIL.Image.Image],
description="The images to be processed which will be further used by the VAE encoder.",
),
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
images = getattr(block_state, self._image_input_name)
if not is_valid_image_imagelist(images):
raise ValueError(f"Images must be image or list of images but are {type(images)}")
if (
not isinstance(images, torch.Tensor)
and isinstance(images, PIL.Image.Image)
and not isinstance(images, list)
):
images = [images]
# TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s
condition_images = []
vae_images = []
for img in images:
image_width, image_height = img.size
condition_width, condition_height, _ = calculate_dimensions(
self.condition_image_size, image_width / image_height
)
condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width))
vae_images.append(img)
setattr(block_state, self._resized_image_output_name, condition_images)
setattr(block_state, self._vae_image_output_name, vae_images)
self.set_block_state(state, block_state)
return components, state
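
A hypothetical stand-in for the `calculate_dimensions(target_area, aspect_ratio)` helper used above (the real helper also returns a third value and its rounding/alignment rule may differ): it picks a size close to the target area while keeping the aspect ratio.

import math

def approx_dimensions(target_area, aspect_ratio, multiple_of=32):
    width = round(math.sqrt(target_area * aspect_ratio) / multiple_of) * multiple_of
    height = round(math.sqrt(target_area / aspect_ratio) / multiple_of) * multiple_of
    return width, height

print(approx_dimensions(384 * 384, 16 / 9))  # (512, 288) for a 16:9 input under these assumptions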
class QwenImageTextEncoderStep(ModularPipelineBlocks):
model_name = "qwenimage"
@@ -511,6 +643,61 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
model_name = "qwenimage"
@property
def expected_configs(self) -> List[ConfigSpec]:
return [
ConfigSpec(
name="prompt_template_encode",
default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
),
ConfigSpec(
name="img_template_encode",
default="Picture {}: <|vision_start|><|image_pad|><|vision_end|>",
),
ConfigSpec(name="prompt_template_encode_start_idx", default=64),
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
self.check_inputs(block_state.prompt, block_state.negative_prompt)
device = components._execution_device
block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds_edit_plus(
components.text_encoder,
components.processor,
prompt=block_state.prompt,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
device=device,
)
if components.requires_unconditional_embeds:
negative_prompt = block_state.negative_prompt or " "
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = (
get_qwen_prompt_embeds_edit_plus(
components.text_encoder,
components.processor,
prompt=negative_prompt,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
device=device,
)
)
self.set_block_state(state, block_state)
return components, state
class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
model_name = "qwenimage"
@@ -612,12 +799,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("resized_image"),
InputParam("image"),
InputParam("height"),
InputParam("width"),
]
return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
@@ -661,6 +843,47 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep):
model_name = "qwenimage-edit-plus"
vae_image_size = 1024 * 1024
@property
def description(self) -> str:
return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing."
@property
def inputs(self) -> List[InputParam]:
return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
if block_state.vae_image is None and block_state.image is None:
raise ValueError("`vae_image` and `image` cannot be None at the same time")
if block_state.vae_image is None:
image = block_state.image
self.check_inputs(
height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
block_state.processed_image = components.image_processor.preprocess(
image=image, height=height, width=width
)
else:
width, height = block_state.vae_image[0].size
image = block_state.vae_image
block_state.processed_image = components.image_processor.preprocess(
image=image, height=height, width=width
)
self.set_block_state(state, block_state)
return components, state
class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"
@@ -738,7 +961,6 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
dtype=dtype,
latent_channels=components.num_channels_latents,
)
setattr(block_state, self._image_latents_output_name, image_latents)
self.set_block_state(state, block_state)

View File

@@ -37,6 +37,9 @@ from .denoise import (
)
from .encoders import (
QwenImageControlNetVaeEncoderStep,
QwenImageEditPlusProcessImagesInputStep,
QwenImageEditPlusResizeDynamicStep,
QwenImageEditPlusTextEncoderStep,
QwenImageEditResizeDynamicStep,
QwenImageEditTextEncoderStep,
QwenImageInpaintProcessImagesInputStep,
@@ -872,7 +875,151 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
)
# 3. all block presets supported in QwenImage & QwenImage-Edit
#################### QwenImage Edit Plus #####################
# 3. QwenImage-Edit Plus
## 3.1 QwenImage-Edit Plus / edit
#### QwenImage-Edit Plus vl encoder: take both image and text prompts
QwenImageEditPlusVLEncoderBlocks = InsertableDict(
[
("resize", QwenImageEditPlusResizeDynamicStep()),
("encode", QwenImageEditPlusTextEncoderStep()),
]
)
class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = QwenImageEditPlusVLEncoderBlocks.values()
block_names = QwenImageEditPlusVLEncoderBlocks.keys()
@property
def description(self) -> str:
return "QwenImage-Edit Plus VL encoder step that encode the image an text prompts together."
#### QwenImage-Edit Plus vae encoder
QwenImageEditPlusVaeEncoderBlocks = InsertableDict(
[
("resize", QwenImageEditPlusResizeDynamicStep()), # edit plus has a different resize step
("preprocess", QwenImageEditPlusProcessImagesInputStep()), # vae_image -> processed_image
("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents
]
)
class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = QwenImageEditPlusVaeEncoderBlocks.values()
block_names = QwenImageEditPlusVaeEncoderBlocks.keys()
@property
def description(self) -> str:
return "Vae encoder step that encode the image inputs into their latent representations."
#### QwenImage Edit Plus presets
EDIT_PLUS_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditPlusVLEncoderStep()),
("vae_encoder", QwenImageEditPlusVaeEncoderStep()),
("input", QwenImageEditInputStep()),
("prepare_latents", QwenImagePrepareLatentsStep()),
("set_timesteps", QwenImageSetTimestepsStep()),
("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
("denoise", QwenImageEditDenoiseStep()),
("decode", QwenImageDecodeStep()),
]
)
# auto before_denoise step for edit tasks
class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [QwenImageEditBeforeDenoiseStep]
block_names = ["edit"]
block_trigger_inputs = ["image_latents"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n"
+ "This is an auto pipeline block that works for edit (img2img) task.\n"
+ " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n"
+ " - if `image_latents` is not provided, step will be skipped."
)
## 3.2 QwenImage-Edit Plus/auto encoders
class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [
QwenImageEditPlusVaeEncoderStep,
]
block_names = ["edit"]
block_trigger_inputs = ["image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations. \n"
" This is an auto pipeline block that works for edit task.\n"
+ " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n"
+ " - if `image` is not provided, step will be skipped."
)
## 3.3 QwenImage-Edit/auto blocks & presets
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditAutoInputStep,
QwenImageEditPlusAutoBeforeDenoiseStep,
QwenImageEditAutoDenoiseStep,
]
block_names = ["input", "before_denoise", "denoise"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+ "This step support edit (img2img) workflow for QwenImage Edit Plus:\n"
+ " - When `image_latents` is provided, it will be used for edit (img2img) task.\n"
)
EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditPlusVLEncoderStep()),
("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()),
("denoise", QwenImageEditPlusCoreDenoiseStep()),
("decode", QwenImageAutoDecodeStep()),
]
)
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@property
def description(self):
return (
"Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n"
+ "- for edit (img2img) generation, you need to provide `image`\n"
)
# 3. all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus
ALL_BLOCKS = {
@@ -880,8 +1027,10 @@ ALL_BLOCKS = {
"img2img": IMAGE2IMAGE_BLOCKS,
"edit": EDIT_BLOCKS,
"edit_inpaint": EDIT_INPAINT_BLOCKS,
"edit_plus": EDIT_PLUS_BLOCKS,
"inpaint": INPAINT_BLOCKS,
"controlnet": CONTROLNET_BLOCKS,
"auto": AUTO_BLOCKS,
"edit_auto": EDIT_AUTO_BLOCKS,
"edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS,
}

View File

@@ -196,3 +196,13 @@ class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1
return requires_unconditional_embeds
class QwenImageEditPlusModularPipeline(QwenImageEditModularPipeline):
"""
A ModularPipeline for QwenImage-Edit Plus.
> [!WARNING]
> This is an experimental feature and is likely to change in the future.
"""
default_blocks_name = "QwenImageEditPlusAutoBlocks"

View File

@@ -17,7 +17,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import UNet2DConditionLoadersMixin

View File

@@ -95,6 +95,7 @@ from .qwenimage import (
QwenImageControlNetPipeline,
QwenImageEditInpaintPipeline,
QwenImageEditPipeline,
QwenImageEditPlusPipeline,
QwenImageImg2ImgPipeline,
QwenImageInpaintPipeline,
QwenImagePipeline,
@@ -186,6 +187,7 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
("flux-kontext", FluxKontextPipeline),
("qwenimage", QwenImageImg2ImgPipeline),
("qwenimage-edit", QwenImageEditPipeline),
("qwenimage-edit-plus", QwenImageEditPlusPipeline),
]
)

View File

@@ -14,7 +14,6 @@
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from transformers import BertTokenizer
from transformers.activations import QuickGELUActivation as QuickGELU

View File

@@ -18,7 +18,6 @@ from typing import Callable, List, Optional, Union
import numpy as np
import PIL.Image
import torch
import torch.utils.checkpoint
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from ....image_processor import VaeImageProcessor

View File

@@ -16,7 +16,6 @@ import inspect
from typing import Callable, List, Optional, Union
import torch
import torch.utils.checkpoint
from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer
from ....image_processor import VaeImageProcessor

View File

@@ -17,7 +17,6 @@ from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutput

View File

@@ -4,7 +4,6 @@ from typing import List, Optional, Tuple, Union
import numpy as np
import PIL.Image
import torch
import torch.utils.checkpoint
from ...models import UNet2DModel, VQModel
from ...schedulers import (

View File

@@ -121,6 +121,38 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
result = torch.lerp(latents, result, factor)
return result
def tone_map_latents(self, latents: torch.Tensor, compression: float) -> torch.Tensor:
"""
Applies a non-linear tone-mapping function to latent values to reduce their dynamic range in a perceptually
smooth way using a sigmoid-based compression.
This is useful for regularizing high-variance latents or for conditioning outputs during generation, especially
when controlling dynamic behavior with a `compression` factor.
Args:
latents : torch.Tensor
Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range.
compression : float
Compression strength in the range [0, 1].
- 0.0: No tone-mapping (identity transform)
- 1.0: Full compression effect
Returns:
torch.Tensor
The tone-mapped latent tensor of the same shape as input.
"""
# Remap [0-1] to [0-0.75] and apply sigmoid compression in one shot
scale_factor = compression * 0.75
abs_latents = torch.abs(latents)
# Sigmoid compression: sigmoid shifts large values toward 0.2, small values stay ~1.0
# When scale_factor=0, sigmoid term vanishes, when scale_factor=0.75, full effect
sigmoid_term = torch.sigmoid(4.0 * scale_factor * (abs_latents - 1.0))
scales = 1.0 - 0.8 * scale_factor * sigmoid_term
filtered = latents * scales
return filtered
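
A quick numeric sketch of the curve above (values are illustrative): at `compression=0` the transform is the identity, while at `compression=1` large magnitudes are compressed toward a smaller range.

import torch

def tone_map(latents, compression):
    scale_factor = compression * 0.75
    sigmoid_term = torch.sigmoid(4.0 * scale_factor * (latents.abs() - 1.0))
    return latents * (1.0 - 0.8 * scale_factor * sigmoid_term)

x = torch.tensor([0.1, 1.0, 2.0, 4.0])
print(tone_map(x, 0.0))  # unchanged: the sigmoid term is scaled by zero
print(tone_map(x, 1.0))  # large values are pulled toward a smaller range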
@staticmethod
# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents
def _normalize_latents(
@@ -196,7 +228,7 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
)
self.vae.disable_tiling()
def check_inputs(self, video, height, width, latents):
def check_inputs(self, video, height, width, latents, tone_map_compression_ratio):
if height % self.vae_spatial_compression_ratio != 0 or width % self.vae_spatial_compression_ratio != 0:
raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")
@@ -205,6 +237,9 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
if video is None and latents is None:
raise ValueError("One of `video` or `latents` has to be provided.")
if not (0 <= tone_map_compression_ratio <= 1):
raise ValueError("`tone_map_compression_ratio` must be in the range [0, 1]")
@torch.no_grad()
def __call__(
self,
@@ -215,6 +250,7 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
decode_timestep: Union[float, List[float]] = 0.0,
decode_noise_scale: Optional[Union[float, List[float]]] = None,
adain_factor: float = 0.0,
tone_map_compression_ratio: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
@@ -224,6 +260,7 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
height=height,
width=width,
latents=latents,
tone_map_compression_ratio=tone_map_compression_ratio,
)
if video is not None:
@@ -266,6 +303,9 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
else:
latents = latents_upsampled
if tone_map_compression_ratio > 0.0:
latents = self.tone_map_latents(latents, tone_map_compression_ratio)
if output_type == "latent":
latents = self._normalize_latents(
latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor

View File

@@ -86,15 +86,14 @@ class MarigoldDepthOutput(BaseOutput):
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
Predicted depth maps with values in the range [0, 1]. The shape is `numimages × 1 × height × width` for
`torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
for `np.ndarray`.
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `numimages × 1 ×
height × width` for `torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
The shape is `numimages * numensemble × 4 × latentheight × latentwidth`.
"""
prediction: Union[np.ndarray, torch.Tensor]

View File

@@ -99,17 +99,17 @@ class MarigoldIntrinsicsOutput(BaseOutput):
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3
\times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width
\times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of
the intrinsic image decomposition.
Predicted image intrinsics with values in the range [0, 1]. The shape is `(numimages * numtargets) × 3 ×
height × width` for `torch.Tensor` or `(numimages * numtargets) × height × width × 3` for `np.ndarray`,
where `numtargets` corresponds to the number of predicted target modalities of the intrinsic image
decomposition.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages *
numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times
height \times width \times 3$ for `np.ndarray`.
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `(numimages *
numtargets) × 3 × height × width` for `torch.Tensor` or `(numimages * numtargets) × height × width × 3` for
`np.ndarray`.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $(numimages * numensemble) \times (numtargets * 4) \times latentheight \times latentwidth$.
The shape is `(numimages * numensemble) × (numtargets * 4) × latentheight × latentwidth`.
"""
prediction: Union[np.ndarray, torch.Tensor]

View File

@@ -81,15 +81,14 @@ class MarigoldNormalsOutput(BaseOutput):
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times
width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
Predicted normals with values in the range [-1, 1]. The shape is `numimages × 3 × height × width` for
`torch.Tensor` or `numimages × height × width × 3` for `np.ndarray`.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
for `np.ndarray`.
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `numimages × 1 ×
height × width` for `torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
The shape is `numimages * numensemble × 4 × latentheight × latentwidth`.
"""
prediction: Union[np.ndarray, torch.Tensor]

View File

@@ -838,6 +838,9 @@ def load_sub_model(
else:
loading_kwargs["low_cpu_mem_usage"] = False
if is_transformers_model and is_transformers_version(">=", "4.57.0"):
loading_kwargs.pop("offload_state_dict")
if (
quantization_config is not None
and isinstance(quantization_config, PipelineQuantizationConfig)

View File

@@ -18,7 +18,6 @@ from typing import Optional
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...models.modeling_utils import ModelMixin

Some files were not shown because too many files have changed in this diff.