mirror of https://github.com/huggingface/diffusers.git synced 2026-01-29 07:22:12 +03:00

Merge branch 'main' into migrate-lora-pytest

Author: Sayak Paul
Date: 2025-10-17 07:55:31 +05:30
Committed by: GitHub

104 changed files with 2358 additions and 1467 deletions

View File

@@ -38,9 +38,8 @@ jobs:
run: |
apt update
apt install -y libpq-dev postgresql-client
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install -r benchmarks/requirements.txt
uv pip install -e ".[quality]"
uv pip install -r benchmarks/requirements.txt
- name: Environment
run: |
python utils/print_env.py

View File

@@ -72,7 +72,6 @@ jobs:
image-name:
- diffusers-pytorch-cpu
- diffusers-pytorch-cuda
- diffusers-pytorch-cuda
- diffusers-pytorch-xformers-cuda
- diffusers-pytorch-minimum-cuda
- diffusers-doc-builder

View File

@@ -12,7 +12,33 @@ concurrency:
cancel-in-progress: true
jobs:
check-links:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install uv
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Install doc-builder
run: |
uv pip install --system git+https://github.com/huggingface/doc-builder.git@main
- name: Check documentation links
run: |
uv run doc-builder check-links docs/source/en
build:
needs: check-links
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}

View File

@@ -74,7 +74,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install --upgrade huggingface_hub
# Check secret is set

View File

@@ -71,10 +71,9 @@ jobs:
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
uv pip install -e ".[quality]"
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
@@ -84,7 +83,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
--report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
@@ -124,11 +123,10 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install pytest-reportlog
- name: Environment
run: python utils/print_env.py
@@ -139,7 +137,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
--report-log=tests_torch_${{ matrix.module }}_cuda.log \
@@ -152,7 +150,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v --make-reports=examples_torch_cuda \
--report-log=examples_torch_cuda.log \
examples/
@@ -191,8 +189,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -201,7 +198,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -232,11 +229,10 @@ jobs:
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
@@ -247,7 +243,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
BIG_GPU_MEMORY: 40
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-m "big_accelerator" \
--make-reports=tests_big_gpu_torch_cuda \
--report-log=tests_big_gpu_torch_cuda.log \
@@ -282,10 +278,9 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
@@ -297,7 +292,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_minimum_version_cuda \
tests/models/test_modeling_common.py \
@@ -357,13 +352,12 @@ jobs:
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install -U ${{ matrix.config.backend }}
uv pip install -e ".[quality]"
uv pip install -U ${{ matrix.config.backend }}
if [ "${{ join(matrix.config.additional_deps, ' ') }}" != "" ]; then
python -m uv pip install ${{ join(matrix.config.additional_deps, ' ') }}
uv pip install ${{ join(matrix.config.additional_deps, ' ') }}
fi
python -m uv pip install pytest-reportlog
uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
@@ -374,7 +368,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
BIG_GPU_MEMORY: 40
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_${{ matrix.config.backend }}_torch_cuda \
--report-log=tests_${{ matrix.config.backend }}_torch_cuda.log \
tests/quantization/${{ matrix.config.test_location }}
@@ -409,10 +403,9 @@ jobs:
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install -U bitsandbytes optimum_quanto
python -m uv pip install pytest-reportlog
uv pip install -e ".[quality]"
uv pip install -U bitsandbytes optimum_quanto
uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
@@ -423,7 +416,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
BIG_GPU_MEMORY: 40
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_pipeline_level_quant_torch_cuda \
--report-log=tests_pipeline_level_quant_torch_cuda.log \
tests/quantization/test_pipeline_level_quantization.py
@@ -523,11 +516,11 @@ jobs:
# - name: Install dependencies
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python -m pip install --upgrade pip uv
# ${CONDA_RUN} python -m uv pip install -e [quality,test]
# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} python -m uv pip install pytest-reportlog
# ${CONDA_RUN} pip install --upgrade pip uv
# ${CONDA_RUN} uv pip install -e ".[quality]"
# ${CONDA_RUN} uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} uv pip install pytest-reportlog
# - name: Environment
# shell: arch -arch arm64 bash {0}
# run: |
@@ -538,7 +531,7 @@ jobs:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# run: |
# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports
@@ -579,11 +572,11 @@ jobs:
# - name: Install dependencies
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python -m pip install --upgrade pip uv
# ${CONDA_RUN} python -m uv pip install -e [quality,test]
# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} python -m uv pip install pytest-reportlog
# ${CONDA_RUN} pip install --upgrade pip uv
# ${CONDA_RUN} uv pip install -e ".[quality]"
# ${CONDA_RUN} uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} uv pip install pytest-reportlog
# - name: Environment
# shell: arch -arch arm64 bash {0}
# run: |
@@ -594,7 +587,7 @@ jobs:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# run: |
# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports

View File

@@ -25,11 +25,8 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install --upgrade pip uv
python -m uv pip install -e .
python -m uv pip install pytest
pip install -e .
pip install pytest
- name: Check for soft dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
pytest tests/others/test_dependencies.py
pytest tests/others/test_dependencies.py

View File

@@ -42,7 +42,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check quality
run: make quality
@@ -62,7 +62,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check repo consistency
run: |
@@ -108,21 +108,18 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
uv pip install -e ".[quality]"
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run fast PyTorch Pipeline CPU tests
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/modular_pipelines

View File

@@ -33,8 +33,7 @@ jobs:
fetch-depth: 0
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python utils/print_env.py
@@ -90,19 +89,16 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install -e [quality,test]
python -m pip install accelerate
uv pip install -e ".[quality]"
uv pip install accelerate
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run all selected tests on CPU
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }}
pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }}
- name: Failure short reports
if: ${{ failure() }}
@@ -148,19 +144,16 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install -e [quality,test]
pip install -e [quality]
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run Hub tests for models, schedulers, and pipelines on a staging env
if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
HUGGINGFACE_CO_STAGING=true python -m pytest \
HUGGINGFACE_CO_STAGING=true pytest \
-m "is_staging_test" \
--make-reports=tests_${{ matrix.config.report }} \
tests

View File

@@ -38,7 +38,7 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check quality
run: make quality
@@ -58,7 +58,7 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check repo consistency
run: |
@@ -114,21 +114,18 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
uv pip install -e ".[quality]"
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run fast PyTorch Pipeline CPU tests
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/pipelines
@@ -136,8 +133,7 @@ jobs:
- name: Run fast PyTorch Model Scheduler CPU tests
if: ${{ matrix.config.framework == 'pytorch_models' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx and not Dependency" \
--make-reports=tests_${{ matrix.config.report }} \
tests/models tests/schedulers tests/others
@@ -145,9 +141,8 @@ jobs:
- name: Run example PyTorch CPU tests
if: ${{ matrix.config.framework == 'pytorch_examples' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install peft timm
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
uv pip install ".[training]"
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_${{ matrix.config.report }} \
examples
@@ -195,19 +190,16 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run Hub tests for models, schedulers, and pipelines on a staging env
if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
HUGGINGFACE_CO_STAGING=true python -m pytest \
HUGGINGFACE_CO_STAGING=true pytest \
-m "is_staging_test" \
--make-reports=tests_${{ matrix.config.report }} \
tests
@@ -249,27 +241,24 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
# TODO (sayakpaul, DN6): revisit `--no-deps`
python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
python -m uv pip install -U tokenizers
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
uv pip install -U tokenizers
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run fast PyTorch LoRA tests with PEFT
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v \
--make-reports=tests_peft_main \
tests/lora/
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v \
--make-reports=tests_models_lora_peft_main \
tests/models/ -k "lora"

View File

@@ -39,7 +39,7 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check quality
run: make quality
@@ -59,7 +59,7 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade pip
pip install .[quality]
- name: Check repo consistency
run: |
@@ -88,8 +88,7 @@ jobs:
fetch-depth: 2
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python utils/print_env.py
@@ -130,10 +129,9 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip install -e ".[quality]"
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
- name: Environment
run: |
@@ -152,13 +150,13 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
if [ "${{ matrix.module }}" = "ip_adapters" ]; then
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
else
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx and $pattern" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
@@ -200,11 +198,10 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
- name: Environment
run: |
@@ -225,10 +222,10 @@ jobs:
run: |
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
if [ -z "$pattern" ]; then
python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
--make-reports=tests_torch_cuda_${{ matrix.module }}
else
python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
--make-reports=tests_torch_cuda_${{ matrix.module }}
fi
@@ -265,22 +262,19 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
python -m uv pip install -e [quality,test,training]
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run example tests on GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install timm
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}

View File

@@ -25,12 +25,8 @@ jobs:
python-version: "3.8"
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install --upgrade pip uv
python -m uv pip install -e .
python -m uv pip install torch torchvision torchaudio
python -m uv pip install pytest
pip install -e .
pip install torch torchvision torchaudio pytest
- name: Check for soft dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
pytest tests/others/test_dependencies.py
pytest tests/others/test_dependencies.py

View File

@@ -34,8 +34,7 @@ jobs:
fetch-depth: 2
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python utils/print_env.py
@@ -75,9 +74,8 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
python utils/print_env.py
@@ -87,7 +85,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
@@ -126,10 +124,9 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
@@ -141,7 +138,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_cuda_${{ matrix.module }} \
tests/${{ matrix.module }}
@@ -180,8 +177,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -190,7 +186,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -223,8 +219,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -232,7 +227,7 @@ jobs:
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
@@ -264,21 +259,18 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run example tests on GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install timm
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}

View File

@@ -60,19 +60,16 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run fast PyTorch CPU tests
if: ${{ matrix.config.framework == 'pytorch' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/
@@ -80,9 +77,8 @@ jobs:
- name: Run example PyTorch CPU tests
if: ${{ matrix.config.framework == 'pytorch_examples' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install peft timm
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
uv pip install ".[training]"
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_${{ matrix.config.report }} \
examples

View File

@@ -32,8 +32,7 @@ jobs:
fetch-depth: 2
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
uv pip install -e ".[quality]"
- name: Environment
run: |
python utils/print_env.py
@@ -73,9 +72,8 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
python utils/print_env.py
@@ -85,7 +83,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
@@ -124,10 +122,9 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
@@ -139,7 +136,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
tests/${{ matrix.module }}
@@ -175,10 +172,9 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
@@ -190,7 +186,7 @@ jobs:
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_minimum_cuda \
tests/models/test_modeling_common.py \
@@ -235,8 +231,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -245,7 +240,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -278,8 +273,7 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python utils/print_env.py
@@ -287,7 +281,7 @@ jobs:
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
@@ -321,21 +315,18 @@ jobs:
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test,training]
uv pip install -e ".[quality,training]"
- name: Environment
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python utils/print_env.py
- name: Run example tests on GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install timm
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}

View File

@@ -63,9 +63,8 @@ jobs:
- name: Install pytest
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft
uv pip install -e ".[quality]"
uv pip install peft
- name: Run tests
env:

.gitignore
View File

@@ -125,6 +125,9 @@ dmypy.json
.vs
.vscode
# Cursor
.cursor
# Pycharm
.idea

View File

@@ -1,56 +1,45 @@
FROM ubuntu:20.04
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
RUN apt-get -y update && apt-get install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1 \
zip \
wget
RUN apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
python3.10 \
python3-pip \
libgl1 \
zip \
wget \
python3.10-venv && \
rm -rf /var/lib/apt/lists
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
ENV UV_PYTHON=/usr/local/bin/python
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark \
--extra-index-url https://download.pytorch.org/whl/cpu && \
python3.10 -m uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
matplotlib \
setuptools==69.5.1 \
bitsandbytes \
torchao \
gguf \
optimum-quanto
RUN pip install uv
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
--extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
numpy==1.26.4 \
hf_transfer \
setuptools==69.5.1 \
bitsandbytes \
torchao \
gguf \
optimum-quanto
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
CMD ["/bin/bash"]

View File

@@ -1,50 +1,38 @@
FROM ubuntu:20.04
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
RUN apt-get -y update && apt-get install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1
RUN apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
python3.10 \
python3.10-dev \
python3-pip \
libgl1 \
python3.10-venv && \
rm -rf /var/lib/apt/lists
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
ENV UV_PYTHON=/usr/local/bin/python
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark \
--extra-index-url https://download.pytorch.org/whl/cpu && \
python3.10 -m uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers matplotlib \
hf_transfer
RUN pip install uv
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
--extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
numpy==1.26.4 \
hf_transfer
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
CMD ["/bin/bash"]

View File

@@ -2,11 +2,13 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
&& add-apt-repository ppa:deadsnakes/ppa && \
apt-get update
RUN apt install -y bash \
build-essential \
@@ -14,38 +16,34 @@ RUN apt install -y bash \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1 \
python3.10 \
python3.10-dev \
python3 \
python3-pip \
python3.10-venv && \
rm -rf /var/lib/apt/lists
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark && \
python3.10 -m pip install --no-cache-dir \
torchaudio
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
hf_transfer \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
pytorch-lightning \
pytorch-lightning \
hf_transfer
CMD ["/bin/bash"]

View File

@@ -2,6 +2,7 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive
ENV MINIMUM_SUPPORTED_TORCH_VERSION="2.1.0"
ENV MINIMUM_SUPPORTED_TORCHVISION_VERSION="0.16.0"
@@ -9,7 +10,8 @@ ENV MINIMUM_SUPPORTED_TORCHAUDIO_VERSION="2.1.0"
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
&& add-apt-repository ppa:deadsnakes/ppa && \
apt-get update
RUN apt install -y bash \
build-essential \
@@ -17,37 +19,34 @@ RUN apt install -y bash \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1 \
python3.10 \
python3.10-dev \
python3 \
python3-pip \
python3.10-venv && \
rm -rf /var/lib/apt/lists
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
RUN uv pip install --no-cache-dir \
torch==$MINIMUM_SUPPORTED_TORCH_VERSION \
torchvision==$MINIMUM_SUPPORTED_TORCHVISION_VERSION \
torchaudio==$MINIMUM_SUPPORTED_TORCHAUDIO_VERSION \
invisible_watermark && \
python3.10 -m pip install --no-cache-dir \
torchaudio==$MINIMUM_SUPPORTED_TORCHAUDIO_VERSION
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
hf_transfer \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
pytorch-lightning \
hf_transfer
CMD ["/bin/bash"]

View File

@@ -2,50 +2,49 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
&& add-apt-repository ppa:deadsnakes/ppa && \
apt-get update
RUN apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.10 \
python3.10-dev \
python3-pip \
python3.10-venv && \
rm -rf /var/lib/apt/lists
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libglib2.0-0 \
libsndfile1-dev \
libgl1 \
python3 \
python3-pip \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark && \
python3.10 -m uv pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
hf_transfer \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
xformers \
hf_transfer
RUN uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
# Extra dependencies
RUN uv pip install --no-cache-dir \
accelerate \
numpy==1.26.4 \
pytorch-lightning \
hf_transfer \
xformers
CMD ["/bin/bash"]

View File

@@ -49,7 +49,7 @@
isExpanded: false
sections:
- local: using-diffusers/weighted_prompts
title: Prompt techniques
title: Prompting
- local: using-diffusers/create_a_server
title: Create a server
- local: using-diffusers/batched_inference

View File

@@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License.
[IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder.
> [!TIP]
> Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter [loading](../../using-diffusers/loading_adapters#ip-adapter) guide, and you can see how to use it in the [usage](../../using-diffusers/ip_adapter) guide.
> Learn how to load and use an IP-Adapter checkpoint and image in the [IP-Adapter](../../using-diffusers/ip_adapter) guide.
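To give the tip above concrete shape, here is a minimal sketch of the loading pattern it points to; the SDXL checkpoint, the `h94/IP-Adapter` weights, and the reference image URL are illustrative choices rather than code from this guide:

```python
import torch
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image

# Load a base pipeline, then attach IP-Adapter weights to its image cross-attention layers.
pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
pipeline.set_ip_adapter_scale(0.6)  # balance the image prompt against the text prompt

# Any RGB image can serve as the image prompt.
image_prompt = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"
)
image = pipeline(
    prompt="a polar bear sitting in a chair drinking a milkshake",
    ip_adapter_image=image_prompt,
    num_inference_steps=50,
).images[0]
```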
## IPAdapterMixin

View File

@@ -34,7 +34,7 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
- [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, and unload LoRAs, and more.
> [!TIP]
> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide.
> To learn more about how to load LoRA weights, see the [LoRA](../../tutorials/using_peft_for_inference) loading guide.
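As a quick illustration of that workflow, a minimal sketch assuming an SDXL base model and a publicly available LoRA (both placeholder choices):

```python
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Load LoRA weights and name the adapter so it can be re-weighted, disabled, or mixed later.
pipeline.load_lora_weights(
    "CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy"
)
pipeline.set_adapters(["toy"], adapter_weights=[0.8])

image = pipeline("a toy face of a hacker with a hoodie", num_inference_steps=30).images[0]

# Optionally merge the adapter into the base weights for faster inference, and undo it afterwards.
pipeline.fuse_lora()
pipeline.unfuse_lora()
```

Naming the adapter up front is what later makes multi-adapter calls such as `set_adapters` with several adapter names possible.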
## LoraBaseMixin

View File

@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
# PEFT
Diffusers supports loading adapters such as [LoRA](../../using-diffusers/loading_adapters) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`], [`SD3Transformer2DModel`] to operate with an adapter.
Diffusers supports loading adapters such as [LoRA](../../tutorials/using_peft_for_inference) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`], [`SD3Transformer2DModel`] to operate with an adapter.
> [!TIP]
> Refer to the [Inference with PEFT](../../tutorials/using_peft_for_inference.md) tutorial for an overview of how to use PEFT in Diffusers for inference.
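A minimal sketch of what [`~loaders.peft.PeftAdapterMixin`] enables at the model level (attaching a fresh LoRA adapter to a UNet), assuming the Stable Diffusion v1-5 checkpoint and illustrative LoRA hyperparameters:

```python
from diffusers import UNet2DConditionModel
from peft import LoraConfig

unet = UNet2DConditionModel.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet"
)

# Inject a small-rank LoRA into the attention projections of the UNet in place.
lora_config = LoraConfig(
    r=4,
    lora_alpha=4,
    init_lora_weights="gaussian",
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],
)
unet.add_adapter(lora_config)

# Adapters added this way can be switched off and on again without reloading the model.
unet.disable_adapters()
unet.enable_adapters()
```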

View File

@@ -17,7 +17,7 @@ Textual Inversion is a training method for personalizing models by learning new
[`TextualInversionLoaderMixin`] provides a function for loading Textual Inversion embeddings from Diffusers and Automatic1111 into the text encoder and loading a special token to activate the embeddings.
> [!TIP]
> To learn more about how to load Textual Inversion embeddings, see the [Textual Inversion](../../using-diffusers/loading_adapters#textual-inversion) loading guide.
> To learn more about how to load Textual Inversion embeddings, see the [Textual Inversion](../../using-diffusers/textual_inversion_inference) loading guide.
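For orientation, a minimal sketch of loading a Textual Inversion embedding and triggering it with its special token; the base checkpoint and the `sd-concepts-library/cat-toy` concept are placeholder choices:

```python
import torch
from diffusers import StableDiffusionPipeline

pipeline = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Loading the embedding registers its trigger token with the tokenizer and text encoder.
pipeline.load_textual_inversion("sd-concepts-library/cat-toy")

# The learned concept is activated by using its token in the prompt.
image = pipeline("a <cat-toy> sitting on a bookshelf", num_inference_steps=30).images[0]
```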
## TextualInversionLoaderMixin

View File

@@ -17,7 +17,7 @@ This class is useful when *only* loading weights into a [`SD3Transformer2DModel`
The [`SD3Transformer2DLoadersMixin`] class currently only loads IP-Adapter weights, but will be used in the future to save weights and load LoRAs.
> [!TIP]
> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide.
> To learn more about how to load LoRA weights, see the [LoRA](../../tutorials/using_peft_for_inference) loading guide.
## SD3Transformer2DLoadersMixin

View File

@@ -17,7 +17,7 @@ Some training methods - like LoRA and Custom Diffusion - typically target the UN
The [`UNet2DConditionLoadersMixin`] class provides functions for loading and saving weights, fusing and unfusing LoRAs, disabling and enabling LoRAs, and setting and deleting adapters.
> [!TIP]
> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide.
> To learn more about how to load LoRA weights, see the [LoRA](../../tutorials/using_peft_for_inference) guide.
## UNet2DConditionLoadersMixin

View File

@@ -418,7 +418,7 @@ When unloading the Control LoRA weights, call `pipe.unload_lora_weights(reset_to
## IP-Adapter
> [!TIP]
> Check out [IP-Adapter](../../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work.
> Check out [IP-Adapter](../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work.
An IP-Adapter lets you prompt Flux with images, in addition to the text prompt. This is especially useful when describing complex concepts that are difficult to articulate through text alone and you have reference images.
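A rough sketch of that image-prompting workflow; the adapter repository, weight file name, image encoder ID, and sampling settings below are assumptions based on the public XLabs Flux IP-Adapter release rather than code from this guide:

```python
import torch
from diffusers import FluxPipeline
from diffusers.utils import load_image

pipeline = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Attach an IP-Adapter so a reference image can steer generation alongside the text prompt.
pipeline.load_ip_adapter(
    "XLabs-AI/flux-ip-adapter",            # assumed adapter repository
    weight_name="ip_adapter.safetensors",  # assumed weight file name
    image_encoder_pretrained_model_name_or_path="openai/clip-vit-large-patch14",
)
pipeline.set_ip_adapter_scale(1.0)

reference = load_image("https://example.com/reference.png")  # placeholder URL, substitute a real image
image = pipeline(
    prompt="a cat wearing the same outfit as in the reference image",
    ip_adapter_image=reference,
    guidance_scale=3.5,
    num_inference_steps=28,
).images[0]
```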

View File

@@ -21,7 +21,7 @@
## Available models
The following models are available for the [`HiDreamImagePipeline`](text-to-image) pipeline:
The following models are available for the [`HiDreamImagePipeline`] pipeline:
| Model name | Description |
|:---|:---|

View File

@@ -254,8 +254,8 @@ export_to_video(video, "output.mp4", fps=24)
pipeline.vae.enable_tiling()
def round_to_nearest_resolution_acceptable_by_vae(height, width):
height = height - (height % pipeline.vae_temporal_compression_ratio)
width = width - (width % pipeline.vae_temporal_compression_ratio)
height = height - (height % pipeline.vae_spatial_compression_ratio)
width = width - (width % pipeline.vae_spatial_compression_ratio)
return height, width
prompt = """
@@ -325,6 +325,95 @@ export_to_video(video, "output.mp4", fps=24)
</details>
- LTX-Video 0.9.8 distilled model is similar to the 0.9.7 variant. It is guidance and timestep-distilled, and similar inference code can be used as above. An improvement of this version is that it supports generating very long videos. Additionally, it supports using tone mapping to improve the quality of the generated video using the `tone_map_compression_ratio` parameter. The default value of `0.6` is recommended.
<details>
<summary>Show example code</summary>
```python
import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.pipelines.ltx.modeling_latent_upsampler import LTXLatentUpsamplerModel
from diffusers.utils import export_to_video, load_video
pipeline = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.8-13B-distilled", torch_dtype=torch.bfloat16)
# TODO: Update the checkpoint here once updated in LTX org
upsampler = LTXLatentUpsamplerModel.from_pretrained("a-r-r-o-w/LTX-0.9.8-Latent-Upsampler", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline(vae=pipeline.vae, latent_upsampler=upsampler).to(torch.bfloat16)
pipeline.to("cuda")
pipe_upsample.to("cuda")
pipeline.vae.enable_tiling()
def round_to_nearest_resolution_acceptable_by_vae(height, width):
height = height - (height % pipeline.vae_spatial_compression_ratio)
width = width - (width % pipeline.vae_spatial_compression_ratio)
return height, width
prompt = """The camera pans over a snow-covered mountain range, revealing a vast expanse of snow-capped peaks and valleys.The mountains are covered in a thick layer of snow, with some areas appearing almost white while others have a slightly darker, almost grayish hue. The peaks are jagged and irregular, with some rising sharply into the sky while others are more rounded. The valleys are deep and narrow, with steep slopes that are also covered in snow. The trees in the foreground are mostly bare, with only a few leaves remaining on their branches. The sky is overcast, with thick clouds obscuring the sun. The overall impression is one of peace and tranquility, with the snow-covered mountains standing as a testament to the power and beauty of nature."""
# prompt = """A woman walks away from a white Jeep parked on a city street at night, then ascends a staircase and knocks on a door. The woman, wearing a dark jacket and jeans, walks away from the Jeep parked on the left side of the street, her back to the camera; she walks at a steady pace, her arms swinging slightly by her sides; the street is dimly lit, with streetlights casting pools of light on the wet pavement; a man in a dark jacket and jeans walks past the Jeep in the opposite direction; the camera follows the woman from behind as she walks up a set of stairs towards a building with a green door; she reaches the top of the stairs and turns left, continuing to walk towards the building; she reaches the door and knocks on it with her right hand; the camera remains stationary, focused on the doorway; the scene is captured in real-life footage."""
negative_prompt = "bright colors, symbols, graffiti, watermarks, worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 480, 832
downscale_factor = 2 / 3
# num_frames = 161
num_frames = 361
# 1. Generate video at smaller resolution
downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
latents = pipeline(
prompt=prompt,
negative_prompt=negative_prompt,
width=downscaled_width,
height=downscaled_height,
num_frames=num_frames,
timesteps=[1000, 993, 987, 981, 975, 909, 725, 0.03],
decode_timestep=0.05,
decode_noise_scale=0.025,
image_cond_noise_scale=0.0,
guidance_scale=1.0,
guidance_rescale=0.7,
generator=torch.Generator().manual_seed(0),
output_type="latent",
).frames
# 2. Upscale generated video using latent upsampler with fewer inference steps
# The available latent upsampler upscales the height/width by 2x
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
latents=latents,
adain_factor=1.0,
tone_map_compression_ratio=0.6,
output_type="latent"
).frames
# 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
video = pipeline(
prompt=prompt,
negative_prompt=negative_prompt,
width=upscaled_width,
height=upscaled_height,
num_frames=num_frames,
denoise_strength=0.999, # Effectively, 4 inference steps out of 5
timesteps=[1000, 909, 725, 421, 0],
latents=upscaled_latents,
decode_timestep=0.05,
decode_noise_scale=0.025,
image_cond_noise_scale=0.0,
guidance_scale=1.0,
guidance_rescale=0.7,
generator=torch.Generator().manual_seed(0),
output_type="pil",
).frames[0]
# 4. Downscale the video to the expected resolution
video = [frame.resize((expected_width, expected_height)) for frame in video]
export_to_video(video, "output.mp4", fps=24)
```
</details>
- LTX-Video supports LoRAs with [`~loaders.LTXVideoLoraLoaderMixin.load_lora_weights`].
<details>
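A minimal, self-contained sketch of the pattern. The base checkpoint is one of the public LTX-Video releases, and the adapter repository id is a placeholder, so substitute a LoRA actually trained for LTX-Video.

```py
import torch
from diffusers import LTXConditionPipeline
from diffusers.utils import export_to_video

pipeline = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipeline.to("cuda")

# Placeholder repository id; replace with a LoRA adapter trained for LTX-Video.
pipeline.load_lora_weights("username/ltxv-style-lora", adapter_name="style")
pipeline.set_adapters(["style"], adapter_weights=[0.9])

video = pipeline(
    prompt="A cute cat lounges on a leaf in a pool during a peaceful summer afternoon",
    negative_prompt="worst quality, inconsistent motion, blurry, jittery, distorted",
    width=768,
    height=512,
    num_frames=97,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
).frames[0]
export_to_video(video, "lora_output.mp4", fps=24)
```
</details>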

View File

@@ -75,7 +75,7 @@ The following is a summary of the recommended checkpoints, all of which produce
| [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1) | Depth | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference. |
| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1. |
| [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity. |
| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image &nbsp\\(I\\)&nbsp is comprised of Albedo &nbsp\\(A\\), Diffuse shading &nbsp\\(S\\), and Non-diffuse residual &nbsp\\(R\\): &nbsp\\(I = A*S+R\\). |
| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image $I$ is comprised of Albedo $A$, Diffuse shading $S$, and Non-diffuse residual $R$: $I = A*S+R$. |
> [!TIP]
> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff

View File

@@ -32,7 +32,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
| [Attend-and-Excite](attend_and_excite) | text2image |
| [AudioLDM](audioldm) | text2audio |
| [AudioLDM2](audioldm2) | text2audio |
| [AuraFlow](auraflow) | text2image |
| [AuraFlow](aura_flow) | text2image |
| [BLIP Diffusion](blip_diffusion) | text2image |
| [Bria 3.2](bria_3_2) | text2image |
| [CogVideoX](cogvideox) | text2video |

View File

@@ -109,7 +109,7 @@ image_1 = load_image("https://huggingface.co/datasets/huggingface/documentation-
image_2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peng.png")
image = pipe(
image=[image_1, image_2],
prompt="put the penguin and the cat at a game show called "Qwen Edit Plus Games"",
prompt='''put the penguin and the cat at a game show called "Qwen Edit Plus Games"''',
num_inference_steps=50
).images[0]
```

View File

@@ -271,7 +271,7 @@ Check out the full script [here](https://gist.github.com/sayakpaul/508d89d7aad4f
Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.
Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`StableDiffusion3Pipeline`] for inference with bitsandbytes.
Refer to the [Quantization](../../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`StableDiffusion3Pipeline`] for inference with bitsandbytes.
```py
import torch
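# Illustrative continuation (an assumption, not prescribed by this guide): quantize the
# SD3 transformer to 4-bit with bitsandbytes, then assemble the pipeline around it.
from diffusers import BitsAndBytesConfig, SD3Transformer2DModel, StableDiffusion3Pipeline

quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
transformer = SD3Transformer2DModel.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
pipeline = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
).to("cuda")
image = pipeline("a photo of an astronaut riding a horse on mars").images[0]
```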

View File

@@ -29,7 +29,7 @@ The abstract from the paper is:
Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
Check out the [Text or image-to-video](../../../using-diffusers/text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
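A minimal sketch of what that looks like, assuming the public `stabilityai/stable-video-diffusion-img2vid-xt` checkpoint:

```py
import torch
from diffusers import StableVideoDiffusionPipeline

pipeline = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

# Split the UNet's feedforward layers into chunks instead of running them all at once.
pipeline.unet.enable_forward_chunking()
```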
## StableVideoDiffusionPipeline

View File

@@ -172,7 +172,7 @@ Here are some sample outputs:
Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
Check out the [Text or image-to-video](../../using-diffusers/text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
> [!TIP]
> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

View File

@@ -26,6 +26,10 @@ Utility and helper functions for working with 🤗 Diffusers.
[[autodoc]] utils.load_image
## load_video
[[autodoc]] utils.load_video
## export_to_gif
[[autodoc]] utils.export_to_gif

View File

@@ -81,6 +81,45 @@ with attention_backend("_flash_3_hub"):
> [!TIP]
> Most attention backends support `torch.compile` without graph breaks and can be used to further speed up inference.
## Checks
The attention dispatcher includes debugging checks that catch common errors before they cause problems.
1. Device checks verify that query, key, and value tensors live on the same device.
2. Data type checks confirm tensors have matching dtypes and use either bfloat16 or float16.
3. Shape checks validate tensor dimensions and prevent mixing attention masks with causal flags.
Enable these checks by setting the `DIFFUSERS_ATTN_CHECKS` environment variable. Checks add overhead to every attention operation, so they're disabled by default.
```bash
export DIFFUSERS_ATTN_CHECKS=yes
```
The checks now run before every attention operation.
```py
import torch
from diffusers.models.attention_dispatch import attention_backend, dispatch_attention_fn
query = torch.randn(1, 10, 8, 64, dtype=torch.bfloat16, device="cuda")
key = torch.randn(1, 10, 8, 64, dtype=torch.bfloat16, device="cuda")
value = torch.randn(1, 10, 8, 64, dtype=torch.bfloat16, device="cuda")
try:
with attention_backend("flash"):
output = dispatch_attention_fn(query, key, value)
print("✓ Flash Attention works with checks enabled")
except Exception as e:
print(f"✗ Flash Attention failed: {e}")
```
You can also configure the registry directly.
```py
from diffusers.models.attention_dispatch import _AttentionBackendRegistry
_AttentionBackendRegistry._checks_enabled = True
```
## Available backends
Refer to the table below for a complete list of available attention backends and their variants.

View File

@@ -548,4 +548,4 @@ Training the DeepFloyd IF model can be challenging, but here are some tips that
Congratulations on training your DreamBooth model! To learn more about how to use your new model, the following guide may be helpful:
- Learn how to [load a DreamBooth](../using-diffusers/loading_adapters) model for inference if you trained your model with LoRA.
- Learn how to [load a DreamBooth](../using-diffusers/dreambooth) model for inference if you trained your model with LoRA.

View File

@@ -75,7 +75,7 @@ accelerate launch train_lcm_distill_sd_wds.py \
Most of the parameters are identical to the parameters in the [Text-to-image](text2image#script-parameters) training guide, so you'll focus on the parameters that are relevant to latent consistency distillation in this guide.
- `--pretrained_teacher_model`: the path to a pretrained latent diffusion model to use as the teacher model
- `--pretrained_vae_model_name_or_path`: path to a pretrained VAE; the SDXL VAE is known to suffer from numerical instability, so this parameter allows you to specify an alternative VAE (like this [VAE]((https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)) by madebyollin which works in fp16)
- `--pretrained_vae_model_name_or_path`: path to a pretrained VAE; the SDXL VAE is known to suffer from numerical instability, so this parameter allows you to specify an alternative VAE (like this [VAE](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)) by madebyollin which works in fp16)
- `--w_min` and `--w_max`: the minimum and maximum guidance scale values for guidance scale sampling
- `--num_ddim_timesteps`: the number of timesteps for DDIM sampling
- `--loss_type`: the type of loss (L2 or Huber) to calculate for latent consistency distillation; Huber loss is generally preferred because it's more robust to outliers
@@ -245,5 +245,5 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl
Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:
- Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
- Learn how to use [LCMs for inference](../using-diffusers/inference_with_lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.

View File

@@ -198,5 +198,5 @@ image = pipeline("A naruto with blue eyes").images[0]
Congratulations on training a new model with LoRA! To learn more about how to use your new model, the following guides may be helpful:
- Learn how to [load different LoRA formats](../using-diffusers/loading_adapters#LoRA) trained using community trainers like Kohya and TheLastBen.
- Learn how to [load different LoRA formats](../tutorials/using_peft_for_inference) trained using community trainers like Kohya and TheLastBen.
- Learn how to use and [combine multiple LoRA's](../tutorials/using_peft_for_inference) with PEFT for inference.

View File

@@ -178,5 +178,5 @@ image.save("yoda-naruto.png")
Congratulations on training your own text-to-image model! To learn more about how to use your new model, the following guides may be helpful:
- Learn how to [load LoRA weights](../using-diffusers/loading_adapters#LoRA) for inference if you trained your model with LoRA.
- Learn how to [load LoRA weights](../tutorials/using_peft_for_inference) for inference if you trained your model with LoRA.
- Learn more about how certain parameters like guidance scale or techniques such as prompt weighting can help you control inference in the [Text-to-image](../using-diffusers/conditional_image_generation) task guide.

View File

@@ -203,5 +203,4 @@ image.save("cat-train.png")
Congratulations on training your own Textual Inversion model! 🎉 To learn more about how to use your new model, the following guides may be helpful:
- Learn how to [load Textual Inversion embeddings](../using-diffusers/loading_adapters) and also use them as negative embeddings.
- Learn how to use [Textual Inversion](textual_inversion_inference) for inference with Stable Diffusion 1/2 and Stable Diffusion XL.
- Learn how to [load Textual Inversion embeddings](../using-diffusers/textual_inversion_inference) and also use them as negative embeddings.

View File

@@ -16,24 +16,24 @@ Batch inference processes multiple prompts at a time to increase throughput. It
The downside is increased latency because you must wait for the entire batch to complete, and more GPU memory is required for large batches.
<hfoptions id="usage">
<hfoption id="text-to-image">
For text-to-image, pass a list of prompts to the pipeline.
For text-to-image, pass a list of prompts to the pipeline and for image-to-image, pass a list of images and prompts to the pipeline. The example below demonstrates batched text-to-image inference.
```py
import torch
import matplotlib.pyplot as plt
from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
torch_dtype=torch.float16,
device_map="cuda"
)
prompts = [
"cinematic photo of A beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
"Cinematic shot of a cozy coffee shop interior, warm pastel light streaming through a window where a cat rests. Shallow depth of field, glowing cups in soft focus, dreamy lofi-inspired mood, nostalgic tones, framed like a quiet film scene.",
"Polaroid-style photograph of a cozy coffee shop interior, bathed in warm pastel light. A cat sits on the windowsill near steaming mugs. Soft, slightly faded tones and dreamy blur evoke nostalgia, a lofi mood, and the intimate, imperfect charm of instant film.",
"Soft watercolor illustration of a cozy coffee shop interior, pastel washes of color filling the space. A cat rests peacefully on the windowsill as warm light glows through. Gentle brushstrokes create a dreamy, lofi-inspired atmosphere with whimsical textures and nostalgic calm.",
"Isometric pixel-art illustration of a cozy coffee shop interior in detailed 8-bit style. Warm pastel light fills the space as a cat rests on the windowsill. Blocky furniture and tiny mugs add charm, low-res retro graphics enhance the nostalgic, lofi-inspired game aesthetic."
]
images = pipeline(
@@ -52,6 +52,10 @@ plt.tight_layout()
plt.show()
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/batch-inference.png"/>
</div>
To generate multiple variations of one prompt, use the `num_images_per_prompt` argument.
```py
@@ -61,11 +65,18 @@ from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
torch_dtype=torch.float16,
device_map="cuda"
)
prompt="""
Isometric pixel-art illustration of a cozy coffee shop interior in detailed 8-bit style. Warm pastel light fills the
space as a cat rests on the windowsill. Blocky furniture and tiny mugs add charm, low-res retro graphics enhance the
nostalgic, lofi-inspired game aesthetic.
"""
images = pipeline(
prompt="pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics",
prompt=prompt,
num_images_per_prompt=4
).images
@@ -81,6 +92,10 @@ plt.tight_layout()
plt.show()
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/batch-inference-2.png"/>
</div>
Combine both approaches to generate different variations of different prompts.
```py
@@ -89,7 +104,7 @@ images = pipeline(
num_images_per_prompt=2,
).images
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
fig, axes = plt.subplots(2, 4, figsize=(12, 12))
axes = axes.flatten()
for i, image in enumerate(images):
@@ -101,126 +116,18 @@ plt.tight_layout()
plt.show()
```
</hfoption>
<hfoption id="image-to-image">
For image-to-image, pass a list of input images and prompts to the pipeline.
```py
import torch
from diffusers.utils import load_image
from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
input_images = [
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"),
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"),
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
]
prompts = [
"cinematic photo of a beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
]
images = pipeline(
prompt=prompts,
image=input_images,
guidance_scale=8.0,
strength=0.5
).images
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()
for i, image in enumerate(images):
axes[i].imshow(image)
axes[i].set_title(f"Image {i+1}")
axes[i].axis('off')
plt.tight_layout()
plt.show()
```
To generate multiple variations of one prompt, use the `num_images_per_prompt` argument.
```py
import torch
import matplotlib.pyplot as plt
from diffusers.utils import load_image
from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
input_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
images = pipeline(
prompt="pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics",
image=input_image,
num_images_per_prompt=4
).images
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()
for i, image in enumerate(images):
axes[i].imshow(image)
axes[i].set_title(f"Image {i+1}")
axes[i].axis('off')
plt.tight_layout()
plt.show()
```
Combine both approaches to generate different variations of different prompts.
```py
input_images = [
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"),
load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png")
]
prompts = [
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
]
images = pipeline(
prompt=prompts,
image=input_images,
num_images_per_prompt=2,
).images
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()
for i, image in enumerate(images):
axes[i].imshow(image)
axes[i].set_title(f"Image {i+1}")
axes[i].axis('off')
plt.tight_layout()
plt.show()
```
</hfoption>
</hfoptions>
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/batch-inference-3.png"/>
</div>
## Deterministic generation
Enable reproducible batch generation by passing a list of [Generators](https://pytorch.org/docs/stable/generated/torch.Generator.html) to the pipeline and tying each `Generator` to a seed so you can reuse it.
Use a list comprehension to iterate over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch.
> [!TIP]
> Refer to the [Reproducibility](./reusing_seeds) docs to learn more about deterministic algorithms and the `Generator` object.
Don't multiply the `Generator` by the batch size because that only creates one `Generator` object that is used sequentially for each image in the batch.
Use a list comprehension to iterate over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch. Don't multiply the `Generator` by the batch size because that only creates one `Generator` object that is used sequentially for each image in the batch.
```py
generator = [torch.Generator(device="cuda").manual_seed(0)] * 3
@@ -234,14 +141,16 @@ from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16
).to("cuda")
torch_dtype=torch.float16,
device_map="cuda"
)
generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(3)]
prompts = [
"cinematic photo of A beautiful sunset over mountains, 35mm photograph, film, professional, 4k, highly detailed",
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"pixel-art a cozy coffee shop interior, low-res, blocky, pixel art style, 8-bit graphics"
"Cinematic shot of a cozy coffee shop interior, warm pastel light streaming through a window where a cat rests. Shallow depth of field, glowing cups in soft focus, dreamy lofi-inspired mood, nostalgic tones, framed like a quiet film scene.",
"Polaroid-style photograph of a cozy coffee shop interior, bathed in warm pastel light. A cat sits on the windowsill near steaming mugs. Soft, slightly faded tones and dreamy blur evoke nostalgia, a lofi mood, and the intimate, imperfect charm of instant film.",
"Soft watercolor illustration of a cozy coffee shop interior, pastel washes of color filling the space. A cat rests peacefully on the windowsill as warm light glows through. Gentle brushstrokes create a dreamy, lofi-inspired atmosphere with whimsical textures and nostalgic calm.",
"Isometric pixel-art illustration of a cozy coffee shop interior in detailed 8-bit style. Warm pastel light fills the space as a cat rests on the windowsill. Blocky furniture and tiny mugs add charm, low-res retro graphics enhance the nostalgic, lofi-inspired game aesthetic."
]
images = pipeline(
@@ -261,4 +170,4 @@ plt.tight_layout()
plt.show()
```
You can use this to iteratively select an image associated with a seed and then improve on it by crafting a more detailed prompt.
You can use this to select an image associated with a seed and iteratively improve on it by crafting a more detailed prompt.

View File

@@ -70,32 +70,6 @@ For convenience, we provide a table to denote which methods are inference-only a
[InstructPix2Pix](../api/pipelines/pix2pix) is fine-tuned from Stable Diffusion to support editing input images. It takes as inputs an image and a prompt describing an edit, and it outputs the edited image.
InstructPix2Pix has been explicitly trained to work well with [InstructGPT](https://openai.com/blog/instruction-following/)-like prompts.
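As a rough sketch of that interface (the checkpoint, image, and settings below are illustrative choices rather than values taken from this page):

```py
import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.utils import load_image

pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png")

# The prompt is the edit instruction; image_guidance_scale controls how closely
# the output sticks to the input image.
edited = pipeline(
    "turn the cat into a tiger",
    image=image,
    num_inference_steps=20,
    image_guidance_scale=1.5,
).images[0]
```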
## Pix2Pix Zero
[Paper](https://huggingface.co/papers/2302.03027)
[Pix2Pix Zero](../api/pipelines/pix2pix_zero) allows modifying an image so that one concept or subject is translated to another one while preserving general image semantics.
The denoising process is guided from one conceptual embedding towards another conceptual embedding. The intermediate latents are optimized during the denoising process to push the attention maps towards reference attention maps. The reference attention maps are from the denoising process of the input image and are used to encourage semantic preservation.
Pix2Pix Zero can be used both to edit synthetic images as well as real images.
- To edit synthetic images, one first generates an image given a caption.
Next, we generate image captions for the concept that shall be edited and for the new target concept. We can use a model like [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) for this purpose. Then, "mean" prompt embeddings for both the source and target concepts are created via the text encoder. Finally, the pix2pix-zero algorithm is used to edit the synthetic image.
- To edit a real image, one first generates an image caption using a model like [BLIP](https://huggingface.co/docs/transformers/model_doc/blip). Then one applies DDIM inversion on the prompt and image to generate "inverse" latents. Similar to before, "mean" prompt embeddings for both source and target concepts are created and finally the pix2pix-zero algorithm in combination with the "inverse" latents is used to edit the image.
> [!TIP]
> Pix2Pix Zero is the first model that allows "zero-shot" image editing. This means that the model
> can edit an image in less than a minute on a consumer GPU as shown [here](../api/pipelines/pix2pix_zero#usage-example).
As mentioned above, Pix2Pix Zero includes optimizing the latents (and not any of the UNet, VAE, or the text encoder) to steer the generation toward a specific concept. This means that the overall
pipeline might require more memory than a standard [StableDiffusionPipeline](../api/pipelines/stable_diffusion/text2img).
> [!TIP]
> An important distinction between methods like InstructPix2Pix and Pix2Pix Zero is that the former
> involves fine-tuning the pre-trained weights while the latter does not. This means that you can
> apply Pix2Pix Zero to any of the available Stable Diffusion models.
## Attend and Excite
[Paper](https://huggingface.co/papers/2301.13826)
@@ -178,14 +152,6 @@ multi-concept training by design. Like DreamBooth and Textual Inversion, Custom
teach a pre-trained text-to-image diffusion model about new concepts to generate outputs involving the
concept(s) of interest.
## Model Editing
[Paper](https://huggingface.co/papers/2303.08084)
The [text-to-image model editing pipeline](../api/pipelines/model_editing) helps you mitigate some of the incorrect implicit assumptions a pre-trained text-to-image
diffusion model might make about the subjects present in the input prompt. For example, if you prompt Stable Diffusion to generate images for "A pack of roses", the roses in the generated images
are more likely to be red. This pipeline helps you change that assumption.
## DiffEdit
[Paper](https://huggingface.co/papers/2210.11427)

View File

@@ -257,7 +257,7 @@ LCMs are compatible with adapters like LoRA, ControlNet, T2I-Adapter, and Animat
### LoRA
[LoRA](../using-diffusers/loading_adapters#lora) adapters can be rapidly finetuned to learn a new style from just a few images and plugged into a pretrained model to generate images in that style.
[LoRA](../tutorials/using_peft_for_inference) adapters can be rapidly finetuned to learn a new style from just a few images and plugged into a pretrained model to generate images in that style.
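A minimal sketch of that pattern with the publicly available `latent-consistency/lcm-lora-sdxl` adapter (the prompt and step count are illustrative):

```py
import torch
from diffusers import DiffusionPipeline, LCMScheduler

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16"
).to("cuda")
pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)

# Plug in the LCM-LoRA so the base model can generate in just a few steps.
pipeline.load_lora_weights("latent-consistency/lcm-lora-sdxl")

image = pipeline(
    "close-up photography of an old man standing in the rain at night, in a street lit by lamps",
    num_inference_steps=4,
    guidance_scale=1.0,
).images[0]
```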
<hfoptions id="lcm-lora">
<hfoption id="LCM">

View File

@@ -18,7 +18,7 @@ Trajectory Consistency Distillation (TCD) enables a model to generate higher qua
The major advantages of TCD are:
- Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of [DPM-Solver++(2S)](../../api/schedulers/multistep_dpm_solver) with Stable Diffusion XL (SDXL). There is no additional discriminator or LPIPS supervision included during TCD training.
- Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of [DPM-Solver++(2S)](../api/schedulers/multistep_dpm_solver) with Stable Diffusion XL (SDXL). There is no additional discriminator or LPIPS supervision included during TCD training.
- Flexible Inference Steps: The inference steps for TCD sampling can be freely adjusted without adversely affecting the image quality.
@@ -166,7 +166,7 @@ image = pipe(
TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
> [!TIP]
> Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
> Check out the [Merge LoRAs](../tutorials/using_peft_for_inference#merge) guide to learn more about efficient merging methods.
```python
import torch

View File

@@ -280,7 +280,7 @@ refiner = DiffusionPipeline.from_pretrained(
```
> [!WARNING]
> You can use SDXL refiner with a different base model. For example, you can use the [Hunyuan-DiT](../../api/pipelines/hunyuandit) or [PixArt-Sigma](../../api/pipelines/pixart_sigma) pipelines to generate images with better prompt adherence. Once you have generated an image, you can pass it to the SDXL refiner model to enhance final generation quality.
> You can use SDXL refiner with a different base model. For example, you can use the [Hunyuan-DiT](../api/pipelines/hunyuandit) or [PixArt-Sigma](../api/pipelines/pixart_sigma) pipelines to generate images with better prompt adherence. Once you have generated an image, you can pass it to the SDXL refiner model to enhance final generation quality.
Generate an image from the base model, and set the model output to **latent** space:
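A rough sketch of that handoff, assuming `base` and `refiner` are the SDXL pipelines set up earlier; the 40 steps and the 0.8 split point are illustrative values:

```py
prompt = "A majestic lion jumping from a big stone at night"

# Run the base model for the first portion of the schedule and keep the output as latents.
image = base(
    prompt=prompt,
    num_inference_steps=40,
    denoising_end=0.8,
    output_type="latent",
).images

# Hand the latents to the refiner, which finishes the remaining denoising steps.
image = refiner(
    prompt=prompt,
    num_inference_steps=40,
    denoising_start=0.8,
    image=image,
).images[0]
```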

View File

@@ -10,423 +10,96 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# Prompt techniques
[[open-in-colab]]
Prompts are important because they describe what you want a diffusion model to generate. The best prompts are detailed, specific, and well-structured to help the model realize your vision. But crafting a great prompt takes time and effort and sometimes it may not be enough because language and words can be imprecise. This is where you need to boost your prompt with other techniques, such as prompt enhancing and prompt weighting, to get the results you want.
# Prompting
This guide will show you how you can use these prompt techniques to generate high-quality images with lower effort and adjust the weight of certain keywords in a prompt.
Prompts describe what a model should generate. Good prompts are detailed, specific, and structured, and they produce better images and videos.
## Prompt engineering
This guide shows you how to write effective prompts and introduces techniques that make them stronger.
> [!TIP]
> This is not an exhaustive guide on prompt engineering, but it will help you understand the necessary parts of a good prompt. We encourage you to continue experimenting with different prompts and combine them in new ways to see what works best. As you write more prompts, you'll develop an intuition for what works and what doesn't!
## Writing good prompts
New diffusion models do a pretty good job of generating high-quality images from a basic prompt, but it is still important to create a well-written prompt to get the best results. Here are a few tips for writing a good prompt:
Every effective prompt needs three core elements.
1. What is the image *medium*? Is it a photo, a painting, a 3D illustration, or something else?
2. What is the image *subject*? Is it a person, animal, object, or scene?
3. What *details* would you like to see in the image? This is where you can get really creative and have a lot of fun experimenting with different words to bring your image to life. For example, what is the lighting like? What is the vibe and aesthetic? What kind of art or illustration style are you looking for? The more specific and precise words you use, the better the model will understand what you want to generate.
1. <span class="underline decoration-sky-500 decoration-2 underline-offset-4">Subject</span> - what you want to generate. Start your prompt here.
2. <span class="underline decoration-pink-500 decoration-2 underline-offset-4">Style</span> - the medium or aesthetic. How should it look?
3. <span class="underline decoration-green-500 decoration-2 underline-offset-4">Context</span> - details about actions, setting, and mood.
Use these elements as a structured narrative, not a keyword list. Modern models understand language better than keyword matching. Start simple, then add details.
Context is especially important for creating better prompts. Try adding lighting, artistic details, and mood.
<div class="flex gap-4">
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/plain-prompt.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"A photo of a banana-shaped couch in a living room"</figcaption>
<div class="flex-1 text-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ok-prompt.png" class="w-full h-auto object-cover rounded-lg">
<figcaption class="mt-2 text-sm text-gray-500">A <span class="underline decoration-sky-500 decoration-2 underline-offset-1">cute cat</span> <span class="underline decoration-pink-500 decoration-2 underline-offset-1">lounges on a leaf in a pool during a peaceful summer afternoon</span>, in <span class="underline decoration-green-500 decoration-2 underline-offset-1">lofi art style, illustration</span>.</figcaption>
</div>
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"A vibrant yellow banana-shaped couch sits in a cozy living room, its curve cradling a pile of colorful cushions. on the wooden floor, a patterned rug adds a touch of eclectic charm, and a potted plant sits in the corner, reaching towards the sunlight filtering through the windows"</figcaption>
<div class="flex-1 text-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/better-prompt.png" class="w-full h-auto object-cover rounded-lg"/>
<figcaption class="mt-2 text-sm text-gray-500">A cute cat lounges on a floating leaf in a sparkling pool during a peaceful summer afternoon. Clear reflections ripple across the water, with sunlight casting soft, smooth highlights. The illustration is detailed and polished, with elegant lines and harmonious colors, evoking a relaxing, serene, and whimsical lofi mood, anime-inspired and visually comforting.</figcaption>
</div>
</div>
## Prompt enhancing with GPT2
Prompt enhancing is a technique for quickly improving prompt quality without spending too much effort constructing one. It uses a model like GPT2 pretrained on Stable Diffusion text prompts to automatically enrich a prompt with additional important keywords to generate high-quality images.
The technique works by curating a list of specific keywords and forcing the model to generate those words to enhance the original prompt. This way, your prompt can be "a cat" and GPT2 can enhance the prompt to "cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic".
Be specific and add context. Use photography terms like lens type, focal length, camera angles, and depth of field.
> [!TIP]
> You should also use a [*offset noise*](https://www.crosslabs.org//blog/diffusion-with-offset-noise) LoRA to improve the contrast in bright and dark images and create better lighting overall. This [LoRA](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_offset_example-lora_1.0.safetensors) is available from [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0).
Start by defining certain styles and a list of words (you can check out a more comprehensive list of [words](https://hf.co/LykosAI/GPT-Prompt-Expansion-Fooocus-v2/blob/main/positive.txt) and [styles](https://github.com/lllyasviel/Fooocus/tree/main/sdxl_styles) used by Fooocus) to enhance a prompt with.
```py
import torch
from transformers import GenerationConfig, GPT2LMHeadModel, GPT2Tokenizer, LogitsProcessor, LogitsProcessorList
from diffusers import StableDiffusionXLPipeline
styles = {
"cinematic": "cinematic film still of {prompt}, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
"anime": "anime artwork of {prompt}, anime style, key visual, vibrant, studio anime, highly detailed",
"photographic": "cinematic photo of {prompt}, 35mm photograph, film, professional, 4k, highly detailed",
"comic": "comic of {prompt}, graphic illustration, comic art, graphic novel art, vibrant, highly detailed",
"lineart": "line art drawing {prompt}, professional, sleek, modern, minimalist, graphic, line art, vector graphics",
"pixelart": " pixel-art {prompt}, low-res, blocky, pixel art style, 8-bit graphics",
}
words = [
"aesthetic", "astonishing", "beautiful", "breathtaking", "composition", "contrasted", "epic", "moody", "enhanced",
"exceptional", "fascinating", "flawless", "glamorous", "glorious", "illumination", "impressive", "improved",
"inspirational", "magnificent", "majestic", "hyperrealistic", "smooth", "sharp", "focus", "stunning", "detailed",
"intricate", "dramatic", "high", "quality", "perfect", "light", "ultra", "highly", "radiant", "satisfying",
"soothing", "sophisticated", "stylish", "sublime", "terrific", "touching", "timeless", "wonderful", "unbelievable",
"elegant", "awesome", "amazing", "dynamic", "trendy",
]
```
You may have noticed in the `words` list, there are certain words that can be paired together to create something more meaningful. For example, the words "high" and "quality" can be combined to create "high quality". Let's pair these words together and remove the words that can't be paired.
```py
word_pairs = ["highly detailed", "high quality", "enhanced quality", "perfect composition", "dynamic light"]
def find_and_order_pairs(s, pairs):
words = s.split()
found_pairs = []
for pair in pairs:
pair_words = pair.split()
if pair_words[0] in words and pair_words[1] in words:
found_pairs.append(pair)
words.remove(pair_words[0])
words.remove(pair_words[1])
for word in words[:]:
for pair in pairs:
if word in pair.split():
words.remove(word)
break
ordered_pairs = ", ".join(found_pairs)
remaining_s = ", ".join(words)
return ordered_pairs, remaining_s
```
Next, implement a custom [`~transformers.LogitsProcessor`] class that assigns tokens in the `words` list a value of 0 and assigns tokens not in the `words` list a negative value so they aren't picked during generation. This way, generation is biased towards words in the `words` list. After a word from the list is used, it is also assigned a negative value so it isn't picked again.
```py
class CustomLogitsProcessor(LogitsProcessor):
def __init__(self, bias):
super().__init__()
self.bias = bias
def __call__(self, input_ids, scores):
if len(input_ids.shape) == 2:
last_token_id = input_ids[0, -1]
self.bias[last_token_id] = -1e10
return scores + self.bias
word_ids = [tokenizer.encode(word, add_prefix_space=True)[0] for word in words]
bias = torch.full((tokenizer.vocab_size,), -float("Inf")).to("cuda")
bias[word_ids] = 0
processor = CustomLogitsProcessor(bias)
processor_list = LogitsProcessorList([processor])
```
Combine the prompt and the `cinematic` style prompt defined in the `styles` dictionary earlier.
```py
prompt = "a cat basking in the sun on a roof in Turkey"
style = "cinematic"
prompt = styles[style].format(prompt=prompt)
prompt
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"
```
Load a GPT2 tokenizer and model from the [Gustavosta/MagicPrompt-Stable-Diffusion](https://huggingface.co/Gustavosta/MagicPrompt-Stable-Diffusion) checkpoint (this specific checkpoint is trained to generate prompts) to enhance the prompt.
```py
tokenizer = GPT2Tokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
model = GPT2LMHeadModel.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion", torch_dtype=torch.float16).to(
"cuda"
)
model.eval()
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
token_count = inputs["input_ids"].shape[1]
max_new_tokens = 50 - token_count
generation_config = GenerationConfig(
penalty_alpha=0.7,
top_k=50,
eos_token_id=model.config.eos_token_id,
pad_token_id=model.config.eos_token_id,
pad_token=model.config.pad_token_id,
do_sample=True,
)
with torch.no_grad():
generated_ids = model.generate(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
max_new_tokens=max_new_tokens,
generation_config=generation_config,
logits_processor=processor_list,
)
```
Then you can combine the input prompt and the generated prompt. Feel free to take a look at what the generated prompt (`generated_part`) is, the word pairs that were found (`pairs`), and the remaining words (`words`). This is all packed together in the `enhanced_prompt`.
```py
output_tokens = [tokenizer.decode(generated_id, skip_special_tokens=True) for generated_id in generated_ids]
input_part, generated_part = output_tokens[0][: len(prompt)], output_tokens[0][len(prompt) :]
pairs, words = find_and_order_pairs(generated_part, word_pairs)
formatted_generated_part = pairs + ", " + words
enhanced_prompt = input_part + ", " + formatted_generated_part
enhanced_prompt
["cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic"]
```
Finally, load a pipeline and the offset noise LoRA with a *low weight* to generate an image with the enhanced prompt.
```py
pipeline = StableDiffusionXLPipeline.from_pretrained(
"RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, variant="fp16"
).to("cuda")
pipeline.load_lora_weights(
"stabilityai/stable-diffusion-xl-base-1.0",
weight_name="sd_xl_offset_example-lora_1.0.safetensors",
adapter_name="offset",
)
pipeline.set_adapters(["offset"], adapter_weights=[0.2])
image = pipeline(
enhanced_prompt,
width=1152,
height=896,
guidance_scale=7.5,
num_inference_steps=25,
).images[0]
image
```
<div class="flex gap-4">
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"a cat basking in the sun on a roof in Turkey"</figcaption>
</div>
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/enhanced-prompt.png"/>
<figcaption class="mt-2 text-center text-sm text-gray-500">"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"</figcaption>
</div>
</div>
> Try a [prompt enhancer](https://huggingface.co/models?sort=downloads&search=prompt+enhancer) to help improve your prompt structure.
## Prompt weighting
Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which gets turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).
Prompt weighting makes some words stronger and others weaker. It scales attention scores so you control how much influence each concept has.
Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt embeddings is to use [Stable Diffusion Long Prompt Weighted Embedding](https://github.com/xhinker/sd_embed) (sd_embed). Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [negative_prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
Diffusers handles this through `prompt_embeds` and `pooled_prompt_embeds` arguments which take scaled text embedding vectors. Use the [sd_embed](https://github.com/xhinker/sd_embed) library to generate these embeddings. It also supports longer prompts.
> [!TIP]
> If your favorite pipeline doesn't have a `prompt_embeds` parameter, please open an [issue](https://github.com/huggingface/diffusers/issues/new/choose) so we can add it!
This guide will show you how to weight your prompts with sd_embed.
Before you begin, make sure you have the latest version of sd_embed installed:
```bash
pip install git+https://github.com/xhinker/sd_embed.git@main
```
For this example, let's use [`StableDiffusionXLPipeline`].
> [!NOTE]
> The sd_embed library only supports Stable Diffusion, Stable Diffusion XL, Stable Diffusion 3, Stable Cascade, and Flux. Prompt weighting doesn't necessarily help for newer models like Flux which already has very good prompt adherence.
```py
from diffusers import StableDiffusionXLPipeline, UniPCMultistepScheduler
import torch
pipe = StableDiffusionXLPipeline.from_pretrained("Lykon/dreamshaper-xl-1-0", torch_dtype=torch.float16)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
!uv pip install git+https://github.com/xhinker/sd_embed.git@main
```
To upweight or downweight a concept, surround the text with parentheses. More parentheses apply a heavier weight to the text. You can also append a numerical multiplier to the text to indicate how much you want to increase or decrease its weight.
Format weighted text with numerical multipliers or parentheses. More parentheses mean stronger weighting.
| format | multiplier |
|---|---|
| `(hippo)` | increase by 1.1x |
| `((hippo))` | increase by 1.21x |
| `(hippo:1.5)` | increase by 1.5x |
| `(hippo:0.5)` | decrease by 4x |
| `(cat)` | increase by 1.1x |
| `((cat))` | increase by 1.21x |
| `(cat:1.5)` | increase by 1.5x |
| `(cat:0.5)` | decrease by 4x |
Create a prompt and use a combination of parentheses and numerical multipliers to upweight various text.
Create a weighted prompt and pass it to [get_weighted_text_embeddings_sdxl](https://github.com/xhinker/sd_embed/blob/4a47f71150a22942fa606fb741a1c971d95ba56f/src/sd_embed/embedding_funcs.py#L405) to generate embeddings.
> [!TIP]
> You could also pass negative prompts to `negative_prompt_embeds` and `negative_pooled_prompt_embeds`.
```py
import torch
from diffusers import DiffusionPipeline
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sdxl
prompt = """A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
```
Use the `get_weighted_text_embeddings_sdxl` function to generate the prompt embeddings and the negative prompt embeddings. It'll also generate the pooled and negative pooled prompt embeddings since you're using the SDXL model.
> [!TIP]
> You can safely ignore the error message below about the token index length exceeding the model's maximum sequence length. All your tokens will be used in the embedding process.
>
> ```
> Token indices sequence length is longer than the specified maximum sequence length for this model
> ```
```py
(
prompt_embeds,
prompt_neg_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds
) = get_weighted_text_embeddings_sdxl(
pipe,
prompt=prompt,
neg_prompt=neg_prompt
pipeline = DiffusionPipeline.from_pretrained(
"Lykon/dreamshaper-xl-1-0", torch_dtype=torch.bfloat16, device_map="cuda"
)
image = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=prompt_neg_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
num_inference_steps=30,
height=1024,
width=1024 + 512,
guidance_scale=4.0,
generator=torch.Generator("cuda").manual_seed(2)
).images[0]
image
prompt = """
A (cute cat:1.4) lounges on a (floating leaf:1.2) in a (sparkling pool:1.1) during a peaceful summer afternoon.
Gentle ripples reflect pastel skies, while (sunlight:1.1) casts soft highlights. The illustration is smooth and polished
with elegant, sketchy lines and subtle gradients, evoking a ((whimsical, nostalgic, dreamy lofi atmosphere:2.0)),
(anime-inspired:1.6), calming, comforting, and visually serene.
"""
prompt_embeds, _, pooled_prompt_embeds, *_ = get_weighted_text_embeddings_sdxl(pipeline, prompt=prompt)
```
Pass the embeddings to `prompt_embeds` and `pooled_prompt_embeds` to generate your image.
```py
image = pipeline(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds).images[0]
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_sdxl.png"/>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/prompt-embed-sdxl.png"/>
</div>
> [!TIP]
> Refer to the [sd_embed](https://github.com/xhinker/sd_embed) repository for additional details about long prompt weighting for FLUX.1, Stable Cascade, and Stable Diffusion 1.5.
### Textual inversion
[Textual inversion](../training/text_inversion) is a technique for learning a specific concept from some images which you can use to generate new images conditioned on that concept.
Create a pipeline and use the [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] function to load the textual inversion embeddings (feel free to browse the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer) for 100+ trained concepts):
```py
import torch
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5",
torch_dtype=torch.float16,
).to("cuda")
pipe.load_textual_inversion("sd-concepts-library/midjourney-style")
```
Add the `<midjourney-style>` text to the prompt to trigger the textual inversion.
```py
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
prompt = """<midjourney-style> A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
```
Use the `get_weighted_text_embeddings_sd15` function to generate the prompt embeddings and the negative prompt embeddings.
```py
(
prompt_embeds,
prompt_neg_embeds,
) = get_weighted_text_embeddings_sd15(
pipe,
prompt=prompt,
neg_prompt=neg_prompt
)
image = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=prompt_neg_embeds,
height=768,
width=896,
guidance_scale=4.0,
generator=torch.Generator("cuda").manual_seed(2)
).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_textual_inversion.png"/>
</div>
### DreamBooth
[DreamBooth](../training/dreambooth) is a technique for generating contextualized images of a subject given just a few images of the subject to train on. It is similar to textual inversion, but DreamBooth trains the full model whereas textual inversion only fine-tunes the text embeddings. This means you should use [`~DiffusionPipeline.from_pretrained`] to load the DreamBooth model (feel free to browse the [Stable Diffusion Dreambooth Concepts Library](https://huggingface.co/sd-dreambooth-library) for 100+ trained models):
```py
import torch
from diffusers import DiffusionPipeline, UniPCMultistepScheduler
pipe = DiffusionPipeline.from_pretrained("sd-dreambooth-library/dndcoverart-v1", torch_dtype=torch.float16).to("cuda")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
```
Depending on the model you use, you'll need to incorporate the model's unique identifier into your prompt. For example, the `dndcoverart-v1` model uses the identifier `dndcoverart`:
```py
from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15
prompt = """dndcoverart of A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus.
This imaginative creature features the distinctive, bulky body of a hippo,
but with a texture and appearance resembling a golden-brown, crispy waffle.
The creature might have elements like waffle squares across its skin and a syrup-like sheen.
It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting,
possibly including oversized utensils or plates in the background.
The image should evoke a sense of playful absurdity and culinary fantasy.
"""
neg_prompt = """\
skin spots,acnes,skin blemishes,age spot,(ugly:1.2),(duplicate:1.2),(morbid:1.21),(mutilated:1.2),\
(tranny:1.2),mutated hands,(poorly drawn hands:1.5),blurry,(bad anatomy:1.2),(bad proportions:1.3),\
extra limbs,(disfigured:1.2),(missing arms:1.2),(extra legs:1.2),(fused fingers:1.5),\
(too many fingers:1.5),(unclear eyes:1.2),lowers,bad hands,missing fingers,extra digit,\
bad hands,missing fingers,(extra arms and legs),(worst quality:2),(low quality:2),\
(normal quality:2),lowres,((monochrome)),((grayscale))
"""
(
    prompt_embeds,
    prompt_neg_embeds,
) = get_weighted_text_embeddings_sd15(
    pipe,
    prompt=prompt,
    neg_prompt=neg_prompt,
)
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd_embed_dreambooth.png"/>
</div>
Prompt weighting works with [Textual inversion](./textual_inversion_inference) and [DreamBooth](./dreambooth) adapters too.

View File

@@ -280,5 +280,5 @@ This is really what 🧨 Diffusers is designed for: to make it intuitive and eas
For your next steps, feel free to:
* Learn how to [build and contribute a pipeline](../using-diffusers/contribute_pipeline) to 🧨 Diffusers. We can't wait and see what you'll come up with!
* Learn how to [build and contribute a pipeline](../conceptual/contribution) to 🧨 Diffusers. We can't wait and see what you'll come up with!
* Explore [existing pipelines](../api/pipelines/overview) in the library, and see if you can deconstruct and build a pipeline from scratch using the models and schedulers separately.

View File

@@ -14,51 +14,47 @@ specific language governing permissions and limitations under the License.
## Preamble [[preamble]]
[Diffusers](https://huggingface.co/docs/diffusers/index) provides pretrained diffusion models and serves as a modular toolbox for inference and training.
[Diffusers](https://huggingface.co/docs/diffusers/index) provides pretrained diffusion models, and serves as a modular toolbox for inference and training.
Considering the real-world applications of this technology and the negative impact it can have on society, we believe it is important to provide ethical guidelines for the development, user contributions, and use of the Diffusers library.
The risks of using this technology are still under review, but to name a few: copyright issues for artists; misuse of deepfakes; generation of sexual content in inappropriate contexts; impersonation without consent; and harmful social biases that perpetuate the oppression of minority groups.
We will continuously track risks and adapt the following guidelines based on the community's responses and valuable feedback.
Given the real-world use cases of this technology and the potential negative impact it can have on society, we believe it is important to provide ethical guidelines for the development, user contributions, and use of the Diffusers library.
The risks associated with the use of this technology are still under review, but to name a few: copyright issues for artists, misuse of deepfakes, generation of sexual content in inappropriate contexts, non-consensual impersonation, and harmful social biases that perpetuate the oppression of minority groups.
We will continuously track these risks and adjust the guidelines below based on the community's responses and valuable feedback.
## Scope [[scope]]
The Diffusers community will apply the following ethical guidelines to the project's development and help coordinate community contributions, especially on sensitive topics related to ethical concerns.
The Diffusers community applies the following ethical guidelines to the project's development, and will help coordinate community contributions, especially on sensitive topics related to ethical concerns.
## Ethical guidelines [[ethical-guidelines]]
The following ethical guidelines apply in general, but we will apply them with priority when making technical choices on ethically sensitive issues. Furthermore, we commit to adjusting these ethical principles as new risks emerge with the latest developments in the technology.
The following ethical guidelines apply in general, but they take priority in technical choices that touch on ethically sensitive issues. We also commit to continuously adjusting these ethical principles as new risks emerge with the latest developments in the technology.
- **Transparency**: we are committed to being transparent in managing PRs, explaining our choices to users, and making technical decisions.
- **Transparency**: we are committed to maintaining transparency in managing PRs, explaining the reasons behind our choices to users, and throughout our technical decision-making.
- **Consistency**: we are committed to guaranteeing users the same level of attention in project management and to keeping the project technically stable and consistent.
- **Consistency**: we are committed to guaranteeing all users the same level of attention in project management, and to keeping the project technically stable and consistent.
- **Simplicity**: to make the Diffusers library easy to use and leverage, we are committed to keeping the project's goals lean and coherent.
- **Simplicity**: so that the Diffusers library can be used and leveraged easily, we are committed to keeping the project's goals lean and coherent.
- **Accessibility**: the Diffusers project lowers the barrier to entry for contributors, who can take part in running the project even without technical expertise. Doing so makes research artifacts more accessible to the community.
- **Accessibility**: the Diffusers project lowers the barrier to entry so that anyone can contribute even without technical expertise. This makes research artifacts more accessible to the community.
- **Reproducibility**: we aim to be transparent about the reproducibility of upstream code, models, and datasets provided through the Diffusers library.
- **Responsibility**: together with the community and through teamwork, we hold a joint responsibility for anticipating and mitigating the potential risks and dangers of this technology.
- **Reproducibility**: we aim to be transparent about the reproducibility of the upstream code, models, and datasets made available through the Diffusers library.
- **Responsibility**: together with the community and through teamwork, we take joint responsibility for anticipating and mitigating the potential risks of this technology.
## Examples of implementations: safety features and mechanisms [[examples-of-implementations-safety-features-and-mechanisms]]
The team works to provide technical and non-technical tools to address the potential ethical and social risks associated with diffusion technology. In addition, the community's involvement is invaluable for implementing these features and raising awareness with us.
The team strives to provide technical and non-technical tools to address the potential ethical and social risks associated with diffusion technology. The community's involvement is also essential to implementing these features and raising awareness.
- [**Community tab**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): it enables the community to discuss a project and collaborate better.
- [**Community tab**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): it lets the community discuss projects and supports better collaboration.
- **Bias exploration and evaluation**: the Hugging Face team provides a [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer) to demonstrate the biases in Stable Diffusion models interactively. In this sense, we support and encourage bias exploration and evaluation.
- **Bias exploration and evaluation**: the Hugging Face team provides a [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer) to demonstrate the biases in Stable Diffusion models interactively. We support and encourage this kind of bias exploration and evaluation.
- **Encouraging safety in deployment**
- [**Safe Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_safe): it mitigates the problem that models like Stable Diffusion, trained on unfiltered web-crawled datasets, are prone to inappropriate degeneration. Related paper: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://huggingface.co/papers/2211.05105).
- [**Safe Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_safe): it mitigates the tendency of models like Stable Diffusion, trained on unfiltered web-crawled datasets, to degenerate inappropriately. Related paper: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://huggingface.co/papers/2211.05105).
- [**Safety checker**](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py): after an image is generated, it checks and compares, in embedding space, the probability that the image matches a set of hard-coded harmful concept classes. The harmful concepts are intentionally hidden to prevent reverse engineering.
- [**Safety checker**](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py): it checks and compares, in embedding space, the probability that a generated image matches a set of hard-coded harmful concept classes. The harmful concepts are intentionally hidden to prevent reverse engineering.
- **Staged release on the Hub**: in particularly sensitive situations, access to some repositories should be restricted. Staged releases are an intermediate step that gives repository authors more control over how their work is used.
- **Staged release on the Hub**: in particularly sensitive situations, access to some repositories can be restricted. Staged releases are an intermediate step that gives repository authors more control over how their work is used.
- **Licensing**: new types of licensing such as [OpenRAILs](https://huggingface.co/blog/open_rail) make it possible to guarantee free access while placing a set of restrictions for responsible use.
- **Licensing**: new types of licenses such as [OpenRAILs](https://huggingface.co/blog/open_rail) make it possible to guarantee free access while placing a set of restrictions for more responsible use.

View File

@@ -1338,7 +1338,7 @@ def main(args):
batch["pixel_values"] = batch["pixel_values"].to(
accelerator.device, non_blocking=True, dtype=vae.dtype
)
latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
if train_dataset.custom_instance_prompts:
with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
prompt_embeds, prompt_embeds_mask = compute_text_embeddings(

View File

@@ -5,4 +5,4 @@ datasets>=2.19.1
ftfy
tensorboard
Jinja2
peft==0.7.0
peft>=0.17.0

View File

@@ -5,4 +5,4 @@ ftfy
tensorboard
Jinja2
datasets
peft==0.7.0
peft>=0.17.0

View File

@@ -369,6 +369,15 @@ def get_spatial_latent_upsampler_config(version: str) -> Dict[str, Any]:
"spatial_upsample": True,
"temporal_upsample": False,
}
elif version == "0.9.8":
config = {
"in_channels": 128,
"mid_channels": 512,
"num_blocks_per_stage": 4,
"dims": 3,
"spatial_upsample": True,
"temporal_upsample": False,
}
else:
raise ValueError(f"Unsupported version: {version}")
return config
@@ -402,7 +411,7 @@ def get_args():
"--version",
type=str,
default="0.9.0",
choices=["0.9.0", "0.9.1", "0.9.5", "0.9.7"],
choices=["0.9.0", "0.9.1", "0.9.5", "0.9.7", "0.9.8"],
help="Version of the LTX model",
)
return parser.parse_args()

View File

@@ -145,6 +145,7 @@ _deps = [
"black",
"phonemizer",
"opencv-python",
"timm",
]
# this is a lookup table with items like:
@@ -218,7 +219,7 @@ class DepsTableUpdateCommand(Command):
extras = {}
extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder")
extras["docs"] = deps_list("hf-doc-builder")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft", "timm")
extras["test"] = deps_list(
"compel",
"GitPython",

View File

@@ -386,10 +386,14 @@ else:
_import_structure["modular_pipelines"].extend(
[
"FluxAutoBlocks",
"FluxKontextAutoBlocks",
"FluxKontextModularPipeline",
"FluxModularPipeline",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditModularPipeline",
"QwenImageEditPlusAutoBlocks",
"QwenImageEditPlusModularPipeline",
"QwenImageModularPipeline",
"StableDiffusionXLAutoBlocks",
"StableDiffusionXLModularPipeline",
@@ -1048,10 +1052,14 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
else:
from .modular_pipelines import (
FluxAutoBlocks,
FluxKontextAutoBlocks,
FluxKontextModularPipeline,
FluxModularPipeline,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageEditPlusAutoBlocks,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
StableDiffusionXLAutoBlocks,
StableDiffusionXLModularPipeline,

View File

@@ -52,4 +52,5 @@ deps = {
"black": "black",
"phonemizer": "phonemizer",
"opencv-python": "opencv-python",
"timm": "timm",
}

View File

@@ -17,7 +17,10 @@ from dataclasses import dataclass
from typing import Dict, List, Type, Union
import torch
import torch.distributed._functional_collectives as funcol
if torch.distributed.is_available():
import torch.distributed._functional_collectives as funcol
from ..models._modeling_parallel import (
ContextParallelConfig,

View File

@@ -18,7 +18,6 @@ import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import logging

View File

@@ -23,7 +23,6 @@ from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin

View File

@@ -17,7 +17,6 @@ from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin

View File

@@ -16,7 +16,6 @@ from math import gcd
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from ...configuration_utils import ConfigMixin, register_to_config

View File

@@ -18,7 +18,6 @@ from typing import Dict, Optional, Union
import numpy as np
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import logging

View File

@@ -353,7 +353,9 @@ class LTXVideoTransformerBlock(nn.Module):
norm_hidden_states = self.norm1(hidden_states)
num_ada_params = self.scale_shift_table.shape[0]
ada_values = self.scale_shift_table[None, None] + temb.reshape(batch_size, temb.size(1), num_ada_params, -1)
ada_values = self.scale_shift_table[None, None].to(temb.device) + temb.reshape(
batch_size, temb.size(1), num_ada_params, -1
)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa

View File

@@ -682,12 +682,12 @@ class WanTransformer3DModel(
# 5. Output norm, projection & unpatchify
if temb.ndim == 3:
# batch_size, seq_len, inner_dim (wan 2.2 ti2v)
shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
shift, scale = (self.scale_shift_table.unsqueeze(0).to(temb.device) + temb.unsqueeze(2)).chunk(2, dim=2)
shift = shift.squeeze(2)
scale = scale.squeeze(2)
else:
# batch_size, inner_dim
shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
# Move the shift and scale tensors to the same device as hidden_states.
# When using multi-GPU inference via accelerate these will be on the

View File

@@ -103,7 +103,7 @@ class WanVACETransformerBlock(nn.Module):
control_hidden_states = control_hidden_states + hidden_states
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
self.scale_shift_table + temb.float()
self.scale_shift_table.to(temb.device) + temb.float()
).chunk(6, dim=1)
# 1. Self-attention
@@ -361,7 +361,7 @@ class WanVACETransformer3DModel(
hidden_states = hidden_states + control_hint * scale
# 6. Output norm, projection & unpatchify
shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
# Move the shift and scale tensors to the same device as hidden_states.
# When using multi-GPU inference via accelerate these will be on the

View File

@@ -16,7 +16,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin

View File

@@ -18,7 +18,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import UNet2DConditionLoadersMixin

View File

@@ -16,7 +16,6 @@ from typing import Any, Dict, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import UNet2DConditionLoadersMixin

View File

@@ -16,7 +16,6 @@ from dataclasses import dataclass
from typing import Dict, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...configuration_utils import ConfigMixin, register_to_config

View File

@@ -18,7 +18,6 @@ from typing import Any, Dict, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, UNet2DConditionLoadersMixin

View File

@@ -46,12 +46,19 @@ else:
]
_import_structure["stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline"]
_import_structure["wan"] = ["WanAutoBlocks", "WanModularPipeline"]
_import_structure["flux"] = ["FluxAutoBlocks", "FluxModularPipeline"]
_import_structure["flux"] = [
"FluxAutoBlocks",
"FluxModularPipeline",
"FluxKontextAutoBlocks",
"FluxKontextModularPipeline",
]
_import_structure["qwenimage"] = [
"QwenImageAutoBlocks",
"QwenImageModularPipeline",
"QwenImageEditModularPipeline",
"QwenImageEditAutoBlocks",
"QwenImageEditPlusModularPipeline",
"QwenImageEditPlusAutoBlocks",
]
_import_structure["components_manager"] = ["ComponentsManager"]
@@ -63,7 +70,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from ..utils.dummy_pt_objects import * # noqa F403
else:
from .components_manager import ComponentsManager
from .flux import FluxAutoBlocks, FluxModularPipeline
from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
from .modular_pipeline import (
AutoPipelineBlocks,
BlockState,
@@ -78,6 +85,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageEditPlusAutoBlocks,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
)
from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline

View File

@@ -25,14 +25,18 @@ else:
_import_structure["modular_blocks"] = [
"ALL_BLOCKS",
"AUTO_BLOCKS",
"AUTO_BLOCKS_KONTEXT",
"FLUX_KONTEXT_BLOCKS",
"TEXT2IMAGE_BLOCKS",
"FluxAutoBeforeDenoiseStep",
"FluxAutoBlocks",
"FluxAutoBlocks",
"FluxAutoDecodeStep",
"FluxAutoDenoiseStep",
"FluxKontextAutoBlocks",
"FluxKontextAutoDenoiseStep",
"FluxKontextBeforeDenoiseStep",
]
_import_structure["modular_pipeline"] = ["FluxModularPipeline"]
_import_structure["modular_pipeline"] = ["FluxKontextModularPipeline", "FluxModularPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
@@ -45,13 +49,18 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from .modular_blocks import (
ALL_BLOCKS,
AUTO_BLOCKS,
AUTO_BLOCKS_KONTEXT,
FLUX_KONTEXT_BLOCKS,
TEXT2IMAGE_BLOCKS,
FluxAutoBeforeDenoiseStep,
FluxAutoBlocks,
FluxAutoDecodeStep,
FluxAutoDenoiseStep,
FluxKontextAutoBlocks,
FluxKontextAutoDenoiseStep,
FluxKontextBeforeDenoiseStep,
)
from .modular_pipeline import FluxModularPipeline
from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline
else:
import sys

View File

@@ -13,12 +13,12 @@
# limitations under the License.
import inspect
from typing import Any, List, Optional, Tuple, Union
from typing import List, Optional, Union
import numpy as np
import torch
from ...models import AutoencoderKL
from ...pipelines import FluxPipeline
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import logging
from ...utils.torch_utils import randn_tensor
@@ -104,48 +104,6 @@ def calculate_shift(
return mu
# Adapted from the original implementation.
def prepare_latents_img2img(
vae, scheduler, image, timestep, batch_size, num_channels_latents, height, width, dtype, device, generator
):
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
latent_channels = vae.config.latent_channels
# VAE applies 8x compression on images but we must also account for packing which requires
# latent height and width to be divisible by 2.
height = 2 * (int(height) // (vae_scale_factor * 2))
width = 2 * (int(width) // (vae_scale_factor * 2))
shape = (batch_size, num_channels_latents, height, width)
latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
image = image.to(device=device, dtype=dtype)
if image.shape[1] != latent_channels:
image_latents = _encode_vae_image(image=image, generator=generator)
else:
image_latents = image
if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
# expand init_latents for batch_size
additional_image_per_prompt = batch_size // image_latents.shape[0]
image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
)
else:
image_latents = torch.cat([image_latents], dim=0)
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
latents = scheduler.scale_noise(image_latents, timestep, noise)
latents = _pack_latents(latents, batch_size, num_channels_latents, height, width)
return latents, latent_image_ids
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -160,43 +118,6 @@ def retrieve_latents(
raise AttributeError("Could not access latents of provided encoder_output")
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
latents = latents.permute(0, 2, 4, 1, 3, 5)
latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
return latents
def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
latent_image_ids = torch.zeros(height, width, 3)
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
latent_image_ids = latent_image_ids.reshape(
latent_image_id_height * latent_image_id_width, latent_image_id_channels
)
return latent_image_ids.to(device=device, dtype=dtype)
# Cannot use "# Copied from" because it introduces weird indentation errors.
def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator):
if isinstance(generator, list):
image_latents = [
retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(vae.encode(image), generator=generator)
image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor
return image_latents
def _get_initial_timesteps_and_optionals(
transformer,
scheduler,
@@ -231,92 +152,6 @@ def _get_initial_timesteps_and_optionals(
return timesteps, num_inference_steps, sigmas, guidance
class FluxInputStep(ModularPipelineBlocks):
model_name = "flux"
@property
def description(self) -> str:
return (
"Input processing step that:\n"
" 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
" 2. Adjusts input tensor shapes based on `batch_size` (number of prompts) and `num_images_per_prompt`\n\n"
"All input tensors are expected to have either batch_size=1 or match the batch_size\n"
"of prompt_embeds. The tensors will be duplicated across the batch dimension to\n"
"have a final batch_size of batch_size * num_images_per_prompt."
)
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("num_images_per_prompt", default=1),
InputParam(
"prompt_embeds",
required=True,
type_hint=torch.Tensor,
description="Pre-generated text embeddings. Can be generated from text_encoder step.",
),
InputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.",
),
# TODO: support negative embeddings?
]
@property
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"batch_size",
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
),
OutputParam(
"dtype",
type_hint=torch.dtype,
description="Data type of model tensor inputs (determined by `prompt_embeds`)",
),
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
description="text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
description="pooled text embeddings used to guide the image generation",
),
# TODO: support negative embeddings?
]
def check_inputs(self, components, block_state):
if block_state.prompt_embeds is not None and block_state.pooled_prompt_embeds is not None:
if block_state.prompt_embeds.shape[0] != block_state.pooled_prompt_embeds.shape[0]:
raise ValueError(
"`prompt_embeds` and `pooled_prompt_embeds` must have the same batch size when passed directly, but"
f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `pooled_prompt_embeds`"
f" {block_state.pooled_prompt_embeds.shape}."
)
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
# TODO: consider adding negative embeddings?
block_state = self.get_block_state(state)
self.check_inputs(components, block_state)
block_state.batch_size = block_state.prompt_embeds.shape[0]
block_state.dtype = block_state.prompt_embeds.dtype
_, seq_len, _ = block_state.prompt_embeds.shape
block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1)
block_state.prompt_embeds = block_state.prompt_embeds.view(
block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
)
self.set_block_state(state, block_state)
return components, state
class FluxSetTimestepsStep(ModularPipelineBlocks):
model_name = "flux"
@@ -385,6 +220,10 @@ class FluxSetTimestepsStep(ModularPipelineBlocks):
block_state.sigmas = sigmas
block_state.guidance = guidance
# We set the index here to remove DtoH sync, helpful especially during compilation.
# Check out more details here: https://github.com/huggingface/diffusers/pull/11696
components.scheduler.set_begin_index(0)
self.set_block_state(state, block_state)
return components, state
@@ -428,11 +267,6 @@ class FluxImg2ImgSetTimestepsStep(ModularPipelineBlocks):
type_hint=int,
description="The number of denoising steps to perform at inference time",
),
OutputParam(
"latent_timestep",
type_hint=torch.Tensor,
description="The timestep that represents the initial noise level for image-to-image generation",
),
OutputParam("guidance", type_hint=torch.Tensor, description="Optional guidance to be used."),
]
@@ -480,8 +314,6 @@ class FluxImg2ImgSetTimestepsStep(ModularPipelineBlocks):
block_state.sigmas = sigmas
block_state.guidance = guidance
block_state.latent_timestep = timesteps[:1].repeat(batch_size)
self.set_block_state(state, block_state)
return components, state
@@ -520,11 +352,6 @@ class FluxPrepareLatentsStep(ModularPipelineBlocks):
OutputParam(
"latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
),
OutputParam(
"latent_image_ids",
type_hint=torch.Tensor,
description="IDs computed from the image sequence needed for RoPE",
),
]
@staticmethod
@@ -548,20 +375,13 @@ class FluxPrepareLatentsStep(ModularPipelineBlocks):
generator,
latents=None,
):
# Couldn't use the `prepare_latents` method directly from Flux because I decided to copy over
# the packing methods here. So, for example, `comp._pack_latents()` won't work if we were
# to go with the "# Copied from ..." approach. Or maybe there's a way?
# VAE applies 8x compression on images but we must also account for packing which requires
# latent height and width to be divisible by 2.
height = 2 * (int(height) // (comp.vae_scale_factor * 2))
width = 2 * (int(width) // (comp.vae_scale_factor * 2))
shape = (batch_size, num_channels_latents, height, width)
if latents is not None:
latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
return latents.to(device=device, dtype=dtype), latent_image_ids
return latents.to(device=device, dtype=dtype)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
@@ -569,26 +389,23 @@ class FluxPrepareLatentsStep(ModularPipelineBlocks):
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
# TODO: move packing latents code to a patchifier similar to Qwen
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
latents = _pack_latents(latents, batch_size, num_channels_latents, height, width)
latents = FluxPipeline._pack_latents(latents, batch_size, num_channels_latents, height, width)
latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
return latents, latent_image_ids
return latents
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.height = block_state.height or components.default_height
block_state.width = block_state.width or components.default_width
block_state.device = components._execution_device
block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this?
block_state.num_channels_latents = components.num_channels_latents
self.check_inputs(components, block_state)
batch_size = block_state.batch_size * block_state.num_images_per_prompt
block_state.latents, block_state.latent_image_ids = self.prepare_latents(
block_state.latents = self.prepare_latents(
components,
batch_size,
block_state.num_channels_latents,
@@ -608,82 +425,194 @@ class FluxPrepareLatentsStep(ModularPipelineBlocks):
class FluxImg2ImgPrepareLatentsStep(ModularPipelineBlocks):
model_name = "flux"
@property
def expected_components(self) -> List[ComponentSpec]:
return [ComponentSpec("vae", AutoencoderKL), ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
@property
def description(self) -> str:
return "Step that prepares the latents for the image-to-image generation process"
return "Step that adds noise to image latents for image-to-image. Should be run after `set_timesteps`,"
" `prepare_latents`. Both noise and image latents should already be patchified."
@property
def inputs(self) -> List[Tuple[str, Any]]:
def expected_components(self) -> List[ComponentSpec]:
return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("height", type_hint=int),
InputParam("width", type_hint=int),
InputParam("latents", type_hint=Optional[torch.Tensor]),
InputParam("num_images_per_prompt", type_hint=int, default=1),
InputParam("generator"),
InputParam(
"image_latents",
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents representing the reference image for image-to-image/inpainting generation. Can be generated in vae_encode step.",
description="The initial random noised, can be generated in prepare latent step.",
),
InputParam(
"latent_timestep",
name="image_latents",
required=True,
type_hint=torch.Tensor,
description="The timestep that represents the initial noise level for image-to-image/inpainting generation. Can be generated in set_timesteps step.",
description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.",
),
InputParam(
"batch_size",
name="timesteps",
required=True,
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam("dtype", required=True, type_hint=torch.dtype, description="The dtype of the model inputs"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
"latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
),
OutputParam(
"latent_image_ids",
name="initial_noise",
type_hint=torch.Tensor,
description="IDs computed from the image sequence needed for RoPE",
description="The initial random noised used for inpainting denoising.",
),
]
@staticmethod
def check_inputs(image_latents, latents):
if image_latents.shape[0] != latents.shape[0]:
raise ValueError(
f"`image_latents` must have have same batch size as `latents`, but got {image_latents.shape[0]} and {latents.shape[0]}"
)
if image_latents.ndim != 3:
raise ValueError(f"`image_latents` must have 3 dimensions (patchified), but got {image_latents.ndim}")
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.device = components._execution_device
block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this?
block_state.num_channels_latents = components.num_channels_latents
block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
block_state.device = components._execution_device
self.check_inputs(image_latents=block_state.image_latents, latents=block_state.latents)
# TODO: implement `check_inputs`
batch_size = block_state.batch_size * block_state.num_images_per_prompt
if block_state.latents is None:
block_state.latents, block_state.latent_image_ids = prepare_latents_img2img(
components.vae,
components.scheduler,
block_state.image_latents,
block_state.latent_timestep,
batch_size,
block_state.num_channels_latents,
block_state.height,
block_state.width,
block_state.dtype,
block_state.device,
block_state.generator,
)
# prepare latent timestep
latent_timestep = block_state.timesteps[:1].repeat(block_state.latents.shape[0])
# make copy of initial_noise
block_state.initial_noise = block_state.latents
# scale noise
block_state.latents = components.scheduler.scale_noise(
block_state.image_latents, latent_timestep, block_state.latents
)
self.set_block_state(state, block_state)
return components, state
class FluxRoPEInputsStep(ModularPipelineBlocks):
model_name = "flux"
@property
def description(self) -> str:
return "Step that prepares the RoPE inputs for the denoising process. Should be placed after text encoder and latent preparation steps."
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="txt_ids",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the prompt embeds, used for RoPE calculation.",
),
OutputParam(
name="img_ids",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the image latents, used for RoPE calculation.",
),
]
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
prompt_embeds = block_state.prompt_embeds
device, dtype = prompt_embeds.device, prompt_embeds.dtype
block_state.txt_ids = torch.zeros(prompt_embeds.shape[1], 3).to(
device=prompt_embeds.device, dtype=prompt_embeds.dtype
)
height = 2 * (int(block_state.height) // (components.vae_scale_factor * 2))
width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2))
block_state.img_ids = FluxPipeline._prepare_latent_image_ids(None, height // 2, width // 2, device, dtype)
self.set_block_state(state, block_state)
return components, state
class FluxKontextRoPEInputsStep(ModularPipelineBlocks):
model_name = "flux-kontext"
@property
def description(self) -> str:
return "Step that prepares the RoPE inputs for the denoising process of Flux Kontext. Should be placed after text encoder and latent preparation steps."
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="image_height"),
InputParam(name="image_width"),
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="prompt_embeds"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="txt_ids",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the prompt embeds, used for RoPE calculation.",
),
OutputParam(
name="img_ids",
kwargs_type="denoiser_input_fields",
type_hint=List[int],
description="The sequence lengths of the image latents, used for RoPE calculation.",
),
]
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
prompt_embeds = block_state.prompt_embeds
device, dtype = prompt_embeds.device, prompt_embeds.dtype
block_state.txt_ids = torch.zeros(prompt_embeds.shape[1], 3).to(
device=prompt_embeds.device, dtype=prompt_embeds.dtype
)
img_ids = None
if (
getattr(block_state, "image_height", None) is not None
and getattr(block_state, "image_width", None) is not None
):
image_latent_height = 2 * (int(block_state.image_height) // (components.vae_scale_factor * 2))
image_latent_width = 2 * (int(block_state.image_width) // (components.vae_scale_factor * 2))
img_ids = FluxPipeline._prepare_latent_image_ids(
None, image_latent_height // 2, image_latent_width // 2, device, dtype
)
# image ids are the same as latent ids with the first dimension set to 1 instead of 0
img_ids[..., 0] = 1
height = 2 * (int(block_state.height) // (components.vae_scale_factor * 2))
width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2))
latent_ids = FluxPipeline._prepare_latent_image_ids(None, height // 2, width // 2, device, dtype)
if img_ids is not None:
latent_ids = torch.cat([latent_ids, img_ids], dim=0)
block_state.img_ids = latent_ids
self.set_block_state(state, block_state)

View File

@@ -76,18 +76,17 @@ class FluxLoopDenoiser(ModularPipelineBlocks):
description="Pooled prompt embeddings",
),
InputParam(
"text_ids",
"txt_ids",
required=True,
type_hint=torch.Tensor,
description="IDs computed from text sequence needed for RoPE",
),
InputParam(
"latent_image_ids",
"img_ids",
required=True,
type_hint=torch.Tensor,
description="IDs computed from image sequence needed for RoPE",
),
# TODO: guidance
]
@torch.no_grad()
@@ -101,8 +100,8 @@ class FluxLoopDenoiser(ModularPipelineBlocks):
encoder_hidden_states=block_state.prompt_embeds,
pooled_projections=block_state.pooled_prompt_embeds,
joint_attention_kwargs=block_state.joint_attention_kwargs,
txt_ids=block_state.text_ids,
img_ids=block_state.latent_image_ids,
txt_ids=block_state.txt_ids,
img_ids=block_state.img_ids,
return_dict=False,
)[0]
block_state.noise_pred = noise_pred
@@ -110,6 +109,96 @@ class FluxLoopDenoiser(ModularPipelineBlocks):
return components, block_state
class FluxKontextLoopDenoiser(ModularPipelineBlocks):
model_name = "flux-kontext"
@property
def expected_components(self) -> List[ComponentSpec]:
return [ComponentSpec("transformer", FluxTransformer2DModel)]
@property
def description(self) -> str:
return (
"Step within the denoising loop that denoise the latents for Flux Kontext. "
"This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
"object (e.g. `FluxDenoiseLoopWrapper`)"
)
@property
def inputs(self) -> List[Tuple[str, Any]]:
return [
InputParam("joint_attention_kwargs"),
InputParam(
"latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
),
InputParam(
"image_latents",
type_hint=torch.Tensor,
description="Image latents to use for the denoising process. Can be generated in prepare_latent step.",
),
InputParam(
"guidance",
required=True,
type_hint=torch.Tensor,
description="Guidance scale as a tensor",
),
InputParam(
"prompt_embeds",
required=True,
type_hint=torch.Tensor,
description="Prompt embeddings",
),
InputParam(
"pooled_prompt_embeds",
required=True,
type_hint=torch.Tensor,
description="Pooled prompt embeddings",
),
InputParam(
"txt_ids",
required=True,
type_hint=torch.Tensor,
description="IDs computed from text sequence needed for RoPE",
),
InputParam(
"img_ids",
required=True,
type_hint=torch.Tensor,
description="IDs computed from latent sequence needed for RoPE",
),
]
@torch.no_grad()
def __call__(
self, components: FluxModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
) -> PipelineState:
latents = block_state.latents
latent_model_input = latents
image_latents = block_state.image_latents
if image_latents is not None:
latent_model_input = torch.cat([latent_model_input, image_latents], dim=1)
timestep = t.expand(latents.shape[0]).to(latents.dtype)
noise_pred = components.transformer(
hidden_states=latent_model_input,
timestep=timestep / 1000,
guidance=block_state.guidance,
encoder_hidden_states=block_state.prompt_embeds,
pooled_projections=block_state.pooled_prompt_embeds,
joint_attention_kwargs=block_state.joint_attention_kwargs,
txt_ids=block_state.txt_ids,
img_ids=block_state.img_ids,
return_dict=False,
)[0]
noise_pred = noise_pred[:, : latents.size(1)]
block_state.noise_pred = noise_pred
return components, block_state
class FluxLoopAfterDenoiser(ModularPipelineBlocks):
model_name = "flux"
@@ -195,9 +284,6 @@ class FluxDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
block_state.num_warmup_steps = max(
len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0
)
# We set the index here to remove DtoH sync, helpful especially during compilation.
# Check out more details here: https://github.com/huggingface/diffusers/pull/11696
components.scheduler.set_begin_index(0)
with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
for i, t in enumerate(block_state.timesteps):
components, block_state = self.loop_step(components, block_state, i=i, t=t)
@@ -225,3 +311,20 @@ class FluxDenoiseStep(FluxDenoiseLoopWrapper):
" - `FluxLoopAfterDenoiser`\n"
"This block supports both text2image and img2img tasks."
)
class FluxKontextDenoiseStep(FluxDenoiseLoopWrapper):
model_name = "flux-kontext"
block_classes = [FluxKontextLoopDenoiser, FluxLoopAfterDenoiser]
block_names = ["denoiser", "after_denoiser"]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoise the latents. \n"
"Its loop logic is defined in `FluxDenoiseLoopWrapper.__call__` method \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `FluxKontextLoopDenoiser`\n"
" - `FluxLoopAfterDenoiser`\n"
"This block supports both text2image and img2img tasks."
)

View File

@@ -20,12 +20,12 @@ import torch
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ...image_processor import VaeImageProcessor, is_valid_image, is_valid_image_imagelist
from ...loaders import FluxLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL
from ...utils import USE_PEFT_BACKEND, is_ftfy_available, logging, scale_lora_layers, unscale_lora_layers
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import FluxModularPipeline
@@ -67,89 +67,219 @@ def retrieve_latents(
raise AttributeError("Could not access latents of provided encoder_output")
class FluxVaeEncoderStep(ModularPipelineBlocks):
def encode_vae_image(vae: AutoencoderKL, image: torch.Tensor, generator: torch.Generator, sample_mode="sample"):
if isinstance(generator, list):
image_latents = [
retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode)
for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode)
image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor
return image_latents
class FluxProcessImagesInputStep(ModularPipelineBlocks):
model_name = "flux"
@property
def description(self) -> str:
return "Vae Encoder step that encode the input image into a latent representation"
return "Image Preprocess step."
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("vae", AutoencoderKL),
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 16}),
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(name="processed_image")]
@staticmethod
def check_inputs(height, width, vae_scale_factor):
if height is not None and height % (vae_scale_factor * 2) != 0:
raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")
if width is not None and width % (vae_scale_factor * 2) != 0:
raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
if block_state.resized_image is None and block_state.image is None:
raise ValueError("`resized_image` and `image` cannot be None at the same time")
if block_state.resized_image is None:
image = block_state.image
self.check_inputs(
height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
else:
width, height = block_state.resized_image[0].size
image = block_state.resized_image
block_state.processed_image = components.image_processor.preprocess(image=image, height=height, width=width)
self.set_block_state(state, block_state)
return components, state
class FluxKontextProcessImagesInputStep(ModularPipelineBlocks):
model_name = "flux-kontext"
def __init__(self, _auto_resize=True):
self._auto_resize = _auto_resize
super().__init__()
@property
def description(self) -> str:
return (
"Image preprocess step for Flux Kontext. The preprocessed image goes to the VAE.\n"
"Kontext works as a T2I model, too, in case no input image is provided."
)
@property
def expected_components(self) -> List[ComponentSpec]:
return [
InputParam("image", required=True),
InputParam("height"),
InputParam("width"),
InputParam("generator"),
InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
InputParam(
"preprocess_kwargs",
type_hint=Optional[dict],
description="A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under `self.image_processor` in [diffusers.image_processor.VaeImageProcessor]",
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [InputParam("image")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(name="processed_image")]
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState):
from ...pipelines.flux.pipeline_flux_kontext import PREFERRED_KONTEXT_RESOLUTIONS
block_state = self.get_block_state(state)
images = block_state.image
if images is None:
block_state.processed_image = None
else:
multiple_of = components.image_processor.config.vae_scale_factor
if not is_valid_image_imagelist(images):
raise ValueError(f"Images must be image or list of images but are {type(images)}")
if is_valid_image(images):
images = [images]
img = images[0]
image_height, image_width = components.image_processor.get_default_height_width(img)
aspect_ratio = image_width / image_height
if self._auto_resize:
# Kontext is trained on specific resolutions, using one of them is recommended
_, image_width, image_height = min(
(abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
)
image_width = image_width // multiple_of * multiple_of
image_height = image_height // multiple_of * multiple_of
images = components.image_processor.resize(images, image_height, image_width)
block_state.processed_image = components.image_processor.preprocess(images, image_height, image_width)
self.set_block_state(state, block_state)
return components, state
class FluxVaeEncoderDynamicStep(ModularPipelineBlocks):
model_name = "flux"
def __init__(
self, input_name: str = "processed_image", output_name: str = "image_latents", sample_mode: str = "sample"
):
"""Initialize a VAE encoder step for converting images to latent representations.
Both the input and output names are configurable so this block can be configured to process to different image
inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents").
Args:
input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
Examples: "processed_image" or "processed_control_image"
output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".
Examples: "image_latents" or "control_image_latents"
sample_mode (str, optional): Sampling mode to be used.
Examples:
# Basic usage with default settings (includes image processor):
FluxImageVaeEncoderDynamicStep()
# Custom input/output names for control image:
FluxImageVaeEncoderDynamicStep(
input_name="processed_control_image", output_name="control_image_latents"
)
"""
self._image_input_name = input_name
self._image_latents_output_name = output_name
self.sample_mode = sample_mode
super().__init__()
@property
def description(self) -> str:
return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"
@property
def expected_components(self) -> List[ComponentSpec]:
components = [ComponentSpec("vae", AutoencoderKL)]
return components
@property
def inputs(self) -> List[InputParam]:
inputs = [InputParam(self._image_input_name), InputParam("generator")]
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
"image_latents",
self._image_latents_output_name,
type_hint=torch.Tensor,
description="The latents representing the reference image for image-to-image/inpainting generation",
description="The latents representing the reference image",
)
]
@staticmethod
# Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image with self.vae->vae
def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator):
if isinstance(generator, list):
image_latents = [
retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(vae.encode(image), generator=generator)
image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor
return image_latents
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.preprocess_kwargs = block_state.preprocess_kwargs or {}
block_state.device = components._execution_device
block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
image = getattr(block_state, self._image_input_name)
block_state.image = components.image_processor.preprocess(
block_state.image, height=block_state.height, width=block_state.width, **block_state.preprocess_kwargs
)
block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)
if image is None:
setattr(block_state, self._image_latents_output_name, None)
else:
device = components._execution_device
dtype = components.vae.dtype
image = image.to(device=device, dtype=dtype)
block_state.batch_size = block_state.image.shape[0]
# if generator is a list, make sure the length of it matches the length of images (both should be batch_size)
if isinstance(block_state.generator, list) and len(block_state.generator) != block_state.batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
f" size of {block_state.batch_size}. Make sure the batch size matches the length of the generators."
# Encode image into latents
image_latents = encode_vae_image(
image=image, vae=components.vae, generator=block_state.generator, sample_mode=self.sample_mode
)
block_state.image_latents = self._encode_vae_image(
components.vae, image=block_state.image, generator=block_state.generator
)
setattr(block_state, self._image_latents_output_name, image_latents)
self.set_block_state(state, block_state)
@@ -161,7 +291,7 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
@property
def description(self) -> str:
return "Text Encoder step that generate text_embeddings to guide the video generation"
return "Text Encoder step that generate text_embeddings to guide the image generation"
@property
def expected_components(self) -> List[ComponentSpec]:
@@ -172,15 +302,12 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
ComponentSpec("tokenizer_2", T5TokenizerFast),
]
@property
def expected_configs(self) -> List[ConfigSpec]:
return []
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("prompt"),
InputParam("prompt_2"),
InputParam("max_sequence_length", type_hint=int, default=512, required=False),
InputParam("joint_attention_kwargs"),
]
@@ -189,19 +316,16 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
return [
OutputParam(
"prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="pooled text embeddings used to guide the image generation",
),
OutputParam(
"text_ids",
type_hint=torch.Tensor,
description="ids from the text sequence for RoPE",
),
]
@staticmethod
@@ -212,16 +336,10 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
@staticmethod
def _get_t5_prompt_embeds(
components,
prompt: Union[str, List[str]],
num_images_per_prompt: int,
max_sequence_length: int,
device: torch.device,
components, prompt: Union[str, List[str]], max_sequence_length: int, device: torch.device
):
dtype = components.text_encoder_2.dtype
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
if isinstance(components, TextualInversionLoaderMixin):
prompt = components.maybe_convert_prompt(prompt, components.tokenizer_2)
@@ -247,23 +365,11 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
prompt_embeds = components.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
_, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
return prompt_embeds
@staticmethod
def _get_clip_prompt_embeds(
components,
prompt: Union[str, List[str]],
num_images_per_prompt: int,
device: torch.device,
):
def _get_clip_prompt_embeds(components, prompt: Union[str, List[str]], device: torch.device):
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
if isinstance(components, TextualInversionLoaderMixin):
prompt = components.maybe_convert_prompt(prompt, components.tokenizer)
@@ -293,10 +399,6 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
prompt_embeds = prompt_embeds.pooler_output
prompt_embeds = prompt_embeds.to(dtype=components.text_encoder.dtype, device=device)
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
return prompt_embeds
@staticmethod
@@ -305,34 +407,11 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
prompt: Union[str, List[str]],
prompt_2: Union[str, List[str]],
device: Optional[torch.device] = None,
num_images_per_prompt: int = 1,
prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
max_sequence_length: int = 512,
lora_scale: Optional[float] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt to be encoded
prompt_2 (`str` or `List[str]`, *optional*):
The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
used in all text-encoders
device: (`torch.device`):
torch device
num_images_per_prompt (`int`):
number of images that should be generated per prompt
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
If not provided, pooled text embeddings will be generated from `prompt` input argument.
lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
"""
device = device or components._execution_device
# set lora scale so that monkey patched LoRA
@@ -357,12 +436,10 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
components,
prompt=prompt,
device=device,
num_images_per_prompt=num_images_per_prompt,
)
prompt_embeds = FluxTextEncoderStep._get_t5_prompt_embeds(
components,
prompt=prompt_2,
num_images_per_prompt=num_images_per_prompt,
max_sequence_length=max_sequence_length,
device=device,
)
@@ -377,10 +454,7 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(components.text_encoder_2, lora_scale)
dtype = components.text_encoder.dtype if components.text_encoder is not None else torch.bfloat16
text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
return prompt_embeds, pooled_prompt_embeds, text_ids
return prompt_embeds, pooled_prompt_embeds
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
@@ -396,14 +470,14 @@ class FluxTextEncoderStep(ModularPipelineBlocks):
if block_state.joint_attention_kwargs is not None
else None
)
(block_state.prompt_embeds, block_state.pooled_prompt_embeds, block_state.text_ids) = self.encode_prompt(
block_state.prompt_embeds, block_state.pooled_prompt_embeds = self.encode_prompt(
components,
prompt=block_state.prompt,
prompt_2=None,
prompt_embeds=None,
pooled_prompt_embeds=None,
device=block_state.device,
num_images_per_prompt=1, # TODO: hardcoded for now.
max_sequence_length=block_state.max_sequence_length,
lora_scale=block_state.text_encoder_lora_scale,
)

View File

@@ -0,0 +1,359 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import torch
from ...pipelines import FluxPipeline
from ...utils import logging
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import InputParam, OutputParam
# TODO: consider making these common utilities for modular if they are not pipeline-specific.
from ..qwenimage.inputs import calculate_dimension_from_latents, repeat_tensor_to_batch_size
from .modular_pipeline import FluxModularPipeline
logger = logging.get_logger(__name__)
class FluxTextInputStep(ModularPipelineBlocks):
model_name = "flux"
@property
def description(self) -> str:
return (
"Text input processing step that standardizes text embeddings for the pipeline.\n"
"This step:\n"
" 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
" 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)"
)
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("num_images_per_prompt", default=1),
InputParam(
"prompt_embeds",
required=True,
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="Pre-generated text embeddings. Can be generated from text_encoder step.",
),
InputParam(
"pooled_prompt_embeds",
kwargs_type="denoiser_input_fields",
type_hint=torch.Tensor,
description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.",
),
# TODO: support negative embeddings?
]
@property
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"batch_size",
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
),
OutputParam(
"dtype",
type_hint=torch.dtype,
description="Data type of model tensor inputs (determined by `prompt_embeds`)",
),
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="denoiser_input_fields",
description="text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="denoiser_input_fields",
description="pooled text embeddings used to guide the image generation",
),
# TODO: support negative embeddings?
]
def check_inputs(self, components, block_state):
if block_state.prompt_embeds is not None and block_state.pooled_prompt_embeds is not None:
if block_state.prompt_embeds.shape[0] != block_state.pooled_prompt_embeds.shape[0]:
raise ValueError(
"`prompt_embeds` and `pooled_prompt_embeds` must have the same batch size when passed directly, but"
f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `pooled_prompt_embeds`"
f" {block_state.pooled_prompt_embeds.shape}."
)
@torch.no_grad()
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
# TODO: consider adding negative embeddings?
block_state = self.get_block_state(state)
self.check_inputs(components, block_state)
block_state.batch_size = block_state.prompt_embeds.shape[0]
block_state.dtype = block_state.prompt_embeds.dtype
_, seq_len, _ = block_state.prompt_embeds.shape
block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1)
block_state.prompt_embeds = block_state.prompt_embeds.view(
block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
)
self.set_block_state(state, block_state)
return components, state
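
A minimal sketch of the batch expansion performed above, with assumed toy shapes: each prompt embedding is repeated `num_images_per_prompt` times and the repeats are folded back into the batch dimension.

import torch

batch_size, seq_len, dim, num_images_per_prompt = 2, 512, 4096, 3
prompt_embeds = torch.randn(batch_size, seq_len, dim)
# repeat along the sequence axis, then fold the repeats back into the batch axis
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
assert prompt_embeds.shape == (batch_size * num_images_per_prompt, seq_len, dim)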
# Adapted from `QwenImageInputsDynamicStep`
class FluxInputsDynamicStep(ModularPipelineBlocks):
model_name = "flux"
def __init__(
self,
image_latent_inputs: List[str] = ["image_latents"],
additional_batch_inputs: List[str] = [],
):
if not isinstance(image_latent_inputs, list):
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
additional_batch_inputs = [additional_batch_inputs]
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
super().__init__()
@property
def description(self) -> str:
# Functionality section
summary_section = (
"Input processing step that:\n"
" 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
" 2. For additional batch inputs: Expands batch dimensions to match final batch size"
)
# Inputs info
inputs_info = ""
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
# Placement guidance
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
return summary_section + inputs_info + placement_section
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
]
# Add image latent inputs
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
# Add additional batch inputs
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="image_height", type_hint=int, description="The height of the image latents"),
OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
]
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
# 1. Calculate height/width from latents
height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
block_state.height = block_state.height or height
block_state.width = block_state.width or width
if not hasattr(block_state, "image_height"):
block_state.image_height = height
if not hasattr(block_state, "image_width"):
block_state.image_width = width
# 2. Patchify the image latent tensor
# TODO: Implement patchifier for Flux.
latent_height, latent_width = image_latent_tensor.shape[2:]
image_latent_tensor = FluxPipeline._pack_latents(
image_latent_tensor, block_state.batch_size, image_latent_tensor.shape[1], latent_height, latent_width
)
# 3. Expand batch size
image_latent_tensor = repeat_tensor_to_batch_size(
input_name=image_latent_input_name,
input_tensor=image_latent_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, image_latent_input_name, image_latent_tensor)
# Process additional batch inputs (only batch expansion)
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, input_name, input_tensor)
self.set_block_state(state, block_state)
return components, state
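
A rough sketch of the shape contract of the latent packing used above; the exact permutation lives in `FluxPipeline._pack_latents`, this only illustrates how a (B, C, H, W) latent becomes (B, H/2 * W/2, 4C).

import torch

b, c, h, w = 1, 16, 64, 64
latents = torch.randn(b, c, h, w)
packed = (
    latents.view(b, c, h // 2, 2, w // 2, 2)
    .permute(0, 2, 4, 1, 3, 5)
    .reshape(b, (h // 2) * (w // 2), c * 4)
)
print(packed.shape)  # torch.Size([1, 1024, 64])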
class FluxKontextInputsDynamicStep(FluxInputsDynamicStep):
model_name = "flux-kontext"
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
# 1. Calculate height/width from latents
# Unlike `FluxInputsDynamicStep`, we don't overwrite `block_state.height` and `block_state.width`
height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
if not hasattr(block_state, "image_height"):
block_state.image_height = height
if not hasattr(block_state, "image_width"):
block_state.image_width = width
# 2. Patchify the image latent tensor
# TODO: Implement patchifier for Flux.
latent_height, latent_width = image_latent_tensor.shape[2:]
image_latent_tensor = FluxPipeline._pack_latents(
image_latent_tensor, block_state.batch_size, image_latent_tensor.shape[1], latent_height, latent_width
)
# 3. Expand batch size
image_latent_tensor = repeat_tensor_to_batch_size(
input_name=image_latent_input_name,
input_tensor=image_latent_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, image_latent_input_name, image_latent_tensor)
# Process additional batch inputs (only batch expansion)
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, input_name, input_tensor)
self.set_block_state(state, block_state)
return components, state
class FluxKontextSetResolutionStep(ModularPipelineBlocks):
model_name = "flux-kontext"
@property
def description(self):
return (
"Determines the height and width to be used during the subsequent computations.\n"
"It should always be placed _before_ the latent preparation step."
)
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="max_area", type_hint=int, default=1024**2),
]
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="height", type_hint=int, description="The height of the initial noisy latents"),
OutputParam(name="width", type_hint=int, description="The width of the initial noisy latents"),
]
@staticmethod
def check_inputs(height, width, vae_scale_factor):
if height is not None and height % (vae_scale_factor * 2) != 0:
raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")
if width is not None and width % (vae_scale_factor * 2) != 0:
raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
self.check_inputs(height, width, components.vae_scale_factor)
original_height, original_width = height, width
max_area = block_state.max_area
aspect_ratio = width / height
width = round((max_area * aspect_ratio) ** 0.5)
height = round((max_area / aspect_ratio) ** 0.5)
multiple_of = components.vae_scale_factor * 2
width = width // multiple_of * multiple_of
height = height // multiple_of * multiple_of
if height != original_height or width != original_width:
logger.warning(
f"Generation `height` and `width` have been adjusted to {height} and {width} to fit the model requirements."
)
block_state.height = height
block_state.width = width
self.set_block_state(state, block_state)
return components, state
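
A standalone sketch of the resolution rule above; `vae_scale_factor=8` is an assumption here, so sizes are snapped to multiples of 16 while targeting roughly `max_area` pixels.

def kontext_resolution(height, width, max_area=1024**2, vae_scale_factor=8):
    aspect_ratio = width / height
    width = round((max_area * aspect_ratio) ** 0.5)
    height = round((max_area / aspect_ratio) ** 0.5)
    multiple_of = vae_scale_factor * 2
    return height // multiple_of * multiple_of, width // multiple_of * multiple_of

print(kontext_resolution(720, 1280))  # (768, 1360) under the assumed scale factor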

View File

@@ -18,21 +18,49 @@ from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
FluxImg2ImgPrepareLatentsStep,
FluxImg2ImgSetTimestepsStep,
FluxInputStep,
FluxKontextRoPEInputsStep,
FluxPrepareLatentsStep,
FluxRoPEInputsStep,
FluxSetTimestepsStep,
)
from .decoders import FluxDecodeStep
from .denoise import FluxDenoiseStep
from .encoders import FluxTextEncoderStep, FluxVaeEncoderStep
from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep
from .encoders import (
FluxKontextProcessImagesInputStep,
FluxProcessImagesInputStep,
FluxTextEncoderStep,
FluxVaeEncoderDynamicStep,
)
from .inputs import (
FluxInputsDynamicStep,
FluxKontextInputsDynamicStep,
FluxKontextSetResolutionStep,
FluxTextInputStep,
)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# vae encoder (run before before_denoise)
FluxImg2ImgVaeEncoderBlocks = InsertableDict(
[("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep())]
)
class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
model_name = "flux"
block_classes = FluxImg2ImgVaeEncoderBlocks.values()
block_names = FluxImg2ImgVaeEncoderBlocks.keys()
@property
def description(self) -> str:
return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [FluxVaeEncoderStep]
block_classes = [FluxImg2ImgVaeEncoderStep]
block_names = ["img2img"]
block_trigger_inputs = ["image"]
@@ -41,52 +69,89 @@ class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block that works for img2img tasks.\n"
+ " - `FluxVaeEncoderStep` (img2img) is used when only `image` is provided."
+ " - if `image` is provided, step will be skipped."
+ " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided."
+ " - if `image` is not provided, step will be skipped."
)
# before_denoise: text2img, img2img
class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
FluxInputStep,
FluxPrepareLatentsStep,
FluxSetTimestepsStep,
]
block_names = ["input", "prepare_latents", "set_timesteps"]
# Flux Kontext vae encoder (run before before_denoise)
FluxKontextVaeEncoderBlocks = InsertableDict(
[("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep(sample_mode="argmax"))]
)
class FluxKontextVaeEncoderStep(SequentialPipelineBlocks):
model_name = "flux-kontext"
block_classes = FluxKontextVaeEncoderBlocks.values()
block_names = FluxKontextVaeEncoderBlocks.keys()
@property
def description(self) -> str:
return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [FluxKontextVaeEncoderStep]
block_names = ["img2img"]
block_trigger_inputs = ["image"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `FluxInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `FluxPrepareLatentsStep` is used to prepare the latents\n"
+ " - `FluxSetTimestepsStep` is used to set the timesteps\n"
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block that works for img2img tasks.\n"
+ " - `FluxKontextVaeEncoderStep` (img2img) is used when only `image` is provided."
+ " - if `image` is not provided, step will be skipped."
)
# before_denoise: text2img
FluxBeforeDenoiseBlocks = InsertableDict(
[
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxSetTimestepsStep()),
("prepare_rope_inputs", FluxRoPEInputsStep()),
]
)
class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = FluxBeforeDenoiseBlocks.values()
block_names = FluxBeforeDenoiseBlocks.keys()
@property
def description(self):
return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation."
# before_denoise: img2img
FluxImg2ImgBeforeDenoiseBlocks = InsertableDict(
[
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxImg2ImgSetTimestepsStep()),
("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
("prepare_rope_inputs", FluxRoPEInputsStep()),
]
)
class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [FluxInputStep, FluxImg2ImgSetTimestepsStep, FluxImg2ImgPrepareLatentsStep]
block_names = ["input", "set_timesteps", "prepare_latents"]
block_classes = FluxImg2ImgBeforeDenoiseBlocks.values()
block_names = FluxImg2ImgBeforeDenoiseBlocks.keys()
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step for img2img task.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `FluxInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `FluxImg2ImgSetTimestepsStep` is used to set the timesteps\n"
+ " - `FluxImg2ImgPrepareLatentsStep` is used to prepare the latents\n"
)
return "Before denoise step that prepare the inputs for the denoise step for img2img task."
# before_denoise: all task (text2img, img2img)
class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
block_classes = [FluxBeforeDenoiseStep, FluxImg2ImgBeforeDenoiseStep]
block_names = ["text2image", "img2img"]
block_trigger_inputs = [None, "image_latents"]
model_name = "flux-kontext"
block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep]
block_names = ["img2img", "text2image"]
block_trigger_inputs = ["image_latents", None]
@property
def description(self):
@@ -98,6 +163,44 @@ class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
)
# before_denoise: FluxKontext
FluxKontextBeforeDenoiseBlocks = InsertableDict(
[
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxSetTimestepsStep()),
("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
]
)
class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = FluxKontextBeforeDenoiseBlocks.values()
block_names = FluxKontextBeforeDenoiseBlocks.keys()
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step\n"
"for img2img/text2img task for Flux Kontext."
)
class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks):
block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep]
block_names = ["img2img", "text2image"]
block_trigger_inputs = ["image_latents", None]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step.\n"
+ "This is an auto pipeline block that works for text2image.\n"
+ " - `FluxBeforeDenoiseStep` (text2image) is used.\n"
+ " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n"
)
# denoise: text2image
class FluxAutoDenoiseStep(AutoPipelineBlocks):
block_classes = [FluxDenoiseStep]
@@ -113,7 +216,24 @@ class FluxAutoDenoiseStep(AutoPipelineBlocks):
)
# decode: all task (text2img, img2img, inpainting)
# denoise: Flux Kontext
class FluxKontextAutoDenoiseStep(AutoPipelineBlocks):
block_classes = [FluxKontextDenoiseStep]
block_names = ["denoise"]
block_trigger_inputs = [None]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoise the latents for Flux Kontext. "
"This is a auto pipeline block that works for text2image and img2img tasks."
" - `FluxDenoiseStep` (denoise) for text2image and img2img tasks."
)
# decode: all task (text2img, img2img)
class FluxAutoDecodeStep(AutoPipelineBlocks):
block_classes = [FluxDecodeStep]
block_names = ["non-inpaint"]
@@ -124,16 +244,143 @@ class FluxAutoDecodeStep(AutoPipelineBlocks):
return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`"
# text2image
class FluxAutoBlocks(SequentialPipelineBlocks):
block_classes = [
FluxTextEncoderStep,
FluxAutoVaeEncoderStep,
FluxAutoBeforeDenoiseStep,
FluxAutoDenoiseStep,
FluxAutoDecodeStep,
# inputs: text2image/img2img
FluxImg2ImgBlocks = InsertableDict(
[("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())]
)
class FluxImg2ImgInputStep(SequentialPipelineBlocks):
model_name = "flux"
block_classes = FluxImg2ImgBlocks.values()
block_names = FluxImg2ImgBlocks.keys()
@property
def description(self):
return "Input step that prepares the inputs for the img2img denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
" - update height/width based `image_latents`, patchify `image_latents`."
class FluxAutoInputStep(AutoPipelineBlocks):
block_classes = [FluxImg2ImgInputStep, FluxTextInputStep]
block_names = ["img2img", "text2image"]
block_trigger_inputs = ["image_latents", None]
@property
def description(self):
return (
"Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
" This is an auto pipeline block that works for text2image/img2img tasks.\n"
+ " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
+ " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n"
)
# inputs: Flux Kontext
FluxKontextBlocks = InsertableDict(
[
("set_resolution", FluxKontextSetResolutionStep()),
("text_inputs", FluxTextInputStep()),
("additional_inputs", FluxKontextInputsDynamicStep()),
]
block_names = ["text_encoder", "image_encoder", "before_denoise", "denoise", "decoder"]
)
class FluxKontextInputStep(SequentialPipelineBlocks):
model_name = "flux-kontext"
block_classes = FluxKontextBlocks.values()
block_names = FluxKontextBlocks.keys()
@property
def description(self):
return (
"Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n"
" - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
" - update height/width based `image_latents`, patchify `image_latents`."
)
class FluxKontextAutoInputStep(AutoPipelineBlocks):
block_classes = [FluxKontextInputStep, FluxTextInputStep]
block_names = ["img2img", "text2img"]
block_trigger_inputs = ["image_latents", None]
@property
def description(self):
return (
"Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
" This is an auto pipeline block that works for text2image/img2img tasks.\n"
+ " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n"
+ " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present."
)
class FluxCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "flux"
block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
block_names = ["input", "before_denoise", "denoise"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
+ "This step supports text-to-image and image-to-image tasks for Flux:\n"
+ " - for image-to-image generation, you need to provide `image_latents`\n"
+ " - for text-to-image generation, all you need to provide is prompt embeddings."
)
class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "flux-kontext"
block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextAutoDenoiseStep]
block_names = ["input", "before_denoise", "denoise"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `FluxKontextAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
+ "This step supports text-to-image and image-to-image tasks for Flux:\n"
+ " - for image-to-image generation, you need to provide `image_latents`\n"
+ " - for text-to-image generation, all you need to provide is prompt embeddings."
)
# Auto blocks (text2image and img2img)
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep()),
("image_encoder", FluxAutoVaeEncoderStep()),
("denoise", FluxCoreDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
AUTO_BLOCKS_KONTEXT = InsertableDict(
[
("text_encoder", FluxTextEncoderStep()),
("image_encoder", FluxKontextAutoVaeEncoderStep()),
("denoise", FluxKontextCoreDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
class FluxAutoBlocks(SequentialPipelineBlocks):
model_name = "flux"
block_classes = AUTO_BLOCKS.values()
block_names = AUTO_BLOCKS.keys()
@property
def description(self):
@@ -144,38 +391,56 @@ class FluxAutoBlocks(SequentialPipelineBlocks):
)
class FluxKontextAutoBlocks(FluxAutoBlocks):
model_name = "flux-kontext"
block_classes = AUTO_BLOCKS_KONTEXT.values()
block_names = AUTO_BLOCKS_KONTEXT.keys()
TEXT2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep),
("input", FluxInputStep),
("prepare_latents", FluxPrepareLatentsStep),
("set_timesteps", FluxSetTimestepsStep),
("denoise", FluxDenoiseStep),
("decode", FluxDecodeStep),
("text_encoder", FluxTextEncoderStep()),
("input", FluxTextInputStep()),
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxSetTimestepsStep()),
("prepare_rope_inputs", FluxRoPEInputsStep()),
("denoise", FluxDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
IMAGE2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep),
("image_encoder", FluxVaeEncoderStep),
("input", FluxInputStep),
("set_timesteps", FluxImg2ImgSetTimestepsStep),
("prepare_latents", FluxImg2ImgPrepareLatentsStep),
("denoise", FluxDenoiseStep),
("decode", FluxDecodeStep),
("text_encoder", FluxTextEncoderStep()),
("vae_encoder", FluxVaeEncoderDynamicStep()),
("input", FluxImg2ImgInputStep()),
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxImg2ImgSetTimestepsStep()),
("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
("prepare_rope_inputs", FluxRoPEInputsStep()),
("denoise", FluxDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
AUTO_BLOCKS = InsertableDict(
FLUX_KONTEXT_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep),
("image_encoder", FluxAutoVaeEncoderStep),
("before_denoise", FluxAutoBeforeDenoiseStep),
("denoise", FluxAutoDenoiseStep),
("decode", FluxAutoDecodeStep),
("text_encoder", FluxTextEncoderStep()),
("vae_encoder", FluxVaeEncoderDynamicStep(sample_mode="argmax")),
("input", FluxKontextInputStep()),
("prepare_latents", FluxPrepareLatentsStep()),
("set_timesteps", FluxSetTimestepsStep()),
("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
("denoise", FluxKontextDenoiseStep()),
("decode", FluxDecodeStep()),
]
)
ALL_BLOCKS = {"text2image": TEXT2IMAGE_BLOCKS, "img2img": IMAGE2IMAGE_BLOCKS, "auto": AUTO_BLOCKS}
ALL_BLOCKS = {
"text2image": TEXT2IMAGE_BLOCKS,
"img2img": IMAGE2IMAGE_BLOCKS,
"auto": AUTO_BLOCKS,
"auto_kontext": AUTO_BLOCKS_KONTEXT,
"kontext": FLUX_KONTEXT_BLOCKS,
}
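
A small usage sketch, assumed to run in this module's namespace: a task name selects one of the presets above, an ordered `InsertableDict` of instantiated blocks whose order defines the execution order.

preset = ALL_BLOCKS["kontext"]   # ordered block preset defined above
print(list(preset.keys()))       # ['text_encoder', 'vae_encoder', 'input', 'prepare_latents', ...]
# Container classes in this file (e.g. FluxAutoBlocks) consume such presets via
# `.values()` / `.keys()` to define `block_classes` and `block_names`.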

View File

@@ -55,3 +55,13 @@ class FluxModularPipeline(ModularPipeline, FluxLoraLoaderMixin, TextualInversion
if getattr(self, "transformer", None):
num_channels_latents = self.transformer.config.in_channels // 4
return num_channels_latents
class FluxKontextModularPipeline(FluxModularPipeline):
"""
A ModularPipeline for Flux Kontext.
> [!WARNING]
> This is an experimental feature and is likely to change in the future.
"""
default_blocks_name = "FluxKontextAutoBlocks"

View File

@@ -57,8 +57,10 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
("stable-diffusion-xl", "StableDiffusionXLModularPipeline"),
("wan", "WanModularPipeline"),
("flux", "FluxModularPipeline"),
("flux-kontext", "FluxKontextModularPipeline"),
("qwenimage", "QwenImageModularPipeline"),
("qwenimage-edit", "QwenImageEditModularPipeline"),
("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"),
]
)
@@ -1628,7 +1630,8 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
blocks = ModularPipelineBlocks.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
)
except EnvironmentError:
except EnvironmentError as e:
logger.debug(f"EnvironmentError: {e}")
blocks = None
cache_dir = kwargs.pop("cache_dir", None)

View File

@@ -29,13 +29,20 @@ else:
"EDIT_AUTO_BLOCKS",
"EDIT_BLOCKS",
"EDIT_INPAINT_BLOCKS",
"EDIT_PLUS_AUTO_BLOCKS",
"EDIT_PLUS_BLOCKS",
"IMAGE2IMAGE_BLOCKS",
"INPAINT_BLOCKS",
"TEXT2IMAGE_BLOCKS",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditPlusAutoBlocks",
]
_import_structure["modular_pipeline"] = [
"QwenImageEditModularPipeline",
"QwenImageEditPlusModularPipeline",
"QwenImageModularPipeline",
]
_import_structure["modular_pipeline"] = ["QwenImageEditModularPipeline", "QwenImageModularPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
@@ -54,13 +61,20 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
EDIT_AUTO_BLOCKS,
EDIT_BLOCKS,
EDIT_INPAINT_BLOCKS,
EDIT_PLUS_AUTO_BLOCKS,
EDIT_PLUS_BLOCKS,
IMAGE2IMAGE_BLOCKS,
INPAINT_BLOCKS,
TEXT2IMAGE_BLOCKS,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditPlusAutoBlocks,
)
from .modular_pipeline import (
QwenImageEditModularPipeline,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
)
from .modular_pipeline import QwenImageEditModularPipeline, QwenImageModularPipeline
else:
import sys

View File

@@ -203,7 +203,6 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
block_state.latents = components.pachifier.pack_latents(block_state.latents)
self.set_block_state(state, block_state)
return components, state
@@ -571,7 +570,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
@property
def description(self) -> str:
return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be place after prepare_latents step"
return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents step"
@property
def inputs(self) -> List[InputParam]:

View File

@@ -128,6 +128,61 @@ def get_qwen_prompt_embeds_edit(
return prompt_embeds, encoder_attention_mask
def get_qwen_prompt_embeds_edit_plus(
text_encoder,
processor,
prompt: Union[str, List[str]] = None,
image: Optional[Union[torch.Tensor, List[PIL.Image.Image], PIL.Image.Image]] = None,
prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
img_template_encode: str = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>",
prompt_template_encode_start_idx: int = 64,
device: Optional[torch.device] = None,
):
prompt = [prompt] if isinstance(prompt, str) else prompt
if isinstance(image, list):
base_img_prompt = ""
for i, img in enumerate(image):
base_img_prompt += img_template_encode.format(i + 1)
elif image is not None:
base_img_prompt = img_template_encode.format(1)
else:
base_img_prompt = ""
template = prompt_template_encode
drop_idx = prompt_template_encode_start_idx
txt = [template.format(base_img_prompt + e) for e in prompt]
model_inputs = processor(
text=txt,
images=image,
padding=True,
return_tensors="pt",
).to(device)
outputs = text_encoder(
input_ids=model_inputs.input_ids,
attention_mask=model_inputs.attention_mask,
pixel_values=model_inputs.pixel_values,
image_grid_thw=model_inputs.image_grid_thw,
output_hidden_states=True,
)
hidden_states = outputs.hidden_states[-1]
split_hidden_states = _extract_masked_hidden(hidden_states, model_inputs.attention_mask)
split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
max_seq_len = max([e.size(0) for e in split_hidden_states])
prompt_embeds = torch.stack(
[torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
)
encoder_attention_mask = torch.stack(
[torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
)
prompt_embeds = prompt_embeds.to(device=device)
return prompt_embeds, encoder_attention_mask
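
A minimal sketch, with toy shapes, of the padding performed above: per-prompt hidden states of different lengths are right-padded with zeros to the longest sequence and stacked, together with a matching attention mask.

import torch

split_hidden_states = [torch.randn(5, 8), torch.randn(3, 8)]   # assumed toy (seq_len, dim) pairs
attn_mask_list = [torch.ones(e.size(0), dtype=torch.long) for e in split_hidden_states]
max_seq_len = max(e.size(0) for e in split_hidden_states)
prompt_embeds = torch.stack(
    [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
)
encoder_attention_mask = torch.stack(
    [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
)
assert prompt_embeds.shape == (2, 5, 8) and encoder_attention_mask.shape == (2, 5)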
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -266,6 +321,83 @@ class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
model_name = "qwenimage"
def __init__(
self,
input_name: str = "image",
output_name: str = "resized_image",
vae_image_output_name: str = "vae_image",
):
"""Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio.
This block resizes an input image or a list input images and exposes the resized result under configurable
input and output names. Use this when you need to wire the resize step to different image fields (e.g.,
"image", "control_image")
Args:
input_name (str, optional): Name of the image field to read from the
pipeline state. Defaults to "image".
output_name (str, optional): Name of the resized image field to write
back to the pipeline state. Defaults to "resized_image".
vae_image_output_name (str, optional): Name of the image field
to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus
processes the input image(s) differently for the VL and the VAE.
"""
if not isinstance(input_name, str) or not isinstance(output_name, str):
raise ValueError(
f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
)
self.condition_image_size = 384 * 384
self._image_input_name = input_name
self._resized_image_output_name = output_name
self._vae_image_output_name = vae_image_output_name
super().__init__()
@property
def intermediate_outputs(self) -> List[OutputParam]:
return super().intermediate_outputs + [
OutputParam(
name=self._vae_image_output_name,
type_hint=List[PIL.Image.Image],
description="The images to be processed which will be further used by the VAE encoder.",
),
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
images = getattr(block_state, self._image_input_name)
if not is_valid_image_imagelist(images):
raise ValueError(f"Images must be image or list of images but are {type(images)}")
if (
not isinstance(images, torch.Tensor)
and isinstance(images, PIL.Image.Image)
and not isinstance(images, list)
):
images = [images]
# TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s
condition_images = []
vae_images = []
for img in images:
image_width, image_height = img.size
condition_width, condition_height, _ = calculate_dimensions(
self.condition_image_size, image_width / image_height
)
condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width))
vae_images.append(img)
setattr(block_state, self._resized_image_output_name, condition_images)
setattr(block_state, self._vae_image_output_name, vae_images)
self.set_block_state(state, block_state)
return components, state
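
A hypothetical stand-in for the `calculate_dimensions(target_area, aspect_ratio)` helper used above (the real helper also returns a third value and its rounding/alignment rule may differ): it picks a size close to the target area while keeping the aspect ratio.

import math

def approx_dimensions(target_area, aspect_ratio, multiple_of=32):
    width = round(math.sqrt(target_area * aspect_ratio) / multiple_of) * multiple_of
    height = round(math.sqrt(target_area / aspect_ratio) / multiple_of) * multiple_of
    return width, height

print(approx_dimensions(384 * 384, 16 / 9))  # (512, 288) for a 16:9 input under these assumptions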
class QwenImageTextEncoderStep(ModularPipelineBlocks):
model_name = "qwenimage"
@@ -511,6 +643,61 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
model_name = "qwenimage"
@property
def expected_configs(self) -> List[ConfigSpec]:
return [
ConfigSpec(
name="prompt_template_encode",
default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
),
ConfigSpec(
name="img_template_encode",
default="Picture {}: <|vision_start|><|image_pad|><|vision_end|>",
),
ConfigSpec(name="prompt_template_encode_start_idx", default=64),
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
self.check_inputs(block_state.prompt, block_state.negative_prompt)
device = components._execution_device
block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds_edit_plus(
components.text_encoder,
components.processor,
prompt=block_state.prompt,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
device=device,
)
if components.requires_unconditional_embeds:
negative_prompt = block_state.negative_prompt or " "
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = (
get_qwen_prompt_embeds_edit_plus(
components.text_encoder,
components.processor,
prompt=negative_prompt,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
device=device,
)
)
self.set_block_state(state, block_state)
return components, state
class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
model_name = "qwenimage"
@@ -612,12 +799,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("resized_image"),
InputParam("image"),
InputParam("height"),
InputParam("width"),
]
return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
@@ -661,6 +843,47 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
return components, state
class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep):
model_name = "qwenimage-edit-plus"
vae_image_size = 1024 * 1024
@property
def description(self) -> str:
return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing."
@property
def inputs(self) -> List[InputParam]:
return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
if block_state.vae_image is None and block_state.image is None:
raise ValueError("`vae_image` and `image` cannot be None at the same time")
if block_state.vae_image is None:
image = block_state.image
self.check_inputs(
height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
block_state.processed_image = components.image_processor.preprocess(
image=image, height=height, width=width
)
else:
width, height = block_state.vae_image[0].size
image = block_state.vae_image
block_state.processed_image = components.image_processor.preprocess(
image=image, height=height, width=width
)
self.set_block_state(state, block_state)
return components, state
class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"
@@ -738,7 +961,6 @@ class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
dtype=dtype,
latent_channels=components.num_channels_latents,
)
setattr(block_state, self._image_latents_output_name, image_latents)
self.set_block_state(state, block_state)

View File

@@ -37,6 +37,9 @@ from .denoise import (
)
from .encoders import (
QwenImageControlNetVaeEncoderStep,
QwenImageEditPlusProcessImagesInputStep,
QwenImageEditPlusResizeDynamicStep,
QwenImageEditPlusTextEncoderStep,
QwenImageEditResizeDynamicStep,
QwenImageEditTextEncoderStep,
QwenImageInpaintProcessImagesInputStep,
@@ -872,7 +875,151 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
)
# 3. all block presets supported in QwenImage & QwenImage-Edit
#################### QwenImage Edit Plus #####################
# 3. QwenImage-Edit Plus
## 3.1 QwenImage-Edit Plus / edit
#### QwenImage-Edit Plus vl encoder: take both image and text prompts
QwenImageEditPlusVLEncoderBlocks = InsertableDict(
[
("resize", QwenImageEditPlusResizeDynamicStep()),
("encode", QwenImageEditPlusTextEncoderStep()),
]
)
class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = QwenImageEditPlusVLEncoderBlocks.values()
block_names = QwenImageEditPlusVLEncoderBlocks.keys()
@property
def description(self) -> str:
return "QwenImage-Edit Plus VL encoder step that encode the image an text prompts together."
#### QwenImage-Edit Plus vae encoder
QwenImageEditPlusVaeEncoderBlocks = InsertableDict(
[
("resize", QwenImageEditPlusResizeDynamicStep()), # edit plus has a different resize step
("preprocess", QwenImageEditPlusProcessImagesInputStep()), # vae_image -> processed_image
("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents
]
)
class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = QwenImageEditPlusVaeEncoderBlocks.values()
block_names = QwenImageEditPlusVaeEncoderBlocks.keys()
@property
def description(self) -> str:
return "Vae encoder step that encode the image inputs into their latent representations."
#### QwenImage Edit Plus presets
EDIT_PLUS_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditPlusVLEncoderStep()),
("vae_encoder", QwenImageEditPlusVaeEncoderStep()),
("input", QwenImageEditInputStep()),
("prepare_latents", QwenImagePrepareLatentsStep()),
("set_timesteps", QwenImageSetTimestepsStep()),
("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
("denoise", QwenImageEditDenoiseStep()),
("decode", QwenImageDecodeStep()),
]
)
# auto before_denoise step for edit tasks
class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [QwenImageEditBeforeDenoiseStep]
block_names = ["edit"]
block_trigger_inputs = ["image_latents"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n"
+ "This is an auto pipeline block that works for edit (img2img) task.\n"
+ " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n"
+ " - if `image_latents` is not provided, step will be skipped."
)
## 3.2 QwenImage-Edit Plus/auto encoders
class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [
QwenImageEditPlusVaeEncoderStep,
]
block_names = ["edit"]
block_trigger_inputs = ["image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations. \n"
" This is an auto pipeline block that works for edit task.\n"
+ " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n"
+ " - if `image` is not provided, step will be skipped."
)
## 3.3 QwenImage-Edit/auto blocks & presets
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = [
QwenImageEditAutoInputStep,
QwenImageEditPlusAutoBeforeDenoiseStep,
QwenImageEditAutoDenoiseStep,
]
block_names = ["input", "before_denoise", "denoise"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+ "This step support edit (img2img) workflow for QwenImage Edit Plus:\n"
+ " - When `image_latents` is provided, it will be used for edit (img2img) task.\n"
)
EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditPlusVLEncoderStep()),
("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()),
("denoise", QwenImageEditPlusCoreDenoiseStep()),
("decode", QwenImageAutoDecodeStep()),
]
)
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
model_name = "qwenimage-edit-plus"
block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
@property
def description(self):
return (
"Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n"
+ "- for edit (img2img) generation, you need to provide `image`\n"
)
# 3. all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus
ALL_BLOCKS = {
@@ -880,8 +1027,10 @@ ALL_BLOCKS = {
"img2img": IMAGE2IMAGE_BLOCKS,
"edit": EDIT_BLOCKS,
"edit_inpaint": EDIT_INPAINT_BLOCKS,
"edit_plus": EDIT_PLUS_BLOCKS,
"inpaint": INPAINT_BLOCKS,
"controlnet": CONTROLNET_BLOCKS,
"auto": AUTO_BLOCKS,
"edit_auto": EDIT_AUTO_BLOCKS,
"edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS,
}

View File

@@ -196,3 +196,13 @@ class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1
return requires_unconditional_embeds
class QwenImageEditPlusModularPipeline(QwenImageEditModularPipeline):
"""
A ModularPipeline for QwenImage-Edit Plus.
> [!WARNING]
> This is an experimental feature and is likely to change in the future.
"""
default_blocks_name = "QwenImageEditPlusAutoBlocks"

View File

@@ -17,7 +17,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import UNet2DConditionLoadersMixin

View File

@@ -95,6 +95,7 @@ from .qwenimage import (
QwenImageControlNetPipeline,
QwenImageEditInpaintPipeline,
QwenImageEditPipeline,
QwenImageEditPlusPipeline,
QwenImageImg2ImgPipeline,
QwenImageInpaintPipeline,
QwenImagePipeline,
@@ -186,6 +187,7 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
("flux-kontext", FluxKontextPipeline),
("qwenimage", QwenImageImg2ImgPipeline),
("qwenimage-edit", QwenImageEditPipeline),
("qwenimage-edit-plus", QwenImageEditPlusPipeline),
]
)

View File

@@ -14,7 +14,6 @@
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from transformers import BertTokenizer
from transformers.activations import QuickGELUActivation as QuickGELU

View File

@@ -18,7 +18,6 @@ from typing import Callable, List, Optional, Union
import numpy as np
import PIL.Image
import torch
import torch.utils.checkpoint
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from ....image_processor import VaeImageProcessor

View File

@@ -16,7 +16,6 @@ import inspect
from typing import Callable, List, Optional, Union
import torch
import torch.utils.checkpoint
from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer
from ....image_processor import VaeImageProcessor

View File

@@ -17,7 +17,6 @@ from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.utils.checkpoint
from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutput

View File

@@ -4,7 +4,6 @@ from typing import List, Optional, Tuple, Union
import numpy as np
import PIL.Image
import torch
import torch.utils.checkpoint
from ...models import UNet2DModel, VQModel
from ...schedulers import (

View File

@@ -121,6 +121,38 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
result = torch.lerp(latents, result, factor)
return result
def tone_map_latents(self, latents: torch.Tensor, compression: float) -> torch.Tensor:
"""
Applies a non-linear tone-mapping function to latent values to reduce their dynamic range in a perceptually
smooth way using a sigmoid-based compression.
This is useful for regularizing high-variance latents or for conditioning outputs during generation, especially
when controlling dynamic behavior with a `compression` factor.
Args:
latents : torch.Tensor
Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range.
compression : float
Compression strength in the range [0, 1].
- 0.0: No tone-mapping (identity transform)
- 1.0: Full compression effect
Returns:
torch.Tensor
The tone-mapped latent tensor of the same shape as input.
"""
# Remap [0-1] to [0-0.75] and apply sigmoid compression in one shot
scale_factor = compression * 0.75
abs_latents = torch.abs(latents)
# Sigmoid compression: sigmoid shifts large values toward 0.2, small values stay ~1.0
# When scale_factor=0, sigmoid term vanishes, when scale_factor=0.75, full effect
sigmoid_term = torch.sigmoid(4.0 * scale_factor * (abs_latents - 1.0))
scales = 1.0 - 0.8 * scale_factor * sigmoid_term
filtered = latents * scales
return filtered
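
A quick numeric sketch of the curve above (values are illustrative): at `compression=0` the transform is the identity, while at `compression=1` large magnitudes are compressed toward a smaller range.

import torch

def tone_map(latents, compression):
    scale_factor = compression * 0.75
    sigmoid_term = torch.sigmoid(4.0 * scale_factor * (latents.abs() - 1.0))
    return latents * (1.0 - 0.8 * scale_factor * sigmoid_term)

x = torch.tensor([0.1, 1.0, 2.0, 4.0])
print(tone_map(x, 0.0))  # unchanged: the sigmoid term is scaled by zero
print(tone_map(x, 1.0))  # large values are pulled toward a smaller range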
@staticmethod
# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents
def _normalize_latents(
@@ -196,7 +228,7 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
)
self.vae.disable_tiling()
def check_inputs(self, video, height, width, latents):
def check_inputs(self, video, height, width, latents, tone_map_compression_ratio):
if height % self.vae_spatial_compression_ratio != 0 or width % self.vae_spatial_compression_ratio != 0:
raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")
@@ -205,6 +237,9 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
if video is None and latents is None:
raise ValueError("One of `video` or `latents` has to be provided.")
if not (0 <= tone_map_compression_ratio <= 1):
raise ValueError("`tone_map_compression_ratio` must be in the range [0, 1]")
@torch.no_grad()
def __call__(
self,
@@ -215,6 +250,7 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
decode_timestep: Union[float, List[float]] = 0.0,
decode_noise_scale: Optional[Union[float, List[float]]] = None,
adain_factor: float = 0.0,
tone_map_compression_ratio: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
@@ -224,6 +260,7 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
height=height,
width=width,
latents=latents,
tone_map_compression_ratio=tone_map_compression_ratio,
)
if video is not None:
@@ -266,6 +303,9 @@ class LTXLatentUpsamplePipeline(DiffusionPipeline):
else:
latents = latents_upsampled
if tone_map_compression_ratio > 0.0:
latents = self.tone_map_latents(latents, tone_map_compression_ratio)
if output_type == "latent":
latents = self._normalize_latents(
latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor

View File

@@ -86,15 +86,14 @@ class MarigoldDepthOutput(BaseOutput):
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
Predicted depth maps with values in the range [0, 1]. The shape is `numimages × 1 × height × width` for
`torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
for `np.ndarray`.
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `numimages × 1 ×
height × width` for `torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
The shape is `numimages * numensemble × 4 × latentheight × latentwidth`.
"""
prediction: Union[np.ndarray, torch.Tensor]

View File

@@ -99,17 +99,17 @@ class MarigoldIntrinsicsOutput(BaseOutput):
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3
\times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width
\times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of
the intrinsic image decomposition.
Predicted image intrinsics with values in the range [0, 1]. The shape is `(numimages * numtargets) × 3 ×
height × width` for `torch.Tensor` or `(numimages * numtargets) × height × width × 3` for `np.ndarray`,
where `numtargets` corresponds to the number of predicted target modalities of the intrinsic image
decomposition.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages *
numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times
height \times width \times 3$ for `np.ndarray`.
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `(numimages *
numtargets) × 3 × height × width` for `torch.Tensor` or `(numimages * numtargets) × height × width × 3` for
`np.ndarray`.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $(numimages * numensemble) \times (numtargets * 4) \times latentheight \times latentwidth$.
The shape is `(numimages * numensemble) × (numtargets * 4) × latentheight × latentwidth`.
"""
prediction: Union[np.ndarray, torch.Tensor]

View File

@@ -81,15 +81,14 @@ class MarigoldNormalsOutput(BaseOutput):
Args:
prediction (`np.ndarray`, `torch.Tensor`):
Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times
width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
Predicted normals with values in the range [-1, 1]. The shape is `numimages × 3 × height × width` for
`torch.Tensor` or `numimages × height × width × 3` for `np.ndarray`.
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
for `np.ndarray`.
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `numimages × 1 ×
height × width` for `torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
latent (`None`, `torch.Tensor`):
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
The shape is `numimages * numensemble × 4 × latentheight × latentwidth`.
"""
prediction: Union[np.ndarray, torch.Tensor]

View File

@@ -838,6 +838,9 @@ def load_sub_model(
else:
loading_kwargs["low_cpu_mem_usage"] = False
if is_transformers_model and is_transformers_version(">=", "4.57.0"):
loading_kwargs.pop("offload_state_dict")
if (
quantization_config is not None
and isinstance(quantization_config, PipelineQuantizationConfig)

View File

@@ -18,7 +18,6 @@ from typing import Optional
import torch
import torch.nn as nn
import torch.utils.checkpoint
from ...configuration_utils import ConfigMixin, register_to_config
from ...models.modeling_utils import ModelMixin

Some files were not shown because too many files have changed in this diff.