Improve nightly tests (#7385)

* flesh out the nightly tests * address feedback.
2026-01-27 17:22:53 +03:00 · 2024-03-28 13:26:34 +05:30
parent 0cbc78f04c
commit 73f28708be
1 changed files with 304 additions and 76 deletions
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -12,110 +12,96 @@ env:
  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes
  RUN_NIGHTLY: yes
+  PIPELINE_USAGE_CUTOFF: 5000
  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

 jobs:
-  run_nightly_tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          - name: Nightly PyTorch CUDA tests on Ubuntu
-            framework: pytorch
-            runner: docker-gpu
-            image: diffusers/diffusers-pytorch-cuda
-            report: torch_cuda
-          - name: Nightly Flax TPU tests on Ubuntu
-            framework: flax
-            runner: docker-tpu
-            image: diffusers/diffusers-flax-tpu
-            report: flax_tpu
-          - name: Nightly ONNXRuntime CUDA tests on Ubuntu
-            framework: onnxruntime
-            runner: docker-gpu
-            image: diffusers/diffusers-onnxruntime-cuda
-            report: onnx_cuda
-
-    name: ${{ matrix.config.name }}
-
-    runs-on: ${{ matrix.config.runner }}
-
-    container:
-      image: ${{ matrix.config.image }}
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
-
-    defaults:
-      run:
-        shell: bash
-
+  setup_torch_cuda_pipeline_matrix:
+    name: Setup Torch Pipelines Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
-
-      - name: NVIDIA-SMI
-        if: ${{ matrix.config.runner == 'docker-gpu' }}
-        run: |
-          nvidia-smi
-
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
      - name: Install dependencies
        run: |
+          pip install -e .
+          pip install huggingface_hub
+      - name: Fetch Pipeline Matrix
+        id: fetch_pipeline_matrix
+        run: |
+          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
+          echo $matrix
+          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
+
+      - name: Pipeline Tests Artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: test-pipelines.json
+          path: reports
+
+  run_nightly_tests_for_torch_pipelines:
+    name: Torch Pipelines CUDA Nightly Tests
+    needs: setup_torch_cuda_pipeline_matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: nvidia-smi
+      
+      - name: Install dependencies
+        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
-          python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers
-          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
          python -m uv pip install pytest-reportlog
-
+      
      - name: Environment
        run: |
          python utils/print_env.py
-
-      - name: Run nightly PyTorch CUDA tests
-        if: ${{ matrix.config.framework == 'pytorch' }}
+      
+      - name: Nightly PyTorch CUDA checkpoint (pipelines) tests 
        env:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "not Flax and not Onnx" \
-            --make-reports=tests_${{ matrix.config.report }} \
-            --report-log=${{ matrix.config.report }}.log \
-            tests/ 
-
-      - name: Run nightly Flax TPU tests
-        if: ${{ matrix.config.framework == 'flax' }}
-        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m pytest -n 0 \
-            -s -v -k "Flax" \
-            --make-reports=tests_${{ matrix.config.report }} \
-            --report-log=${{ matrix.config.report }}.log \
-            tests/
-
-      - name: Run nightly ONNXRuntime CUDA tests
-        if: ${{ matrix.config.framework == 'onnxruntime' }}
-        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "Onnx" \
-            --make-reports=tests_${{ matrix.config.report }} \
-            --report-log=${{ matrix.config.report }}.log \ 
-            tests/
-
+            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
+            --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \ 
+            tests/pipelines/${{ matrix.module }}
+      
      - name: Failure short reports
        if: ${{ failure() }}
-        run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+        run: |
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
+          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
-          name: ${{ matrix.config.report }}_test_reports
+          name: pipeline_${{ matrix.module }}_test_reports
          path: reports
      
      - name: Generate Report and Notify Channel
@@ -124,6 +110,248 @@ jobs:
          pip install slack_sdk tabulate
          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

+  run_nightly_tests_for_other_torch_modules:
+    name: Torch Non-Pipelines CUDA Nightly Tests
+    runs-on: docker-gpu
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        module: [models, schedulers, others, examples]
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly PyTorch CUDA tests for non-pipeline modules
+      if: ${{ matrix.module != 'examples'}} 
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_torch_${{ matrix.module }}_cuda \
+          --report-log=tests_torch_${{ matrix.module }}_cuda.log \ 
+          tests/${{ matrix.module }}
+
+    - name: Run nightly example tests with Torch
+      if: ${{ matrix.module == 'examples' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v --make-reports=examples_torch_cuda \
+          --report-log=examples_torch_cuda.log \ 
+          examples/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt 
+        cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_${{ matrix.module }}_cuda_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_lora_nightly_tests:
+    name: Nightly LoRA Tests with PEFT and TORCH
+    runs-on: docker-gpu
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly LoRA tests with PEFT and Torch
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_torch_lora_cuda \
+          --report-log=tests_torch_lora_cuda.log \ 
+          tests/lora
+    
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_torch_lora_cuda_stats.txt 
+        cat reports/tests_torch_lora_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_lora_cuda_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+  
+  run_flax_tpu_tests:
+    name: Nightly Flax TPU Tests
+    runs-on: docker-tpu
+    container:
+      image: diffusers/diffusers-flax-tpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly Flax TPU tests
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 0 \
+          -s -v -k "Flax" \
+          --make-reports=tests_flax_tpu \
+          --report-log=tests_flax_tpu.log \ 
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_flax_tpu_stats.txt
+        cat reports/tests_flax_tpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: flax_tpu_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_nightly_onnx_tests:
+    name: Nightly ONNXRuntime CUDA tests on Ubuntu
+    runs-on: docker-gpu
+    container:
+      image: diffusers/diffusers-onnxruntime-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+    
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: nvidia-smi
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+    
+    - name: Run nightly ONNXRuntime CUDA tests
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_onnx_cuda \
+          --report-log=tests_onnx_cuda.log \ 
+          tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_onnx_cuda_stats.txt
+        cat reports/tests_onnx_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: ${{ matrix.config.report }}_test_reports
+        path: reports
+    
+    - name: Generate Report and Notify Channel
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+
  run_nightly_tests_apple_m1:
    name: Nightly PyTorch MPS tests on MacOS
    runs-on: [ self-hosted, apple-m1 ]