[CI] Framework and hardware-specific CI tests (#997)

* [WIP][CI] Framework and hardware-specific docker images for CI tests * username * fix cpu * try out the image * push latest * update workspace * no root isolation for actions * add a flax image * flax and onnx matrix * fix runners * add reports * onnxruntime image * retry tpu * fix * fix * build onnxruntime * naming * onnxruntime-gpu image * onnxruntime-gpu image, slow tests * latest jax version * trigger flax * run flax tests in one thread * fast flax tests on cpu * fast flax tests on cpu * trigger slow tests * rebuild torch cuda * force cuda provider * fix onnxruntime tests * trigger slow * don't specify gpu for tpu * optimize * memory limit * fix flax tests * disable docker cache
2026-01-27 17:22:53 +03:00 · 2022-11-02 14:07:07 +01:00
parent b1ec61ee45
commit 4e59bcc680
16 changed files with 540 additions and 67 deletions
--- a/.github/workflows/build_docker_images.yml
+++ b/.github/workflows/build_docker_images.yml
@@ -0,0 +1,50 @@
+name: Build Docker images (nightly)
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 0 * * *"  # every day at 00:00 UTC (GitHub evaluates cron schedules in UTC)
+
+concurrency:
+  group: docker-image-builds
+  cancel-in-progress: false  # a newly triggered run does not cancel one that is already building
+
+env:
+  REGISTRY: diffusers  # Docker Hub namespace for the pushed tags; also reused as the login username below
+
+jobs:
+  build-docker-images:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      packages: write
+
+    strategy:
+      fail-fast: false  # a failing image build must not cancel the remaining matrix entries
+      matrix:
+        image-name:  # each entry is both the ./docker/<image-name> build context and the pushed repository name
+          - diffusers-pytorch-cpu
+          - diffusers-pytorch-cuda
+          - diffusers-flax-cpu
+          - diffusers-flax-tpu
+          - diffusers-onnxruntime-cpu
+          - diffusers-onnxruntime-cuda
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ env.REGISTRY }}  # login name is taken from the same value as the image namespace
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          no-cache: true  # rebuild every layer instead of reusing the Docker build cache
+          context: ./docker/${{ matrix.image-name }}
+          push: true
+          tags: ${{ env.REGISTRY }}/${{ matrix.image-name }}:latest  # only the moving "latest" tag is published
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -11,19 +11,45 @@ concurrency:

 env:
  DIFFUSERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
+  OMP_NUM_THREADS: 4
+  MKL_NUM_THREADS: 4
  PYTEST_TIMEOUT: 60
  MPS_TORCH_VERSION: 1.13.0

 jobs:
-  run_tests_cpu:
-    name: CPU tests on Ubuntu
-    runs-on: [ self-hosted, docker-gpu ]
+  run_fast_tests:
+    strategy:
+      fail-fast: false  # keep the other frameworks running when one matrix entry fails
+      matrix:
+        config:  # one entry per framework; fields feed name/runs-on/container.image and the per-step `if` conditions
+          - name: Fast PyTorch CPU tests on Ubuntu
+            framework: pytorch  # selects which "Run fast ... tests" step executes
+            runner: docker-cpu
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu  # suffix used for --make-reports, the failure report file and the artifact name
+          - name: Fast Flax CPU tests on Ubuntu
+            framework: flax
+            runner: docker-cpu
+            image: diffusers/diffusers-flax-cpu
+            report: flax_cpu
+          - name: Fast ONNXRuntime CPU tests on Ubuntu
+            framework: onnxruntime
+            runner: docker-cpu
+            image: diffusers/diffusers-onnxruntime-cpu
+            report: onnx_cpu
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
    container:
-      image: python:3.7
+      image: ${{ matrix.config.image }}
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

+    defaults:
+      run:
+        shell: bash
+
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
@@ -32,8 +58,6 @@ jobs:

    - name: Install dependencies
      run: |
-        python -m pip install --upgrade pip
-        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
        python -m pip install -e .[quality,test]
        python -m pip install git+https://github.com/huggingface/accelerate

@@ -41,25 +65,49 @@ jobs:
      run: |
        python utils/print_env.py

-    - name: Run all fast tests on CPU
+    - name: Run fast PyTorch CPU tests
+      if: ${{ matrix.config.framework == 'pytorch' }}
      env:
        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_cpu tests/
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run fast Flax CPU tests
+      if: ${{ matrix.config.framework == 'flax' }}  # the flax matrix entry uses the docker-cpu runner and the diffusers-flax-cpu image
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Flax" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run fast ONNXRuntime CPU tests
+      if: ${{ matrix.config.framework == 'onnxruntime' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/

    - name: Failure short reports
      if: ${{ failure() }}
-      run: cat reports/tests_torch_cpu_failures_short.txt
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
-        name: pr_torch_cpu_test_reports
+        name: pr_${{ matrix.config.report }}_test_reports
        path: reports

-  run_tests_apple_m1:
-    name: MPS tests on Apple M1
+  run_fast_tests_apple_m1:
+    name: Fast PyTorch MPS tests on MacOS
    runs-on: [ self-hosted, apple-m1 ]

    steps:
@@ -91,7 +139,7 @@ jobs:
      run: |
        ${CONDA_RUN} python utils/print_env.py

-    - name: Run all fast tests on MPS
+    - name: Run fast PyTorch tests on M1 (MPS)
      shell: arch -arch arm64 bash {0}
      env:
        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -14,12 +14,38 @@ env:
  RUN_SLOW: yes

 jobs:
-  run_tests_single_gpu:
-    name: Diffusers tests
-    runs-on: [ self-hosted, docker-gpu, single-gpu ]
+  run_slow_tests:
+    strategy:
+      fail-fast: false  # keep the other frameworks running when one matrix entry fails
+      matrix:
+        config:  # one entry per framework/hardware pair; fields feed name/runs-on/container and the per-step `if` conditions
+          - name: Slow PyTorch CUDA tests on Ubuntu
+            framework: pytorch  # selects which "Run slow ... tests" step executes
+            runner: docker-gpu
+            image: diffusers/diffusers-pytorch-cuda
+            report: torch_cuda  # suffix used for --make-reports, the failure report file and the artifact name
+          - name: Slow Flax TPU tests on Ubuntu
+            framework: flax
+            runner: docker-tpu
+            image: diffusers/diffusers-flax-tpu
+            report: flax_tpu
+          - name: Slow ONNXRuntime CUDA tests on Ubuntu
+            framework: onnxruntime
+            runner: docker-gpu
+            image: diffusers/diffusers-onnxruntime-cuda
+            report: onnx_cuda
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
    container:
-      image: nvcr.io/nvidia/pytorch:22.07-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}  # docker-tpu runner appends --privileged; any other runner appends --gpus 0
+
+    defaults:
+      run:
+        shell: bash

    steps:
    - name: Checkout diffusers
@@ -28,14 +54,12 @@ jobs:
        fetch-depth: 2

    - name: NVIDIA-SMI
+      if: ${{ matrix.config.runner == 'docker-gpu' }}  # skipped for the docker-tpu matrix entry
      run: |
        nvidia-smi

    - name: Install dependencies
      run: |
-        python -m pip install --upgrade pip
-        python -m pip uninstall -y torch torchvision torchtext
-        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu117
        python -m pip install -e .[quality,test]
        python -m pip install git+https://github.com/huggingface/accelerate

@@ -43,29 +67,55 @@ jobs:
      run: |
        python utils/print_env.py

-    - name: Run all (incl. slow) tests on GPU
+    - name: Run slow PyTorch CUDA tests
+      if: ${{ matrix.config.framework == 'pytorch' }}
      env:
        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_gpu tests/
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run slow Flax TPU tests
+      if: ${{ matrix.config.framework == 'flax' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |  # uses -n 0 (no pytest-xdist workers, tests run in the main process) unlike the -n 1 of the other slow steps
+        python -m pytest -n 0 \
+          -s -v -k "Flax" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run slow ONNXRuntime CUDA tests
+      if: ${{ matrix.config.framework == 'onnxruntime' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/

    - name: Failure short reports
      if: ${{ failure() }}
-      run: cat reports/tests_torch_gpu_failures_short.txt
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
-        name: torch_test_reports
+        name: ${{ matrix.config.report }}_test_reports
        path: reports

-  run_examples_single_gpu:
-    name: Examples tests
-    runs-on: [ self-hosted, docker-gpu, single-gpu ]
+  run_examples_tests:
+    name: Examples PyTorch CUDA tests on Ubuntu
+
+    runs-on: docker-gpu
+
    container:
-      image: nvcr.io/nvidia/pytorch:22.07-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache
+      image: diffusers/diffusers-pytorch-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

    steps:
    - name: Checkout diffusers
@@ -79,9 +129,6 @@ jobs:

    - name: Install dependencies
      run: |
-        python -m pip install --upgrade pip
-        python -m pip uninstall -y torch torchvision torchtext
-        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu117
        python -m pip install -e .[quality,test,training]
        python -m pip install git+https://github.com/huggingface/accelerate

@@ -93,11 +140,11 @@ jobs:
      env:
        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_gpu examples/
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/

    - name: Failure short reports
      if: ${{ failure() }}
-      run: cat reports/examples_torch_gpu_failures_short.txt
+      run: cat reports/examples_torch_cuda_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}