diff --git a/.github/ISSUE_TEMPLATE/ci_failure_report.md b/.github/ISSUE_TEMPLATE/ci_failure_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..6bf4c7762319497b3292e825aa307048a628c21f
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/ci_failure_report.md
@@ -0,0 +1,10 @@
+---
+name: CI failure report
+about: Report a DeepSpeed CI failure
+title: "{{ env.GITHUB_WORKFLOW }} CI test failure"
+labels: ci-failure
+assignees: ''
+
+---
+
+The Nightly CI for {{ env.GITHUB_SERVER_URL }}/{{ env.GITHUB_REPOSITORY }}/actions/runs/{{ env.GITHUB_RUN_ID }} failed.
diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
index 125bf17de6deb92837519e5747455edd4791a867..110d661627bcdb070d4ca6acb57e03c4ba2af8e0 100644
--- a/.github/workflows/amd-mi200.yml
+++ b/.github/workflows/amd-mi200.yml
@@ -8,6 +8,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+# issues: write lets the failure-report step file an issue via GITHUB_TOKEN.
+permissions:
+  contents: read
+  issues: write
+
 jobs:
   amd-tests:
     # The type of runner that the job will run on
@@ -65,3 +70,13 @@ jobs:
           cd tests
           pytest $PYTEST_OPTS -n 4 --verbose unit/
           pytest $PYTEST_OPTS -m 'sequential' unit/
+
+      # Report only scheduled (nightly) failures; other triggers stay quiet.
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && github.event_name == 'schedule' }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml
index 33f248c4299fe2cc4dda48383109c046f43657f9..5a62d2dc5d09c325eb5756d2c0e54f31a37dddfd 100644
--- a/.github/workflows/nv-h100.yml
+++ b/.github/workflows/nv-h100.yml
@@ -8,6 +8,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+# issues: write lets the failure-report step file an issue via GITHUB_TOKEN.
+permissions:
+  contents: read
+  issues: write
+
 jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, h100]
@@ -49,3 +54,13 @@ jobs:
           cd tests
           python -m pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.0" --cuda_ver="12"
           python -m pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="12"
+
+      # Report only scheduled (nightly) failures; other triggers stay quiet.
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && github.event_name == 'schedule' }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
index e2128d9dd2bb7ea11e792ff5680a3f4dca0167ae..121e90221ae4ddfaeb78fe87a4d7a7438450282b 100644
--- a/.github/workflows/nv-nightly.yml
+++ b/.github/workflows/nv-nightly.yml
@@ -8,6 +8,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+# issues: write lets the failure-report step file an issue via GITHUB_TOKEN.
+permissions:
+  contents: read
+  issues: write
+
 jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, cu116, v100]
@@ -47,3 +52,13 @@ jobs:
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
           pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+
+      # Report only scheduled (nightly) failures; other triggers stay quiet.
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && github.event_name == 'schedule' }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml
index 0ce900cde4f90089a7790a33b3676e736f6e613b..fc4917bee99189e4c5761ded86136fb601db1580 100644
--- a/.github/workflows/nv-torch-nightly-v100.yml
+++ b/.github/workflows/nv-torch-nightly-v100.yml
@@ -8,6 +8,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+# issues: write lets the failure-report step file an issue via GITHUB_TOKEN.
+permissions:
+  contents: read
+  issues: write
+
 jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, cu116, v100]
@@ -48,3 +53,13 @@ jobs:
           cd tests
           pytest $PYTEST_OPTS --forked -n 4 unit/
           pytest $PYTEST_OPTS --forked -m 'sequential' unit/
+
+      # Report only scheduled (nightly) failures; other triggers stay quiet.
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && github.event_name == 'schedule' }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
diff --git a/.github/workflows/nv-torch19-p40.yml b/.github/workflows/nv-torch19-p40.yml
index b9ff936c78572b82a4856d9736f2513f68b2045f..8c5b0f159ae77802f0f8ef5bbad1f01b9bd23893 100644
--- a/.github/workflows/nv-torch19-p40.yml
+++ b/.github/workflows/nv-torch19-p40.yml
@@ -8,6 +8,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+# issues: write lets the failure-report step file an issue via GITHUB_TOKEN.
+permissions:
+  contents: read
+  issues: write
+
 jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, cu111, p40]
@@ -47,3 +52,13 @@ jobs:
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
           pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.9" --cuda_ver="11.1"
+
+      # Report only scheduled (nightly) failures; other triggers stay quiet.
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && github.event_name == 'schedule' }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
diff --git a/.github/workflows/nv-torch19-v100.yml b/.github/workflows/nv-torch19-v100.yml
index 61abe0f601c021e59499b925e932432e5a1be8d9..0dbe55f917334904ee51605f3f291bb9e2c010b7 100644
--- a/.github/workflows/nv-torch19-v100.yml
+++ b/.github/workflows/nv-torch19-v100.yml
@@ -8,6 +8,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+# issues: write lets the failure-report step file an issue via GITHUB_TOKEN.
+permissions:
+  contents: read
+  issues: write
+
 jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, cu111, v100]
@@ -48,3 +53,13 @@ jobs:
           cd tests
           pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.9" --cuda_ver="11"
           pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="1.9" --cuda_ver="11"
+
+      # Report only scheduled (nightly) failures; other triggers stay quiet.
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && github.event_name == 'schedule' }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true