# Patch for .github/workflows/cnn_e2e.yml (two hunks, both touching the `test` job).
#
# Hunk 1 (-147,7 +147,7): renames the job from "Test suite" to "Test ResNet50".
# Hunk 2 (-155,68 +155,63): drops the placeholder "just a test" step, which only
#   echoed TEST_CONTAINER_NAME, TEST_WITH_TORCH_IMG_TAG and the build-oneflow
#   wheel path, and un-comments the real steps in this order: Fix permissions,
#   Checkout, Remove container, Enable Pytorch container, Start container,
#   Install OneFlow, Test container, Upload log. The extra `docker run` options
#   listed after `sleep 3600` stay commented out.
#
# NOTE(review): the line breaks of this patch look collapsed (many diff lines
#   share one physical line); re-export it from git before trying to apply it.
# NOTE(review): "Upload log" has no `if:` condition, so it presumably will not
#   run when "Test container" fails -- confirm whether `if: always()` is wanted.
# NOTE(review): "Start container" passes `env.ONEFLOW_WHEEL_PATH`, while
#   "Install OneFlow" reads `needs.build-oneflow.outputs.ONEFLOW_WHEEL_PATH`;
#   no definition of that env var is visible in these hunks -- confirm it is set.
# NOTE(review): `${GITHUB_SHA::7}` in `oss_dst_path` is shell substring syntax
#   inside a `with:` input; presumably it is only expanded if the upload_oss
#   action passes the value through a shell -- confirm.
# NOTE(review): TEST_CONTAINER_NAME references `matrix.entry`, but no matrix is
#   visible in these hunks -- confirm one is defined for this job.
diff --git a/.github/workflows/cnn_e2e.yml b/.github/workflows/cnn_e2e.yml index d1afb6e15883a314be8167e9fa8fed5feeddf93d..1655bc8285fd1a7913f214e501a2875a94120e93 100644 --- a/.github/workflows/cnn_e2e.yml +++ b/.github/workflows/cnn_e2e.yml @@ -147,7 +147,7 @@ jobs: python3 ${{ env.ONEFLOW_SRC }}/tools/create_pip_index.py --dir_key ${oss_dir} -b oneflow-staging --index_key=${oss_branch_dir}/index.html --index_key=${oss_dir}/index.html --index_key=${{ needs.find-oss-wheel.outputs.ONEFLOW_WHEEL_PATH }}/index.html test: - name: Test suite + name: Test ResNet50 needs: [build-oneflow] if: always() runs-on: ['self-hosted', 'linux', 'x64', 'gpu-8-titan-v'] @@ -155,68 +155,63 @@ jobs: TEST_CONTAINER_NAME: "oneflow_benchmark-run-id-${{ github.run_id }}-${{ matrix.entry }}-test" TEST_WITH_TORCH_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.9.0:e7a497b41d8b7f1bce055b1f23d027f93b1557ae steps: - - name: just a test - run: | - echo ${TEST_CONTAINER_NAME} - echo ${TEST_WITH_TORCH_IMG_TAG} - echo ${{ needs.build-oneflow.outputs.ONEFLOW_WHEEL_PATH }} - # - name: Fix permissions - # run: | - # set -x - # docker run --rm -v $PWD:/p -w /p busybox chown -R $(id -u):$(id -g) . 
- # - name: Checkout Oneflow-Inc/OneFlow-Benchmark - # uses: actions/checkout@v2 - # - name: Remove container - # timeout-minutes: 45 - # run: | - # docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true - # - name: Enable Pytorch container - # run: | - # echo "TEST_IMG_TAG=${TEST_WITH_TORCH_IMG_TAG}" >> $GITHUB_ENV - # - name: Start container - # run: | - # docker pull ${{ env.TEST_IMG_TAG }} - # docker run -d --rm --privileged --network host --shm-size=8g \ - # --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ - # --runtime=nvidia \ - # -v /DATA/disk1:/dataset:ro \ - # -e ONEFLOW_WHEEL_PATH=${{ env.ONEFLOW_WHEEL_PATH }} \ - # -v $PWD:$PWD \ - # -w $PWD \ - # -e E2E_NUM_EPOCHS=${{ github.event.inputs.num_epochs }} \ - # -e E2E_GPU_NUM_PER_NODE=${{ github.event.inputs.gpu_num_per_node }} \ - # -e E2E_NODE_NUM=1 \ - # -e E2E_BATCH_SIZE=32 \ - # -e E2E_LEARNING_RATE=1.536 \ - # -e E2E_SRC_ROOT=Classification/cnns \ - # -e E2E_DATA_ROOT=/dataset/ImageNet/ofrecord \ - # --name ${TEST_CONTAINER_NAME} \ - # ${{ env.TEST_IMG_TAG }} \ - # sleep 3600 - # # -e ONEFLOW_CI=1 \ - # # -v /model_zoo:/model_zoo:ro \ - # # -v $HOME/test-container-cache/dot-local:/root/.local \ - # # -v $HOME/test-container-cache/dot-cache:/root/.cache \ - # # -e ONEFLOW_BIN_PATH=${ONEFLOW_BIN_PATH} \ - # # -v ${ONEFLOW_WHEEL_PATH}:${ONEFLOW_WHEEL_PATH}:ro \ - # # -v ${ONEFLOW_BIN_PATH}:${ONEFLOW_BIN_PATH}:ro \ - # # -v ${ONEFLOW_TEST_CACHE_DIR}:${ONEFLOW_TEST_CACHE_DIR} \ - # # -e ONEFLOW_TEST_CACHE_DIR=${ONEFLOW_TEST_CACHE_DIR} \ - # - name: Install OneFlow - # run: | - # docker exec ${TEST_CONTAINER_NAME} python3 --version - # docker exec ${TEST_CONTAINER_NAME} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple - # docker exec ${TEST_CONTAINER_NAME} python3 -m pip install --find-links=http://oneflow-staging.oss-cn-beijing.aliyuncs.com/${{ needs.build-oneflow.outputs.ONEFLOW_WHEEL_PATH }} oneflow - # - name: Test container - # run: | - # docker exec ${{ 
env.TEST_CONTAINER_NAME }} bash ci/test/resnet50_e2e.sh - # - name: Upload log - # uses: ./.github/actions/upload_oss - # with: - # src_path: log - # oss_dst_path: oss://oneflow-log/OneFlow-Benchmark/${{ github.ref }}.${GITHUB_SHA::7}/oneflow/${{ github.event.inputs.of_branch_or_commit }}/${{github.run_id}}/log - # oss_access_key_id: ${{ secrets.OSS_ACCESS_KEY_ID }} - # oss_access_key_secret: ${{ secrets.OSS_ACCESS_KEY_SECRET }} - # upload_core: false + - name: Fix permissions + run: | + set -x + docker run --rm -v $PWD:/p -w /p busybox chown -R $(id -u):$(id -g) . + - name: Checkout Oneflow-Inc/OneFlow-Benchmark + uses: actions/checkout@v2 + - name: Remove container + timeout-minutes: 45 + run: | + docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true + - name: Enable Pytorch container + run: | + echo "TEST_IMG_TAG=${TEST_WITH_TORCH_IMG_TAG}" >> $GITHUB_ENV + - name: Start container + run: | + docker pull ${{ env.TEST_IMG_TAG }} + docker run -d --rm --privileged --network host --shm-size=8g \ + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ + --runtime=nvidia \ + -v /DATA/disk1:/dataset:ro \ + -e ONEFLOW_WHEEL_PATH=${{ env.ONEFLOW_WHEEL_PATH }} \ + -v $PWD:$PWD \ + -w $PWD \ + -e E2E_NUM_EPOCHS=${{ github.event.inputs.num_epochs }} \ + -e E2E_GPU_NUM_PER_NODE=${{ github.event.inputs.gpu_num_per_node }} \ + -e E2E_NODE_NUM=1 \ + -e E2E_BATCH_SIZE=32 \ + -e E2E_LEARNING_RATE=1.536 \ + -e E2E_SRC_ROOT=Classification/cnns \ + -e E2E_DATA_ROOT=/dataset/ImageNet/ofrecord \ + --name ${TEST_CONTAINER_NAME} \ + ${{ env.TEST_IMG_TAG }} \ + sleep 3600 + # -e ONEFLOW_CI=1 \ + # -v /model_zoo:/model_zoo:ro \ + # -v $HOME/test-container-cache/dot-local:/root/.local \ + # -v $HOME/test-container-cache/dot-cache:/root/.cache \ + # -e ONEFLOW_BIN_PATH=${ONEFLOW_BIN_PATH} \ + # -v ${ONEFLOW_WHEEL_PATH}:${ONEFLOW_WHEEL_PATH}:ro \ + # -v ${ONEFLOW_BIN_PATH}:${ONEFLOW_BIN_PATH}:ro \ + # -v ${ONEFLOW_TEST_CACHE_DIR}:${ONEFLOW_TEST_CACHE_DIR} \ + # -e 
ONEFLOW_TEST_CACHE_DIR=${ONEFLOW_TEST_CACHE_DIR} \ + - name: Install OneFlow + run: | + docker exec ${TEST_CONTAINER_NAME} python3 --version + docker exec ${TEST_CONTAINER_NAME} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + docker exec ${TEST_CONTAINER_NAME} python3 -m pip install --find-links=http://oneflow-staging.oss-cn-beijing.aliyuncs.com/${{ needs.build-oneflow.outputs.ONEFLOW_WHEEL_PATH }} oneflow + - name: Test container + run: | + docker exec ${{ env.TEST_CONTAINER_NAME }} bash ci/test/resnet50_e2e.sh + - name: Upload log + uses: ./.github/actions/upload_oss + with: + src_path: log + oss_dst_path: oss://oneflow-log/OneFlow-Benchmark/${{ github.ref }}.${GITHUB_SHA::7}/oneflow/${{ github.event.inputs.of_branch_or_commit }}/${{github.run_id}}/log + oss_access_key_id: ${{ secrets.OSS_ACCESS_KEY_ID }} + oss_access_key_secret: ${{ secrets.OSS_ACCESS_KEY_SECRET }} + upload_core: false