@@ -85,23 +85,33 @@ runs:
       with:
         docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

-    - name: Check if in a ARC runner
+    - name: Check if in a container runner
       shell: bash
-      id: check_arc_runner
-      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
+      id: check_container_runner
+      run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true; else echo false; fi)" >> "$GITHUB_OUTPUT"

     - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
       id: install-nvidia-driver
       uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
+      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
+
+    - name: Setup GPU_FLAG for docker run
+      id: setup-gpu-flag
+      run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+
+    - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+      id: setup-sscache-port-flag
+      run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

     - name: Lock NVIDIA A100 40GB Frequency
       shell: bash
       run: |
         sudo nvidia-smi -pm 1
         sudo nvidia-smi -ac 1215,1410
         nvidia-smi
-      if: contains(matrix.runner, 'a100')
+      if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

     - name: Start monitoring script
       id: monitor-script
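
The new container-runner steps in the hunk above route everything through the environment instead of assuming a bare-metal host. A minimal bash sketch of the same logic outside of Actions, with the cuda/nogpu matrix checks omitted; /.inarc, /.incontainer, and RUNNER_UID are assumptions about what the ARC / container runner image provides, and 4226 is sccache's default server port:

#!/usr/bin/env bash
# Sketch only: mirrors the container-runner detection and derived env vars.
set -euo pipefail

if [ -f /.inarc ] || [ -f /.incontainer ]; then
  IN_CONTAINER_RUNNER=true
else
  IN_CONTAINER_RUNNER=false
fi

if [ "${IN_CONTAINER_RUNNER}" = "true" ]; then
  # No setup-nvidia on container runners: pass the GPUs straight to docker run.
  GPU_FLAG="--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all"
  # Offset sccache's default port by the runner UID so runners sharing a host do not collide.
  SCCACHE_SERVER_PORT_DOCKER_FLAG="-e SCCACHE_SERVER_PORT=$(( ${RUNNER_UID:-1000} + 4226 ))"
fi

echo "IN_CONTAINER_RUNNER=${IN_CONTAINER_RUNNER}"
echo "GPU_FLAG=${GPU_FLAG:-}"
echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=${SCCACHE_SERVER_PORT_DOCKER_FLAG:-}"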
@@ -172,6 +182,7 @@ runs:
         NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
         TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
         SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+        SCCACHE_REGION: us-east-1
         SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
         SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
         DOCKER_IMAGE: ${{ inputs.docker-image }}
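
SCCACHE_REGION complements the bucket name already in this env block: sccache's S3 backend wants the bucket's region alongside its name and key prefix. A rough sketch of how sccache would pick these values up inside the container; the key prefix here is a stand-in for ${{ github.workflow }}:

# Sketch only: sccache reads its S3 cache backend from the environment.
export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
export SCCACHE_REGION=us-east-1           # region of the bucket above
export SCCACHE_S3_KEY_PREFIX=linux-test   # stand-in for ${{ github.workflow }}

sccache --stop-server 2>/dev/null || true   # restart so a fresh server reads the new env
sccache --start-server
sccache --show-stats                        # stats output reports the configured cache location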
@@ -181,6 +192,9 @@ runs:
         PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
         HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
+        SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+        IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
+
       shell: bash
       run: |
         set -x
@@ -199,6 +213,7 @@ runs:
         # shellcheck disable=SC2086,SC2090
         container_name=$(docker run \
           ${GPU_FLAG:-} \
+          ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
           -e BUILD_ENVIRONMENT \
           -e PR_NUMBER \
           -e GITHUB_ACTIONS \
@@ -227,14 +242,17 @@ runs:
           -e PR_LABELS \
           -e MAX_JOBS="$(nproc --ignore=2)" \
           -e SCCACHE_BUCKET \
+          -e SCCACHE_REGION \
           -e SCCACHE_S3_KEY_PREFIX \
           -e XLA_CUDA \
           -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
           -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
           -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
           -e SKIP_SCCACHE_INITIALIZATION=1 \
           -e HUGGING_FACE_HUB_TOKEN \
+          -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
           -e DASHBOARD_TAG \
+          -e IS_A100_RUNNER \
           --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
           --security-opt seccomp=unconfined \
           --cap-add=SYS_PTRACE \
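
The optional ${GPU_FLAG:-} and ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} expansions are deliberately left unquoted, which is what the SC2086/SC2090 suppression above covers: an unset flag contributes nothing, while a multi-word flag splits into separate docker arguments. A small illustration of that word splitting, using placeholder values:

# Sketch only: shows how the unquoted optional flags expand.
GPU_FLAG="--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all"
unset SCCACHE_SERVER_PORT_DOCKER_FLAG        # e.g. not running on a container runner

# shellcheck disable=SC2086
printf '[%s]\n' docker run ${GPU_FLAG:-} ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} "${DOCKER_IMAGE:-image}"
# Prints one bracketed argument per line:
# [docker] [run] [--gpus] [all] [-e] [NVIDIA_DRIVER_CAPABILITIES=all] [image]
# The unset flag contributes no arguments at all.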
@@ -305,7 +323,7 @@ runs:

     - name: Teardown Linux
       uses: pytorch/test-infra/.github/actions/teardown-linux@main
-      if: always()
+      if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

     # NB: We are currently having an intermittent GPU-related issue on G5 runners with
     #     A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does