Make WorkNCCL use CUDAEvent::query() rather than re-implement it (#49343)

lw · facebook-github-bot · commit 9234f5026dba · 2020-12-15T03:15:48.000-08:00
Summary: Pull Request resolved: #49343. at::cuda::CUDAEvent is "lazy": it only creates an event when it is first recorded, and until then the at::cuda::CUDAEvent is empty. at::cuda::CUDAEvent::query() takes this into account (an empty event is always ready), but WorkNCCL extracts the raw cudaEvent_t value from the at::cuda::CUDAEvent and calls cudaEventQuery on it manually, without checking whether the event is empty. This could cause a failure. It's unclear whether this is ever supposed to happen, but we're seeing that failure, and we want to sort it out in order to see if there's something "deeper" going on. ghstack-source-id: 118532806 Test Plan: Unit tests Reviewed By: SciPioneer Differential Revision: D25537844 fbshipit-source-id: 506319f4742e1c0a02aa75ecc01112ea3be42d8f
diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -308,11 +308,7 @@ bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecution() {
 bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const {
   for (size_t i = 0; i < devices_.size(); ++i) {
     // Checking the work's corresponding CUDA events' status
-    auto ret = cudaEventQuery((*cudaEvents_)[i]);
-    if (ret != cudaSuccess && ret != cudaErrorNotReady) {
-      AT_CUDA_CHECK(ret);
-    }
-    if (ret == cudaErrorNotReady) {
+    if (!(*cudaEvents_)[i].query()) {
       return false;
     }
   }

Original file line number	Diff line number	Diff line change
`@@ -308,11 +308,7 @@ bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecution() {`
`308`	`308`	`bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const {`
`309`	`309`	`for (size_t i = 0; i < devices_.size(); ++i) {`
`310`	`310`	`// Checking the work's corresponding CUDA events' status`
`311`		`- auto ret = cudaEventQuery((*cudaEvents_)[i]);`
`312`		`- if (ret != cudaSuccess && ret != cudaErrorNotReady) {`
`313`		`- AT_CUDA_CHECK(ret);`
`314`		`- }`
`315`		`- if (ret == cudaErrorNotReady) {`
	`311`	`+ if (!(*cudaEvents_)[i].query()) {`
`316`	`312`	`return false;`
`317`	`313`	`}`
`318`	`314`	`}`