[c10d] differentiate timeout errors from nccl errors

shuqiangzhang · shuqiangzhang · commit 50ac546648ae · 2024-10-17T11:43:42.000-07:00
Summary:
It's important for c10d to distinguish between the different causes of
watchdog failures (e.g., timeouts vs. NCCL errors) so that users can
handle each error according to its type.
Test Plan:
Unit tests (test/distributed/test_c10d_nccl.py).
Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
@@ -2681,7 +2681,7 @@ def assert_fut_success(fut):
             work = process_group.allreduce(torch.rand(10).cuda(self.rank))
             work.wait()
             result = work.get_future_result().wait()
-            self.assertEqual(WorkResult(result), WorkResult.FAILURE)
+            self.assertEqual(WorkResult(result), WorkResult.COMM_ERROR)
 
         if prev_nccl_async_error_handling is not None:
             os.environ[
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -1859,27 +1859,60 @@ void ProcessGroupNCCL::watchdogHandler() {
       // aborted, So cannot check exception based on them. But watchdog needs to
       // finish the check for the works that have already been enqueued to
       // workMetaList_
+
+      // check NCCL errors first
       if (!terminateProcessGroup_.load()) {
         work.checkAndSetException();
       }
-      bool timedOut = work.checkTimeout();
-
-      // If work hits an exception (either an error or timeout)
       if (work.exception()) {
+        // log as soon as exception is detected
+        LOG(ERROR) << c10::str(
+            logPrefix(),
+            "NCCL error is detected by watchdog at work: ",
+            work.seq_,
+            ", last enqueued NCCL work: ",
+            pgStatus_->lastEnqueuedSeq,
+            ", last completed NCCL work: ",
+            pgStatus_->lastCompletedSeq,
+            ".");
         if (work.futureWorkResult_ && !work.futureWorkResult_->completed()) {
           work.futureWorkResult_->markCompleted(
-              at::IValue(static_cast<uint8_t>(WorkResult::FAILURE)));
+              at::IValue(static_cast<uint8_t>(WorkResult::COMM_ERROR)));
         }
-        // log as soon as exception is detected
+      } else if (work.checkTimeout()) {
         LOG(ERROR) << c10::str(
             logPrefix(),
-            "Exception (either an error or timeout) detected by watchdog at work: ",
+            "Work timeout is detected by watchdog at work: ",
             work.seq_,
             ", last enqueued NCCL work: ",
             pgStatus_->lastEnqueuedSeq,
             ", last completed NCCL work: ",
             pgStatus_->lastCompletedSeq,
             ".");
+        if (work.futureWorkResult_ && !work.futureWorkResult_->completed()) {
+          work.futureWorkResult_->markCompleted(
+              at::IValue(static_cast<uint8_t>(WorkResult::TIMEOUT)));
+        }
+        // Report desync state in case of timeout
+        if (desyncDebug_) {
+          try {
+            collectiveDebugInfoMode_.store(true);
+            auto desyncMsg = getNCCLWatchdogDebugInfo();
+            LOG(ERROR) << logPrefix() << desyncMsg;
+          } catch (const std::exception& e) {
+            LOG(ERROR) << logPrefix()
+                       << "Failed to retrieve TORCH_NCCL_DESYNC_DEBUG report. "
+                       << " Please file an issue. Error: " << e.what();
+          } catch (...) {
+            LOG(ERROR)
+                << logPrefix()
+                << "Failed to rerieve TORCH_NCCL_DESYNC_DEBUG report with unknown error."
+                << " Please file an issue.";
+          }
+        }
+      }
+      // If work hits an exception (either an error or timeout)
+      if (work.exception()) {
         // try to notify other ranks via global TCPStore to dump the flight
         // recorder when a collective timeout or exception happens. Flight
         // recorder behavior is independent of desync Debug.
@@ -1919,36 +1952,6 @@ void ProcessGroupNCCL::watchdogHandler() {
           // rank
           abortComms();
         }
-
-        // Report desync state in case of timeout
-        if (timedOut) {
-          LOG(ERROR) << c10::str(
-              logPrefix(),
-              "Timeout at NCCL work: ",
-              work.seq_,
-              ", last enqueued NCCL work: ",
-              pgStatus_->lastEnqueuedSeq,
-              ", last completed NCCL work: ",
-              pgStatus_->lastCompletedSeq,
-              ".");
-          if (desyncDebug_) {
-            try {
-              collectiveDebugInfoMode_.store(true);
-              auto desyncMsg = getNCCLWatchdogDebugInfo();
-              LOG(ERROR) << logPrefix() << desyncMsg;
-            } catch (const std::exception& e) {
-              LOG(ERROR)
-                  << logPrefix()
-                  << "Failed to retrieve TORCH_NCCL_DESYNC_DEBUG report. "
-                  << " Please file an issue. Error: " << e.what();
-            } catch (...) {
-              LOG(ERROR)
-                  << logPrefix()
-                  << "Failed to rerieve TORCH_NCCL_DESYNC_DEBUG report with unknown error."
-                  << " Please file an issue.";
-            }
-          }
-        }
         // Throw exception
         work.handleException(asyncErrorHandling_);
       }
diff --git a/torch/csrc/distributed/c10d/Work.hpp b/torch/csrc/distributed/c10d/Work.hpp
@@ -37,7 +37,8 @@ enum class OpType : std::uint8_t {
 // TODO: support different types of failures/errors
 enum class WorkResult : std::uint8_t {
   SUCCESS = 0,
-  FAILURE = 1,
+  TIMEOUT = 1,
+  COMM_ERROR = 2,
   UNKNOWN = 100,
 };
 
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
@@ -2916,7 +2916,8 @@ Example::
 
   py::enum_<::c10d::WorkResult>(module, "WorkResult")
       .value("SUCCESS", ::c10d::WorkResult::SUCCESS)
-      .value("FAILURE", ::c10d::WorkResult::FAILURE)
+      .value("TIMEOUT", ::c10d::WorkResult::TIMEOUT)
+      .value("COMM_ERROR", ::c10d::WorkResult::COMM_ERROR)
       .value("UNKNOWN", ::c10d::WorkResult::UNKNOWN);
 
   py::class_<::c10d::WorkInfo, std::shared_ptr<::c10d::WorkInfo>>(