Commit cc4498a

colesbury authored and facebook-github-bot committed
Always enable P2P access for GPU copies (#21872)
Summary: PR #20685 incorrectly enabled P2P access only for non-contiguous copies. This can make cudaMemcpy slow for inter-GPU copies, especially on ROCm devices. I didn't notice a difference on CUDA 10, but ngimel says it's important for CUDA too.

Pull Request resolved: #21872
Differential Revision: D15863965
Pulled By: colesbury
fbshipit-source-id: 0a858f3c338fa2a5d05949d7f65fc05a70a9dfe1
1 parent 76a250d · commit cc4498a
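For readers unfamiliar with the mechanism: below is a minimal standalone sketch, using only the public CUDA runtime API, of what querying and enabling peer-to-peer access looks like before an inter-GPU copy. It is illustrative, not PyTorch's actual code path; the helper name enable_p2p and the two-device setup are assumptions, though THCState_getPeerToPeerAccess appears to wrap (and cache) these same runtime calls per device pair.

// Illustrative sketch, not PyTorch's helper: query and enable P2P access
// between two GPUs with the CUDA runtime API, then do an inter-GPU copy.
// Assumes at least two devices; error handling is abbreviated.
#include <cuda_runtime.h>
#include <cstdio>

static void enable_p2p(int accessing_dev, int peer_dev) {
  int can_access = 0;
  // Can accessing_dev directly read/write peer_dev's memory?
  cudaDeviceCanAccessPeer(&can_access, accessing_dev, peer_dev);
  if (!can_access) return;  // no direct path (e.g. different PCIe roots)
  cudaSetDevice(accessing_dev);
  // Flags must be 0; a repeat call returns cudaErrorPeerAccessAlreadyEnabled.
  cudaError_t err = cudaDeviceEnablePeerAccess(peer_dev, 0);
  if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
    fprintf(stderr, "enable failed: %s\n", cudaGetErrorString(err));
  }
}

int main() {
  const size_t nbytes = (1 << 20) * sizeof(float);
  float *src = nullptr, *dst = nullptr;
  cudaSetDevice(0);
  cudaMalloc(&src, nbytes);
  cudaSetDevice(1);
  cudaMalloc(&dst, nbytes);

  // Enable both directions, since either device may read the other's memory.
  enable_p2p(0, 1);
  enable_p2p(1, 0);

  // With peer access enabled this can be a direct GPU-to-GPU transfer;
  // without it the driver may stage the copy through host memory.
  cudaMemcpyPeer(dst, /*dstDevice=*/1, src, /*srcDevice=*/0, nbytes);
  cudaDeviceSynchronize();

  cudaFree(src);
  cudaFree(dst);
  return 0;
}

The point matching the summary: cudaMemcpy-family calls between GPUs can fall back to staging through host memory when peer access is not enabled, which is the slowdown this commit fixes.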

File tree: 1 file changed (+17, -7 lines)


aten/src/ATen/native/cuda/Copy.cu

Lines changed: 17 additions & 7 deletions
@@ -88,7 +88,7 @@ static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
   AT_CUDA_CHECK(cudaGetLastError());
 }
 
-static bool copy_requires_temporaries(TensorIterator& iter) {
+static bool copy_requires_temporaries(TensorIterator& iter, bool p2p_enabled) {
   Device dst_device = iter.device(0);
   Device src_device = iter.device(1);
 
@@ -104,19 +104,32 @@ static bool copy_requires_temporaries(TensorIterator& iter) {
     return false;
   } else if (dst_device.is_cuda() && src_device.is_cuda()) {
     // Copies between GPUs can use the copy kernel if P2P is supported
-    return !THCState_getPeerToPeerAccess(
-        globalContext().getTHCState(), src_device.index(), dst_device.index());
+    return !p2p_enabled;
   } else {
     // The remaining cases require temporaries. For example, this includes
     // non-contiguous copies between CPU and GPU.
     return true;
   }
 }
 
+static bool maybe_enable_p2p_access(Device dst_device, Device src_device) {
+  if (dst_device.is_cpu() || src_device.is_cpu()) {
+    return false;
+  }
+  return THCState_getPeerToPeerAccess(
+      globalContext().getTHCState(), src_device.index(), dst_device.index());
+}
+
 static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
   AT_ASSERT(iter.ntensors() == 2);
 
-  if (copy_requires_temporaries(iter)) {
+  Device dst_device = iter.device(0);
+  Device src_device = iter.device(1);
+
+  // Enable p2p access between devices. (No-op if it involves the CPU)
+  bool p2p_enabled = maybe_enable_p2p_access(dst_device, src_device);
+
+  if (copy_requires_temporaries(iter, p2p_enabled)) {
     // NB: this involves recursive calls to copy. Be careful that those copies
     // don't require temporaries or you will cause an infinite recursion!
     auto& dst = iter.tensor(0);
@@ -147,9 +160,6 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
     return;
   }
 
-  Device dst_device = iter.device(0);
-  Device src_device = iter.device(1);
-
   // Copy on GPU (or between GPUs)
   if (dst_device.is_cuda() && src_device.is_cuda()) {
     copy_device_to_device(iter, non_blocking);
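Why enabling P2P only for non-contiguous copies was wrong: the contiguous fast path in copy_device_to_device dispatches to a cudaMemcpy-family call rather than the element-wise copy kernel, and that call is what got slow without peer access. A rough sketch of that dispatch follows; dst_contiguous, src_contiguous, same_dtype, dst_ptr, src_ptr, nbytes, stream, and launch_copy_kernel are placeholder names, not the actual ATen identifiers.

// Rough sketch of copy_device_to_device's dispatch (placeholder names):
if (dst_contiguous && src_contiguous && same_dtype) {
  // Contiguous same-dtype copies go through cudaMemcpyAsync. This path also
  // benefits from P2P access, which PR #20685 had stopped enabling for it.
  AT_CUDA_CHECK(cudaMemcpyAsync(
      dst_ptr, src_ptr, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
  // Non-contiguous copies use the element-wise copy kernel, which reads the
  // peer GPU's memory directly and therefore requires P2P access; without
  // it, copy_requires_temporaries() returns true and the copy is staged.
  launch_copy_kernel(iter);
}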

0 commit comments