Commit cc4498a

colesbury authored and facebook-github-bot committed
Always enable P2P access for GPU copies (#21872)
Summary: PR #20685 incorrectly enabled P2P access only for non-contiguous copies. This can make cudaMemcpy slow for inter-GPU copies, especially on ROCm devices. I didn't notice a difference on CUDA 10, but ngimel says it's important for CUDA too.

Pull Request resolved: #21872
Differential Revision: D15863965
Pulled By: colesbury
fbshipit-source-id: 0a858f3c338fa2a5d05949d7f65fc05a70a9dfe1
1 parent 76a250d · commit cc4498a
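For readers unfamiliar with the mechanism: below is a minimal standalone sketch, using only the public CUDA runtime API, of what querying and enabling peer-to-peer access looks like before an inter-GPU copy. It is illustrative, not PyTorch's actual code path; the helper name enable_p2p and the two-device setup are assumptions, though THCState_getPeerToPeerAccess appears to wrap (and cache) these same runtime calls per device pair.

// Illustrative sketch, not PyTorch's helper: query and enable P2P access
// between two GPUs with the CUDA runtime API, then do an inter-GPU copy.
// Assumes at least two devices; error handling is abbreviated.
#include <cuda_runtime.h>
#include <cstdio>

static void enable_p2p(int accessing_dev, int peer_dev) {
  int can_access = 0;
  // Can accessing_dev directly read/write peer_dev's memory?
  cudaDeviceCanAccessPeer(&can_access, accessing_dev, peer_dev);
  if (!can_access) return;  // no direct path (e.g. different PCIe roots)
  cudaSetDevice(accessing_dev);
  // Flags must be 0; a repeat call returns cudaErrorPeerAccessAlreadyEnabled.
  cudaError_t err = cudaDeviceEnablePeerAccess(peer_dev, 0);
  if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
    fprintf(stderr, "enable failed: %s\n", cudaGetErrorString(err));
  }
}

int main() {
  const size_t nbytes = (1 << 20) * sizeof(float);
  float *src = nullptr, *dst = nullptr;
  cudaSetDevice(0);
  cudaMalloc(&src, nbytes);
  cudaSetDevice(1);
  cudaMalloc(&dst, nbytes);

  // Enable both directions, since either device may read the other's memory.
  enable_p2p(0, 1);
  enable_p2p(1, 0);

  // With peer access enabled this can be a direct GPU-to-GPU transfer;
  // without it the driver may stage the copy through host memory.
  cudaMemcpyPeer(dst, /*dstDevice=*/1, src, /*srcDevice=*/0, nbytes);
  cudaDeviceSynchronize();

  cudaFree(src);
  cudaFree(dst);
  return 0;
}

The point matching the summary: cudaMemcpy-family calls between GPUs can fall back to staging through host memory when peer access is not enabled, which is the slowdown this commit fixes.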

File tree: 1 file changed (+17, -7 lines)


aten/src/ATen/native/cuda/Copy.cu

Lines changed: 17 additions & 7 deletions
@@ -88,7 +88,7 @@ static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
   AT_CUDA_CHECK(cudaGetLastError());
 }
 
-static bool copy_requires_temporaries(TensorIterator& iter) {
+static bool copy_requires_temporaries(TensorIterator& iter, bool p2p_enabled) {
   Device dst_device = iter.device(0);
   Device src_device = iter.device(1);
 
@@ -104,19 +104,32 @@ static bool copy_requires_temporaries(TensorIterator& iter) {
     return false;
   } else if (dst_device.is_cuda() && src_device.is_cuda()) {
     // Copies between GPUs can use the copy kernel if P2P is supported
-    return !THCState_getPeerToPeerAccess(
-        globalContext().getTHCState(), src_device.index(), dst_device.index());
+    return !p2p_enabled;
   } else {
     // The remaining cases require temporaries. For example, this includes
     // non-contiguous copies between CPU and GPU.
     return true;
   }
 }
 
+static bool maybe_enable_p2p_access(Device dst_device, Device src_device) {
+  if (dst_device.is_cpu() || src_device.is_cpu()) {
+    return false;
+  }
+  return THCState_getPeerToPeerAccess(
+      globalContext().getTHCState(), src_device.index(), dst_device.index());
+}
+
 static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
   AT_ASSERT(iter.ntensors() == 2);
 
-  if (copy_requires_temporaries(iter)) {
+  Device dst_device = iter.device(0);
+  Device src_device = iter.device(1);
+
+  // Enable p2p access between devices. (No-op if it involves the CPU)
+  bool p2p_enabled = maybe_enable_p2p_access(dst_device, src_device);
+
+  if (copy_requires_temporaries(iter, p2p_enabled)) {
     // NB: this involves recursive calls to copy. Be careful that those copies
     // don't require temporaries or you will cause an infinite recursion!
     auto& dst = iter.tensor(0);
@@ -147,9 +160,6 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
     return;
   }
 
-  Device dst_device = iter.device(0);
-  Device src_device = iter.device(1);
-
   // Copy on GPU (or between GPUs)
   if (dst_device.is_cuda() && src_device.is_cuda()) {
     copy_device_to_device(iter, non_blocking);
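Why enabling P2P only for non-contiguous copies was wrong: the contiguous fast path in copy_device_to_device dispatches to a cudaMemcpy-family call rather than the element-wise copy kernel, and that call is what got slow without peer access. A rough sketch of that dispatch follows; dst_contiguous, src_contiguous, same_dtype, dst_ptr, src_ptr, nbytes, stream, and launch_copy_kernel are placeholder names, not the actual ATen identifiers.

// Rough sketch of copy_device_to_device's dispatch (placeholder names):
if (dst_contiguous && src_contiguous && same_dtype) {
  // Contiguous same-dtype copies go through cudaMemcpyAsync. This path also
  // benefits from P2P access, which PR #20685 had stopped enabling for it.
  AT_CUDA_CHECK(cudaMemcpyAsync(
      dst_ptr, src_ptr, nbytes, cudaMemcpyDeviceToDevice, stream));
} else {
  // Non-contiguous copies use the element-wise copy kernel, which reads the
  // peer GPU's memory directly and therefore requires P2P access; without
  // it, copy_requires_temporaries() returns true and the copy is staged.
  launch_copy_kernel(iter);
}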

0 commit comments