Add missing boundary checks

xinyazhang · xinyazhang · commit 9f8c51cf4be5 · 2024-11-14T07:07:53.000Z
This fixes an out-of-bounds (OOB) memory access triggered by the following code:
```
import torch
qk = torch.randn((1024,587), dtype=torch.float64, device='cuda')
smqk = torch.softmax(qk, dim=-1)
```
diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu
@@ -465,7 +465,7 @@ ilpReduce(index_t shift,
   if(shift > 0){
     data -= shift;
     size += shift;
-    if(threadIdx.x >= shift){
+    if (offset >= shift && offset < size) {
       threadVal = r(threadVal, data[offset]);
     }
     size -= blockDim.x;
@@ -515,7 +515,7 @@ WriteFpropResultsVectorized(
     output -= shift;
     size += shift;
 
-    if (threadIdx.x >= shift) {
+    if (offset >= shift && offset < size) {
       output[offset] = epilogue(input[offset]);
     }
     size -= blockDim.x;