Commit 3568201

Update on "[reland] Make grad point to bucket buffer in DDP to save memory usage"
Reland of #41954. Adds an argument to the DDP API to enable or disable letting grads point to views. When it is disabled, behavior is the same as DDP today; when it is enabled, both variable.grad() and the grad in the dist autograd context point to the bucket buffer in DDP, saving memory. In that case grads are views of the bucket buffer tensors, so to keep this compatible with optimizer.zero_grad() we made changes in #41283. Also note that we cannot make variable.grad() point to the bucket buffer at construction time, because we want to keep grad undefined for unused parameters. Differential Revision: [D23588186](https://our.internmc.facebook.com/intern/diff/D23588186/) [ghstack-poisoned]
2 parents 6ff00f2 + f3cce29 commit 3568201
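
A minimal sketch of how the new toggle might be used from Python, assuming the DDP constructor argument is named gradient_as_bucket_view (the commit message does not name it, so treat the argument name as an assumption):

import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

def build_ddp_model(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    model = nn.Linear(1024, 1024).to(rank)
    # Assumed flag name: when True, each param.grad becomes a view into
    # DDP's flat bucket buffer instead of a separately allocated tensor,
    # so gradients are not stored twice.
    return DDP(model, device_ids=[rank], gradient_as_bucket_view=True)

Because grads become views, optimizer.zero_grad() can no longer detach and reallocate them and must zero them in place; that is the compatibility issue the #41283 change addresses.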

83 files changed, +3166 −978 lines changed

.circleci/cimodel/data/binary_build_data.py

Lines changed: 3 additions & 3 deletions

@@ -54,7 +54,7 @@ def get_processor_arch_name(gpu_version):
         )),
         # Skip CUDA-9.2 builds on Windows
         windows=(
-            [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92', "rocm3.7"]],
+            [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS],
             OrderedDict(
                 wheel=dimensions.STANDARD_PYTHON_VERSIONS,
                 conda=dimensions.STANDARD_PYTHON_VERSIONS,
@@ -142,11 +142,11 @@ def get_children(self):

         # XXX disabling conda rocm build since docker images are not there
         if self.find_prop("package_format") == 'conda':
-            gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions)
+            gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions)

         # XXX libtorch rocm build is temporarily disabled
         if self.find_prop("package_format") == 'libtorch':
-            gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions)
+            gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions)

         return [ArchConfigNode(self, v) for v in gpu_versions]

.circleci/cimodel/data/dimensions.py

Lines changed: 4 additions & 1 deletion

@@ -9,9 +9,12 @@

 ROCM_VERSIONS = [
     "3.7",
+    "3.8",
 ]

-GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ["rocm" + v for v in ROCM_VERSIONS]
+ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]
+
+GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS

 STANDARD_PYTHON_VERSIONS = [
     "3.6",

.circleci/cimodel/data/simple/docker_definitions.py

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@
     "pytorch-linux-xenial-py3.6-gcc7.2",
     "pytorch-linux-xenial-py3.6-gcc7",
     "pytorch-linux-bionic-rocm3.7-py3.6",
+    "pytorch-linux-bionic-rocm3.8-py3.6",
 ]

.circleci/config.yml

Lines changed: 159 additions & 0 deletions

@@ -2130,6 +2130,39 @@ workflows:
             only:
               - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
           docker_image: "pytorch/manylinux-rocm:3.7"
+      - binary_linux_build:
+          name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build
+          build_environment: "manywheel 3.6m rocm3.8 devtoolset7"
+          filters:
+            branches:
+              only:
+                - /.*/
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          docker_image: "pytorch/manylinux-rocm:3.8"
+      - binary_linux_build:
+          name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build
+          build_environment: "manywheel 3.7m rocm3.8 devtoolset7"
+          filters:
+            branches:
+              only:
+                - /.*/
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          docker_image: "pytorch/manylinux-rocm:3.8"
+      - binary_linux_build:
+          name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build
+          build_environment: "manywheel 3.8m rocm3.8 devtoolset7"
+          filters:
+            branches:
+              only:
+                - /.*/
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          docker_image: "pytorch/manylinux-rocm:3.8"
       - binary_linux_build:
           name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build
           build_environment: "conda 3.6 cpu devtoolset7"
@@ -3429,6 +3462,51 @@ workflows:
           docker_image: "pytorch/manylinux-rocm:3.7"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
+      - binary_linux_test:
+          name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test
+          build_environment: "manywheel 3.6m rocm3.8 devtoolset7"
+          filters:
+            branches:
+              only:
+                - /.*/
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          requires:
+            - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build
+          docker_image: "pytorch/manylinux-rocm:3.8"
+          use_cuda_docker_runtime: "1"
+          resource_class: gpu.medium
+      - binary_linux_test:
+          name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test
+          build_environment: "manywheel 3.7m rocm3.8 devtoolset7"
+          filters:
+            branches:
+              only:
+                - /.*/
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          requires:
+            - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build
+          docker_image: "pytorch/manylinux-rocm:3.8"
+          use_cuda_docker_runtime: "1"
+          resource_class: gpu.medium
+      - binary_linux_test:
+          name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test
+          build_environment: "manywheel 3.8m rocm3.8 devtoolset7"
+          filters:
+            branches:
+              only:
+                - /.*/
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          requires:
+            - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build
+          docker_image: "pytorch/manylinux-rocm:3.8"
+          use_cuda_docker_runtime: "1"
+          resource_class: gpu.medium
       - binary_linux_test:
           name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_test
           build_environment: "conda 3.6 cpu devtoolset7"
@@ -4932,6 +5010,48 @@ workflows:
               - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
           package_type: manywheel
          upload_subfolder: rocm3.7
+      - binary_upload:
+          name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload
+          context: org-member
+          requires:
+            - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test
+          filters:
+            branches:
+              only:
+                - nightly
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          package_type: manywheel
+          upload_subfolder: rocm3.8
+      - binary_upload:
+          name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_upload
+          context: org-member
+          requires:
+            - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test
+          filters:
+            branches:
+              only:
+                - nightly
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          package_type: manywheel
+          upload_subfolder: rocm3.8
+      - binary_upload:
+          name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_upload
+          context: org-member
+          requires:
+            - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test
+          filters:
+            branches:
+              only:
+                - nightly
+            tags:
+              only:
+                - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+          package_type: manywheel
+          upload_subfolder: rocm3.8
       - binary_upload:
           name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload
           context: org-member
@@ -6320,6 +6440,9 @@ workflows:
       - docker_build_job:
           name: "docker-pytorch-linux-bionic-rocm3.7-py3.6"
           image_name: "pytorch-linux-bionic-rocm3.7-py3.6"
+      - docker_build_job:
+          name: "docker-pytorch-linux-bionic-rocm3.8-py3.6"
+          image_name: "pytorch-linux-bionic-rocm3.8-py3.6"
       - pytorch_linux_build:
           name: pytorch_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -7455,6 +7578,42 @@ workflows:
           docker_image: "pytorch/manylinux-rocm:3.7"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
+      - smoke_linux_test:
+          name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly
+          build_environment: "manywheel 3.6m rocm3.8 devtoolset7"
+          requires:
+            - update_s3_htmls
+          filters:
+            branches:
+              only:
+                - postnightly
+          docker_image: "pytorch/manylinux-rocm:3.8"
+          use_cuda_docker_runtime: "1"
+          resource_class: gpu.medium
+      - smoke_linux_test:
+          name: smoke_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly
+          build_environment: "manywheel 3.7m rocm3.8 devtoolset7"
+          requires:
+            - update_s3_htmls
+          filters:
+            branches:
+              only:
+                - postnightly
+          docker_image: "pytorch/manylinux-rocm:3.8"
+          use_cuda_docker_runtime: "1"
+          resource_class: gpu.medium
+      - smoke_linux_test:
+          name: smoke_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly
+          build_environment: "manywheel 3.8m rocm3.8 devtoolset7"
+          requires:
+            - update_s3_htmls
+          filters:
+            branches:
+              only:
+                - postnightly
+          docker_image: "pytorch/manylinux-rocm:3.8"
+          use_cuda_docker_runtime: "1"
+          resource_class: gpu.medium
       - smoke_linux_test:
           name: smoke_linux_conda_3_6_cpu_devtoolset7_nightly
           build_environment: "conda 3.6 cpu devtoolset7"

.circleci/docker/build.sh

Lines changed: 7 additions & 0 deletions

@@ -262,6 +262,13 @@ case "$image" in
     VISION=yes
     ROCM_VERSION=3.7
     ;;
+  pytorch-linux-bionic-rocm3.8-py3.6)
+    ANACONDA_PYTHON_VERSION=3.6
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    ROCM_VERSION=3.8
+    ;;
   *)
     # Catch-all for builds that are not hardcoded.
     PROTOBUF=yes

.circleci/docker/common/install_base.sh

Lines changed: 1 addition & 1 deletion

@@ -118,7 +118,7 @@ esac

 # Install Valgrind separately since the apt-get version is too old.
 mkdir valgrind_build && cd valgrind_build
-VALGRIND_VERSION=3.15.0
+VALGRIND_VERSION=3.16.1
 if ! wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2
 then
   wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2

aten/src/ATen/Context.cpp

Lines changed: 23 additions & 0 deletions

@@ -230,4 +230,27 @@ Allocator* getCPUAllocator() {
   return getTHDefaultAllocator();
 }

+// override_allow_tf32_flag = true
+//    means the allow_tf32 flags are overridden and TF32 is force-disabled
+// override_allow_tf32_flag = false
+//    means the original allow_tf32 flags are followed
+thread_local bool override_allow_tf32_flag = false;
+
+NoTF32Guard::NoTF32Guard() {
+  if (!override_allow_tf32_flag) {
+    changed = true;
+    override_allow_tf32_flag = true;
+  }
+}
+
+NoTF32Guard::~NoTF32Guard() {
+  if (changed) {
+    override_allow_tf32_flag = false;
+  }
+}
+
+bool NoTF32Guard::should_disable_tf32() {
+  return override_allow_tf32_flag;
+}
+
 } // namespace at

aten/src/ATen/Context.h

Lines changed: 16 additions & 0 deletions

@@ -327,4 +327,20 @@ static inline void manual_seed(uint64_t seed) {
   }
 }

+// When the global flag `allow_tf32` is set to true, cuBLAS handles are
+// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH.
+// For some operators, such as addmv, TF32 offers no performance improvement
+// but causes precision loss. To help this case, this class implements
+// a RAII guard that can be used to quickly disable TF32 within its scope.
+//
+// Usage:
+//     NoTF32Guard disable_tf32;
+struct TORCH_API NoTF32Guard {
+  NoTF32Guard();
+  ~NoTF32Guard();
+  static bool should_disable_tf32();
+private:
+  bool changed = false;
+};
+
 } // namespace at
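
For orientation, the global flag this guard temporarily overrides is the one exposed through PyTorch's Python-side TF32 controls. A brief sketch (the Python attribute name is an assumption based on the public TF32 API, not part of this diff):

import torch

# Assumed public control for TF32 in cuBLAS matmuls.
torch.backends.cuda.matmul.allow_tf32 = True

a = torch.randn(1024, 1024, device="cuda")
x = torch.randn(1024, device="cuda")

y = a @ a  # matmul: may run in TF32 for speed, at reduced precision
# Per the CUDABlas.cpp change below, gemv-style calls are wrapped in
# NoTF32Guard internally, so a matrix-vector product stays in full FP32:
z = a @ x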

aten/src/ATen/WrapDimUtils.h

Lines changed: 11 additions & 3 deletions

@@ -30,14 +30,15 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector<std::vector<
   return maybe_wrap_dim(dim, tensor_sizes[0].size());
 }

-// wrap each of dims basing on dim_post_expr
-static inline void maybe_wrap_dims(std::vector<int64_t>& dims, int64_t dim_post_expr) {
+// wrap each dim in the dims array, taking dim_post_expr as the true number of dimensions
+static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_post_expr) {
   if (dim_post_expr <= 0) {
     dim_post_expr = 1; // this will make range [-1, 0]
   }
   int64_t min = -dim_post_expr;
   int64_t max = dim_post_expr - 1;
-  for (auto& dim : dims) {
+  for (int64_t i = 0; i < ndims; ++i) {
+    auto& dim = dims[i];
     if (dim < min || dim > max) {
       TORCH_CHECK_INDEX(false,
         "Dimension out of range (expected to be in range of [",
@@ -47,6 +48,13 @@ static inline void maybe_wrap_dims(std::vector<int64_t>& dims, int64_t dim_post_
   }
 }

+// Wrap each dim in a contiguous container, taking dim_post_expr as the true number of dimensions
+// E.g. could also be std::array or c10::SmallVector
+template <typename Container>
+inline void maybe_wrap_dims(Container& dims, int64_t dim_post_expr) {
+  return maybe_wrap_dims_n(dims.data(), dims.size(), dim_post_expr);
+}
+
 // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
 // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors
 // to be "skipped" (both for wrap dimension behavior and dimension size checking).

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 19 additions & 13 deletions

@@ -407,19 +407,22 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
 #endif

 #if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 210)
-template <>
-void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
-  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
-  cublasOperation_t op = _cublasOpFromChar(trans);
-  _cublasAdjustLdLevel2(m, n, &lda);
-  GEMV_CHECK_ARGVALUES(c10::complex<float>);
-  TORCH_CUDABLAS_CHECK(
-      cublasCgemv(handle, op, m, n, reinterpret_cast<const cuComplex*>(&alpha), reinterpret_cast<const cuComplex*>(a),
-      lda, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<const cuComplex*>(&beta),
-      reinterpret_cast<cuComplex*>(y), incy));
-}
+template <>
+void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
+  // gemv is bandwidth bound and does not benefit from TF32, but the precision
+  // loss still happens on TF32, so we disable it here.
+  NoTF32Guard disable_tf32;
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
+  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
+  cublasOperation_t op = _cublasOpFromChar(trans);
+  _cublasAdjustLdLevel2(m, n, &lda);
+  GEMV_CHECK_ARGVALUES(c10::complex<float>);
+  TORCH_CUDABLAS_CHECK(
+      cublasCgemv(handle, op, m, n, reinterpret_cast<const cuComplex*>(&alpha), reinterpret_cast<const cuComplex*>(a),
+      lda, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<const cuComplex*>(&beta),
+      reinterpret_cast<cuComplex*>(y), incy));
+}
 #endif

 template <>
@@ -436,6 +439,9 @@ void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {

 template <>
 void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
+  // gemv is bandwidth bound and does not benefit from TF32, but the precision
+  // loss still happens on TF32, so we disable it here.
+  NoTF32Guard disable_tf32;
   // See Note [Writing Nondeterministic Operations]
   globalContext().alertCuBLASConfigNotDeterministic();
   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();