Upstream push ci fixes (#1965)

jjsjann123 · naoyam · web-flow · commit c668e13aea0c · 2022-09-09T12:49:16.000-07:00
Cherry-picks the upstream build-failure patches from PR #84626. Changes include: 1. Added a throw in stringify. 2. Split fused_reduction.cu, as its size exceeds the limit in MSVC. 3. Updated the bzl build for the runtime header. 4. Fixed a bug originally reported in #84626. 5. Meta internal build fix. Co-authored-by: Naoya Maruyama <nmaruyama@nvidia.com>
diff --git a/build_variables.bzl b/build_variables.bzl
@@ -26,6 +26,8 @@ libtorch_nvfuser_runtime_sources = [
     "torch/csrc/jit/codegen/cuda/runtime/broadcast.cu",
     "torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu",
     "torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu",
+    "torch/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu",
+    "torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu",
     "torch/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu",
     "torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu",
     "torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu",
diff --git a/c10/util/hash.h b/c10/util/hash.h
@@ -309,7 +309,7 @@ struct hash<std::pair<T1, T2>> {
   size_t operator()(const std::pair<T1, T2>& pair) const {
     std::tuple<T1, T2> tuple = std::make_tuple(pair.first, pair.second);
     return _hash_detail::simple_get_hash(tuple);
-  };
+  }
 };
 
 template <typename T>
diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.h b/torch/csrc/jit/codegen/cuda/evaluator_common.h
@@ -1,5 +1,6 @@
 #pragma once
 #include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
+#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
 #include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h
@@ -307,6 +307,15 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
     }
   }
 
+  KernelArgumentHolder& operator=(const KernelArgumentHolder& self) {
+    device_index_ = self.getDeviceIndex();
+    index_mode_ = self.getIndexMode();
+    for (const auto& arg : self.arguments_) {
+      push(arg.get());
+    }
+    return *this;
+  }
+
   // Push a tensor to the arguments
   void push(const at::Tensor& tensor);
 
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -23,6 +23,8 @@
 #include <nvfuser_resources/broadcast.h>
 #include <nvfuser_resources/fp16_support.h>
 #include <nvfuser_resources/fused_reduction.h>
+#include <nvfuser_resources/fused_welford_helper.h>
+#include <nvfuser_resources/fused_welford_impl.h>
 #include <nvfuser_resources/grid_broadcast.h>
 #include <nvfuser_resources/grid_reduction.h>
 #include <nvfuser_resources/grid_sync.h>
@@ -101,7 +103,9 @@ std::string kernelPreamble() {
   ss << nvfuser_resources::warp_cu;
   ss << nvfuser_resources::tensorcore_cu;
   ss << nvfuser_resources::memory_cu;
+  ss << nvfuser_resources::fused_welford_helper_cu;
   ss << nvfuser_resources::fused_reduction_cu;
+  ss << nvfuser_resources::fused_welford_impl_cu;
   ss << nvfuser_resources::swizzle_cu;
 
   // Random utilities
@@ -924,6 +928,7 @@ void initializeCudaContext() {
 namespace {
 
 // Dump PTX or CUBIN to a file
+#if CUDA_VERSION >= 11010
 void dumpCompiledCodeToFile(
     const nvrtcProgram& program,
     int fusion_id,
@@ -946,6 +951,7 @@ void dumpCompiledCodeToFile(
   out.write(code.data(), size);
   out.close();
 }
+#endif
 
 } // namespace
 
@@ -1189,6 +1195,7 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
     AT_CUDA_NVRTC_CHECK(getFunc(program, ptx.data()));
   }
 
+#if CUDA_VERSION >= 11010
   if (isDebugDumpEnabled(DebugDumpOption::Ptx)) {
     dumpCompiledCodeToFile(program, id, false);
   }
@@ -1199,6 +1206,7 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
         "CUBIN not available as the kernel was compiled only to PTX");
     dumpCompiledCodeToFile(program, id, true);
   }
+#endif
 
   NvrtcFunction compiled_kernel_;
 
diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h
@@ -4,7 +4,6 @@
 #include <c10/macros/Export.h>
 #include <c10/util/Exception.h>
 
-#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
 #include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
 #include <torch/csrc/jit/codegen/cuda/ir_container.h>
 #include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
@@ -54,6 +53,7 @@ class WelfordResult;
 
 class SegmentCandidateFinder;
 class SegmentedFusion;
+class KernelArgumentHolder;
 
 //! Fusion Guard is our "context manager". It holds the actrive fusion and
 //! allows it to be accessed anywhere through FusionGuard::getCurFusion()
diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h
@@ -541,7 +541,7 @@ class TORCH_CUDA_CU_API WelfordTriplet {
 
  private:
   //! Holds avg, var and N in this order
-  std::array<Val*, 3> vals_ = {nullptr, nullptr, nullptr};
+  std::array<Val*, 3> vals_ = {{nullptr, nullptr, nullptr}};
 };
 
 //! Welford Scan operation.
diff --git a/torch/csrc/jit/codegen/cuda/nvfuser.cmake b/torch/csrc/jit/codegen/cuda/nvfuser.cmake
@@ -15,6 +15,8 @@ list(APPEND NVFUSER_RUNTIME_FILES
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_reduction.cu
+  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu
+  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu b/torch/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu b/torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu
diff --git a/torch/csrc/jit/codegen/cuda/tools/stringify_file.py b/torch/csrc/jit/codegen/cuda/tools/stringify_file.py

Original file line number	Diff line number	Diff line change
`@@ -307,6 +307,15 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {`
`307`	`307`	`}`
`308`	`308`	`}`
`309`	`309`
	`310`	`+ KernelArgumentHolder& operator=(const KernelArgumentHolder& self) {`
	`311`	`+ device_index_ = self.getDeviceIndex();`
	`312`	`+ index_mode_ = self.getIndexMode();`
	`313`	`+ for (const auto& arg : self.arguments_) {`
	`314`	`+ push(arg.get());`
	`315`	`+ }`
	`316`	`+ return *this;`
	`317`	`+ }`
	`318`	`+`
`310`	`319`	`// Push a tensor to the arguments`
`311`	`320`	`void push(const at::Tensor& tensor);`
`312`	`321`