Skip to content

Commit c668e13

Browse files
jjsjann123 and naoyam
authored
Upstream push ci fixes (#1965)
Cherry-picking upstream build failure patches from PR #84626. Changes include:
1. Added throw in stringify.
2. Split fused_reduction.cu, as its size exceeds the limit in MSVC.
3. Updated bzl build for runtime header.
4. Fixed a bug originally reported in #84626.
5. Meta internal build fix.
Co-authored-by: Naoya Maruyama <[email protected]>
1 parent c40202b commit c668e13

File tree

12 files changed

+1701
-1399
lines changed

12 files changed

+1701
-1399
lines changed

build_variables.bzl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ libtorch_nvfuser_runtime_sources = [
2626
"torch/csrc/jit/codegen/cuda/runtime/broadcast.cu",
2727
"torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu",
2828
"torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu",
29+
"torch/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu",
30+
"torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu",
2931
"torch/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu",
3032
"torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu",
3133
"torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu",

c10/util/hash.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ struct hash<std::pair<T1, T2>> {
309309
size_t operator()(const std::pair<T1, T2>& pair) const {
310310
std::tuple<T1, T2> tuple = std::make_tuple(pair.first, pair.second);
311311
return _hash_detail::simple_get_hash(tuple);
312-
};
312+
}
313313
};
314314

315315
template <typename T>

torch/csrc/jit/codegen/cuda/evaluator_common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22
#include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
3+
#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
34
#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
45
#include <torch/csrc/jit/codegen/cuda/fusion.h>
56
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>

torch/csrc/jit/codegen/cuda/executor_kernel_arg.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,15 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
307307
}
308308
}
309309

310+
KernelArgumentHolder& operator=(const KernelArgumentHolder& self) {
311+
device_index_ = self.getDeviceIndex();
312+
index_mode_ = self.getIndexMode();
313+
for (const auto& arg : self.arguments_) {
314+
push(arg.get());
315+
}
316+
return *this;
317+
}
318+
310319
// Push a tensor to the arguments
311320
void push(const at::Tensor& tensor);
312321

torch/csrc/jit/codegen/cuda/executor_utils.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#include <nvfuser_resources/broadcast.h>
2424
#include <nvfuser_resources/fp16_support.h>
2525
#include <nvfuser_resources/fused_reduction.h>
26+
#include <nvfuser_resources/fused_welford_helper.h>
27+
#include <nvfuser_resources/fused_welford_impl.h>
2628
#include <nvfuser_resources/grid_broadcast.h>
2729
#include <nvfuser_resources/grid_reduction.h>
2830
#include <nvfuser_resources/grid_sync.h>
@@ -101,7 +103,9 @@ std::string kernelPreamble() {
101103
ss << nvfuser_resources::warp_cu;
102104
ss << nvfuser_resources::tensorcore_cu;
103105
ss << nvfuser_resources::memory_cu;
106+
ss << nvfuser_resources::fused_welford_helper_cu;
104107
ss << nvfuser_resources::fused_reduction_cu;
108+
ss << nvfuser_resources::fused_welford_impl_cu;
105109
ss << nvfuser_resources::swizzle_cu;
106110

107111
// Random utilities
@@ -924,6 +928,7 @@ void initializeCudaContext() {
924928
namespace {
925929

926930
// Dump PTX or CUBIN to a file
931+
#if CUDA_VERSION >= 11010
927932
void dumpCompiledCodeToFile(
928933
const nvrtcProgram& program,
929934
int fusion_id,
@@ -946,6 +951,7 @@ void dumpCompiledCodeToFile(
946951
out.write(code.data(), size);
947952
out.close();
948953
}
954+
#endif
949955

950956
} // namespace
951957

@@ -1189,6 +1195,7 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
11891195
AT_CUDA_NVRTC_CHECK(getFunc(program, ptx.data()));
11901196
}
11911197

1198+
#if CUDA_VERSION >= 11010
11921199
if (isDebugDumpEnabled(DebugDumpOption::Ptx)) {
11931200
dumpCompiledCodeToFile(program, id, false);
11941201
}
@@ -1199,6 +1206,7 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
11991206
"CUBIN not available as the kernel was compiled only to PTX");
12001207
dumpCompiledCodeToFile(program, id, true);
12011208
}
1209+
#endif
12021210

12031211
NvrtcFunction compiled_kernel_;
12041212

torch/csrc/jit/codegen/cuda/fusion.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include <c10/macros/Export.h>
55
#include <c10/util/Exception.h>
66

7-
#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
87
#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
98
#include <torch/csrc/jit/codegen/cuda/ir_container.h>
109
#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
@@ -54,6 +53,7 @@ class WelfordResult;
5453

5554
class SegmentCandidateFinder;
5655
class SegmentedFusion;
56+
class KernelArgumentHolder;
5757

5858
//! Fusion Guard is our "context manager". It holds the active fusion and
5959
//! allows it to be accessed anywhere through FusionGuard::getCurFusion()

torch/csrc/jit/codegen/cuda/ir_internal_nodes.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -541,7 +541,7 @@ class TORCH_CUDA_CU_API WelfordTriplet {
541541

542542
private:
543543
//! Holds avg, var and N in this order
544-
std::array<Val*, 3> vals_ = {nullptr, nullptr, nullptr};
544+
std::array<Val*, 3> vals_ = {{nullptr, nullptr, nullptr}};
545545
};
546546

547547
//! Welford Scan operation.

torch/csrc/jit/codegen/cuda/nvfuser.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ list(APPEND NVFUSER_RUNTIME_FILES
1515
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu
1616
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu
1717
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_reduction.cu
18+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu
19+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu
1820
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu
1921
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu
2022
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu

0 commit comments

Comments
 (0)