Add FP8 (MXFP8) GEMM path for SM120

LopezCastroRoberto · LopezCastroRoberto · commit 66e0d48acb35 · 2025-10-30T09:29:46.000-07:00
diff --git a/qutlass/csrc/gemm.cu b/qutlass/csrc/gemm.cu
@@ -378,6 +378,18 @@ void matmul_host_mxf8_bf16_tn(torch::Tensor& D,
                         ElementB, LayoutBTag, AlignmentB>::Gemm, cutlass::float_ue8m0_t
                     >(D, A, B, A_sf, B_sf, alpha, m, n, k, A.device());
     }
+#elif TARGET_CUDA_ARCH == 120
+    using ArchTag = cutlass::arch::Sm120;
+
+    using MmaTileShape       = Shape<_128,_128,_128>;
+    using ClusterShape       = Shape<_1,_1,_1>;
+    using PerSmTileShape_MNK = Shape<_128,_128,_128>;
+
+    runGemm<FpGemm<MmaTileShape, ClusterShape, PerSmTileShape_MNK,
+                    ArchTag,
+                    ElementA, LayoutATag, AlignmentA,
+                    ElementB, LayoutBTag, AlignmentB>::Gemm, cutlass::float_ue8m0_t
+                >(D, A, B, A_sf, B_sf, alpha, m, n, k, A.device());
 #else
     TORCH_CHECK(false, "Unsupported CUDA arch");
 #endif