File renamed without changes.
8 changes: 0 additions & 8 deletions KernelBench/test.py

This file was deleted.

24 changes: 7 additions & 17 deletions README.md
@@ -26,6 +26,8 @@ We construct KernelBench to have 4 Levels of categories:
- **Level 4 🤗**: Level Hugging Face
Optimize whole model architectures from HuggingFace

We are actively extending KernelBench to other DSLs beyond `cuda` as well.

## ⚖️ Evaluation
#### Methodology
To evaluate model-generated kernels, we need to check if they:
@@ -47,6 +49,7 @@ Some examples to illustrate this metric that filters based on speedups:

You can increase speedup threshold `p` to make the task more challenging.
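
As a rough illustration (a minimal sketch, not the benchmark's actual implementation), a speedup-thresholded metric of this kind can be computed as follows:

```
# Minimal sketch (illustrative only): fast_p is the fraction of problems whose
# generated kernel is both correct and more than p times faster than the
# PyTorch reference.
def fast_p(results, p=1.0):
    """results: list of (is_correct, speedup) pairs, one entry per problem."""
    if not results:
        return 0.0
    passing = sum(1 for correct, speedup in results if correct and speedup > p)
    return passing / len(results)

# Example: 3 of 4 kernels are correct, but only 2 beat the reference at p=1.0
print(fast_p([(True, 1.8), (True, 0.9), (True, 1.2), (False, 0.0)]))  # 0.5
```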


#### Compute Overall Benchmark Performance

We provide a script `scripts/greedy_analysis.py` to compute the overall benchmark performance.
@@ -95,6 +98,8 @@ python3 scripts/generate_and_eval_single_sample.py dataset_src="huggingface" lev
# add .verbose_logging for more visibility
```

We also support other GPU programming languages beyond `cuda`: simply specify, for example, `backend=triton`. The currently supported backends are `cuda`, `triton`, and `cute`.
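For example, a hypothetical invocation for a Level 1 problem might look like `python3 scripts/generate_and_eval_single_sample.py dataset_src="huggingface" level=1 problem_id=1 backend=triton` (the `level` and `problem_id` values here are placeholders, not prescribed values).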

### Run on all problems

```
@@ -120,25 +125,10 @@ We provide some reference baseline times on a variety of NVIDIA GPUs across generat
We have also released the test-time framework [Caesar](https://github.com/simonguozirui/caesar) that is used in the multi-turn / iterative refinement experiments in our paper. You can use or modify this framework for high-throughput test-time scaling (both sequential and parallel) targeting KernelBench problems.

## 🛣️ Upcoming Roadmap
- [ ] Triton Variant (To be merged)
- [ ] Easy to use CoLab Notebook Example
- [ ] Push button flow on Modal / Cloud Provider
- [ ] Integrate with more frameworks, such as [ThunderKittens](https://github.com/HazyResearch/ThunderKittens)
- [ ] Add backward pass
- [ ] Integrate with toolchains such as NCU
See Issues for the ongoing roadmap and directions.


Check out our [roadmap](https://github.com/ScalingIntelligence/KernelBench/issues/74) for what we plan to add as features. We welcome community contributions in these directions.

## 🔍 Known Usage
- [NVIDIA](https://developer.nvidia.com/blog/automating-gpu-kernel-generation-with-deepseek-r1-and-inference-time-scaling/) - Automating GPU Kernel Generation with DeepSeek-R1 and Inference Time Scaling
- [METR](https://metr.org/blog/2025-02-14-measuring-automated-kernel-engineering/) - Measuring Automated Kernel Engineering
- [Sakana AI](https://sakana.ai/ai-cuda-engineer/) - AI Cuda Engineer
- [Project Popcorn](https://www.youtube.com/watch?v=mdDVkBeFy9A) - Triton Support for KernelBench, Data Scaling + SFT'd Kernel LLM
- [Kevin](https://cognition.ai/blog/kevin-32b) - Kevin-32B: Multi-Turn RL for Writing CUDA Kernels
- [Simple Test-Time Search](https://scalingintelligence.stanford.edu/blogs/fastkernels/) - by @anneouyang

If you are using KernelBench, we love to hear more about it!
Since release, we have gotten a lot of interest from researchers, research labs, and companies that use KernelBench to explore this direction. We have documented [known usage](https://docs.google.com/document/d/e/2PACX-1vTjS-UMH1HB5n_PENq2k-3YRfXIXkqKIKeNC2zcWMyLPdl4Jrwvdk4dNDVSsM8ybKrCxZB7GJq1slZF/pub) of KernelBench and related efforts towards automated kernel generation. If you are using KernelBench, we'd love to hear more about it!

## 🪪 License
MIT. Check `LICENSE.md` for more details.
33 changes: 22 additions & 11 deletions requirements.txt
@@ -1,17 +1,28 @@
anthropic
# Frameworks
torch==2.5.0
# we shall upgrade torch for blackwell when it is stable
transformers
datasets
modal
numpy
openai

# DSLs
nvidia-cutlass-dsl

# helper
tqdm
packaging
pydra_config
torch==2.5.0
tqdm
datasets
transformers
google-generativeai
together
pytest
ninja
archon-ai

# Numerics
einops
dotenv
dotenv
numpy

# to deprecate with litellm
google-generativeai
together
openai
anthropic

18 changes: 14 additions & 4 deletions scripts/eval_from_generations.py
@@ -3,6 +3,8 @@
import os
import shutil
import time
from dataclasses import dataclass

from collections import defaultdict
from dataclasses import dataclass

@@ -12,15 +14,19 @@

from datasets import load_dataset
from pydra import Config, REQUIRED

# Import only what we need
from src import compile, eval, utils

from src.dataset import construct_kernelbench_dataset
from src.eval import (
build_compile_cache,
get_error_name,
check_metadata_serializable_all_types,
eval_kernel_against_ref,
KernelExecResult,
)

from src.utils import read_file, set_gpu_arch
from tqdm import tqdm

@@ -137,6 +143,8 @@ def __init__(self):
# number of GPUs to do batch evaluation
self.num_gpu_devices = 1

# Backend to use for kernel implementation (cuda, triton, or cute)
self.backend = "cuda"
# Number of samples per problem to evaluate for pass@k analysis
self.num_samples_per_problem = 1 # Default to 1 sample per problem

@@ -312,6 +320,7 @@ def evaluate_single_sample(
num_perf_trials=configs.num_perf_trials,
build_dir=build_dir,
device=device,
backend=configs.backend,
)
return eval_result
except Exception as e:
@@ -322,6 +331,7 @@
# NOTE: count this as compilation failure as it is not runnable code
metadata = {
"cuda_error": f"CUDA Error: {str(e)}",
"cuda_error_name": get_error_name(e),
"hardware": torch.cuda.get_device_name(device=device),
"device": str(device),
} # log this for debugging as this usually signifies illegal memory access
@@ -332,6 +342,7 @@
else:
metadata = {
"other_error": f"error: {str(e)}",
"other_error_name": get_error_name(e),
"hardware": torch.cuda.get_device_name(device=device),
"device": str(device),
} # for debugging
@@ -387,10 +398,9 @@ def cuda_single_eval_wrapper(curr_work: WorkArgs, configs: dict, dataset, run_di
pool.terminate()
pool.join()
raise
except mp.TimeoutError:
except mp.TimeoutError as e:
print(
f"[WARNING] Evaluation TIMED OUT for Problem ID: {curr_work.problem_id},"
f" Sample ID: {curr_work.sample_id}"
f"[WARNING] Evaluation TIMED OUT for Problem ID: {curr_work.problem_id}, Sample ID: {curr_work.sample_id}\nException: {e}"
)

print(
@@ -691,7 +701,7 @@ def add_to_eval_results_file(
os.makedirs(os.path.dirname(eval_file_path), exist_ok=True)

with open(eval_file_path, "w") as f:
json.dump(eval_results, f)
json.dump(eval_results, f, indent=4)
Collaborator comment: great check



def single_eval_example(
127 changes: 91 additions & 36 deletions scripts/generate_and_eval_single_sample.py
@@ -3,13 +3,21 @@
import os, sys
import torch
import json
import modal

from datasets import load_dataset

from src.dataset import construct_kernelbench_dataset
from src.eval import eval_kernel_against_ref
from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template
from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets
from src.prompt_constructor_multilang import get_prompt_for_backend
from src.utils import (
create_inference_server_from_presets,
extract_first_code,
query_server,
read_file,
set_gpu_arch,
)

"""
Generate and evaluate a single sample
@@ -20,15 +28,15 @@

torch.set_printoptions(precision=4, threshold=10)


class EvalConfig(Config):
def __init__(self):
self.dataset_src = REQUIRED # either huggingface or local

self.dataset_src = REQUIRED # either huggingface or local

# name of dataset name on Hugging Face
self.dataset_name = "ScalingIntelligence/KernelBench"


# Problem Specification
self.level = REQUIRED
# NOTE: this is the logical index (problem id the problem_name)\
@@ -56,6 +64,8 @@ def __init__(self):
self.log_generated_kernel = False
self.log_eval_result = False

self.backend = "cuda"

def verbose_logging(self):
self.log = True
self.log_prompt = True
@@ -86,24 +96,31 @@ def main(config: EvalConfig):

if config.log:
os.makedirs(config.logdir, exist_ok=True)

# Problem Checks
num_problems = len(curr_level_dataset)
print(f"Number of problems in Level {config.level}: {num_problems}")
print(f"Start Generation + Evaluation for Level {config.level} Problem {config.problem_id}")

assert config.problem_id <= num_problems, f"Problem ID {config.problem_id} out of range for Level {config.level}"
print(
f"Start Generation + Evaluation for Level {config.level} Problem {config.problem_id}"
)

assert (
config.problem_id <= num_problems
), f"Problem ID {config.problem_id} out of range for Level {config.level}"

# 1. Fetch Problem
if config.dataset_src == "huggingface":

curr_problem_row = curr_level_dataset.filter(lambda x: x["problem_id"] == config.problem_id)
curr_problem_row = curr_level_dataset.filter(
lambda x: x["problem_id"] == config.problem_id
)
ref_arch_src = curr_problem_row["code"][0]
problem_name = curr_problem_row["name"][0]

elif config.dataset_src == "local":
problem_idx_in_dataset = config.problem_id - 1 # due to dataset list being 0-indexed locally
problem_idx_in_dataset = (
config.problem_id - 1
Collaborator comment: @pythonomar22 this is something we will get rid of with your new benchmark data class so we don't have to deal with all these nasty off-by-one issues

) # due to dataset list being 0-indexed locally
ref_arch_path = curr_level_dataset[problem_idx_in_dataset]

problem_name = os.path.basename(ref_arch_path)
@@ -112,52 +129,90 @@

# Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py")
problem_number = int(problem_name.split("_")[0])
assert problem_number == config.problem_id, f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})"


assert (
problem_number == config.problem_id
), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})"

# 2. Generate Sample
# Create inference function with config parameters
# We provide some presets in utils but you can also pass in your own, see query_server for more details
inference_server = create_inference_server_from_presets(server_type=config.server_type,
model_name=config.model_name,
temperature=config.temperature,
max_tokens=config.max_tokens,
verbose=config.verbose,
time_generation=True)

inference_server = create_inference_server_from_presets(
server_type=config.server_type,
model_name=config.model_name,
temperature=config.temperature,
max_tokens=config.max_tokens,
verbose=config.verbose,
time_generation=True,
)

# Use appropriate prompt constructor based on backend
if config.backend == "cuda":
custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src)
Collaborator comment: @AffectionateCurry I see what you mean here now. We can refactor this later with a better prompt template!

elif config.backend in ["triton", "cute"]: # removed "tilelang"
custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend)
else:
raise ValueError(
f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'."
Collaborator comment: nice catch here, we shall update the README in the pre-GPU mode hackathon to list these as available options

)

custom_cuda_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src)
if config.log_prompt:
with open(os.path.join(config.logdir, f"prompt_level_{config.level}_problem_{config.problem_id}.txt"), "w") as f:
f.write(custom_cuda_prompt)
with open(
os.path.join(
config.logdir,
f"prompt_level_{config.level}_problem_{config.problem_id}.txt",
),
"w",
) as f:
f.write(custom_prompt)

# Query server with constructed prompt
custom_cuda = inference_server(custom_cuda_prompt)
custom_cuda = extract_first_code(custom_cuda, ["python", "cpp"])
# check LLM is able to generate custom CUDA code
assert custom_cuda is not None, "Custom CUDA code generation failed"

custom_kernel = inference_server(custom_prompt)
custom_kernel = extract_first_code(custom_kernel, ["python", "cpp"])

# check LLM is able to generate custom kernel code
assert (
custom_kernel is not None
), f"Custom {config.backend} kernel code generation failed"

# this should be optional
if config.log:
with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py"), "w") as f:
f.write(custom_cuda)
with open(
os.path.join(
config.logdir,
f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py",
),
"w",
) as f:
f.write(custom_kernel)

# 3. Evaluate Kernel
# NOTE: no need to wrap around process here as only a single sample
# see batch eval for examples of process isolation
kernel_exec_result = eval_kernel_against_ref(
ref_arch_src, custom_cuda, verbose=config.verbose, measure_performance=True, num_correct_trials=5, num_perf_trials=100
ref_arch_src,
custom_kernel,
verbose=config.verbose,
measure_performance=True,
num_correct_trials=5,
num_perf_trials=100,
backend=config.backend,
Collaborator comment: clean, nice interface

)

print(
f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}"
)

print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}")

if config.log:
with open(os.path.join(config.logdir, f"eval_result_level_{config.level}_problem_{config.problem_id}.txt"), "a") as f:
with open(
os.path.join(
config.logdir,
f"eval_result_level_{config.level}_problem_{config.problem_id}.txt",
),
"a",
) as f:
f.write(f"Problem Name: {problem_name}\n")
f.write(str(kernel_exec_result))


if __name__ == "__main__":
main()

main()