Skip to content

Commit 712d94a

Browse files
authored
feat: add mlebench submission validator (#545)
* add mlebench submission check * fix CI * fix bug
1 parent 54d930e commit 712d94a

File tree

3 files changed

+38
-4
lines changed

3 files changed

+38
-4
lines changed

rdagent/components/coder/data_science/workflow/eval.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from rdagent.core.experiment import FBWorkspace, Task
1616
from rdagent.oai.llm_utils import APIBackend
1717
from rdagent.utils.agent.tpl import T
18-
from rdagent.utils.env import DockerEnv, DSDockerConf
18+
from rdagent.utils.env import DockerEnv, DSDockerConf, MLEBDockerConf
1919

2020
DIRNAME = Path(__file__).absolute().resolve().parent
2121

@@ -53,11 +53,22 @@ def evaluate(
5353
code="This task has failed too many times, skip implementation.",
5454
final_decision=False,
5555
)
56+
57+
# DockerEnv for Kaggle Competition
5658
ds_docker_conf = DSDockerConf()
5759
ds_docker_conf.extra_volumes = {
5860
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"
5961
}
6062
de = DockerEnv(conf=ds_docker_conf)
63+
64+
# DockerEnv for MLEBench submission validation
65+
mle_de_conf = MLEBDockerConf()
66+
mle_de_conf.extra_volumes = {
67+
f"{DS_RD_SETTING.local_data_path}/zip_files": "/mle/data",
68+
}
69+
mde = DockerEnv(conf=mle_de_conf)
70+
mde.prepare()
71+
6172
# Clean the scores.csv & submission.csv.
6273
stdout = implementation.execute(env=de, entry=f"rm submission.csv scores.csv")
6374

@@ -85,10 +96,21 @@ def evaluate(
8596
if not submission_fp.exists():
8697
stdout += "\nSubmission file (submission.csv) is not generated."
8798
else:
88-
check_code = (DIRNAME / "eval_tests" / "submission_check.txt").read_text()
89-
implementation.inject_files(**{"submission_check.py": check_code})
99+
base_check_code = (DIRNAME / "eval_tests" / "submission_check.txt").read_text()
100+
implementation.inject_files(**{"submission_check.py": base_check_code})
101+
stdout += "----Submission Check 1-----\n"
90102
stdout += implementation.execute(env=de, entry="python submission_check.py")
91103

104+
# MLEBench Check
105+
mle_check_code = (
106+
(DIRNAME / "eval_tests" / "mle_submission_check.txt")
107+
.read_text()
108+
.replace("<competition_id>", self.scen.competition)
109+
)
110+
implementation.inject_files(**{"mle_submission_check.py": mle_check_code})
111+
stdout += "----Submission Check 2-----\n"
112+
stdout += implementation.execute(env=mde, entry=f"python mle_submission_check.py")
113+
92114
system_prompt = T(".prompts:workflow_eval.system").r(
93115
scenario=self.scen.get_scenario_all_desc(),
94116
task_desc=target_task.get_task_information(),
rdagent/components/coder/data_science/workflow/eval_tests/mle_submission_check.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from pathlib import Path
2+
3+
from mlebench.grade import validate_submission
4+
from mlebench.registry import registry
5+
6+
COMPETITION_ID = "<competition_id>"
7+
new_registry = registry.set_data_dir(Path("/mle/data"))
8+
competition = new_registry.get_competition(COMPETITION_ID)
9+
10+
is_valid, message = validate_submission(Path("submission.csv"), competition)
11+
12+
print(message)

rdagent/scenarios/data_science/dev/runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def develop(self, exp: DSExperiment) -> DSExperiment:
4343
stdout = exp.experiment_workspace.execute(env=de, entry="coverage json -o coverage.json")
4444
if Path(exp.experiment_workspace.workspace_path / "coverage.json").exists():
4545
with open(exp.experiment_workspace.workspace_path / "coverage.json") as f:
46-
used_files = set(json.load(f)["files"].keys()) | {"submission_check.py"}
46+
used_files = set(json.load(f)["files"].keys()) | {"submission_check.py", "mle_submission_check.py"}
4747
logger.info("All used scripts: {}".format(used_files))
4848
all_python_files = set(Path(exp.experiment_workspace.workspace_path).rglob("*.py"))
4949
unused_files = [

0 commit comments

Comments
 (0)