
Commit 030abd8

feat: added running time statistics for the DS scenario experiment (#1007)
* added running time statistics for the DS scenario experiment
* update execute_ret_code to return running_time
* fix
* fix
* update describe
* add EnvResult
* update corresponding calls
* add RunningInfo class
* fix
* fix
* fix
* fix ci
* rename function name
* fix ci
* fix
* refine running_time logic
* fix ci
1 parent 25a9612 commit 030abd8

File tree: 18 files changed (+143 / -91 lines)


rdagent/components/coder/data_science/ensemble/eval.py
Lines changed: 3 additions & 1 deletion

@@ -63,7 +63,9 @@ def evaluate(
         )
 
         implementation.inject_files(**{fname: test_code})
-        stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
+        result = implementation.run(env=env, entry=f"python {fname}")
+        stdout = result.stdout
+        ret_code = result.ret_code
 
         stdout += f"\nNOTE: the above scripts run with return code {ret_code}"
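The remaining coder evaluators below apply the same mechanical migration: the (stdout, ret_code) tuple from execute_ret_code is replaced by a single result object returned from run. A minimal caller-side sketch, assuming a prepared workspace implementation and environment env as in the evaluator above (the entry script name here is a placeholder):

    # before: stdout, ret_code = implementation.execute_ret_code(env=env, entry="python test/example_test.py")
    result = implementation.run(env=env, entry="python test/example_test.py")  # placeholder test entry
    stdout = result.stdout           # captured (shrunk/filtered) process output
    ret_code = result.ret_code       # process exit code
    elapsed = result.running_time    # running time in seconds, newly tracked by this commit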

rdagent/components/coder/data_science/feature/eval.py
Lines changed: 4 additions & 4 deletions

@@ -50,9 +50,9 @@ def evaluate(
         test_code = (DIRNAME / "eval_tests" / "feature_test.txt").read_text()
         implementation.inject_files(**{fname: test_code})
 
-        stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
+        result = implementation.run(env=env, entry=f"python {fname}")
 
-        if "main.py" in implementation.file_dict and ret_code == 0:
+        if "main.py" in implementation.file_dict and result.ret_code == 0:
             workflow_stdout = implementation.execute(env=env, entry="python main.py")
             workflow_stdout = remove_eda_part(workflow_stdout)
         else:
@@ -66,7 +66,7 @@ def evaluate(
             workflow_code=implementation.all_codes,
         )
         user_prompt = T(".prompts:feature_eval.user").r(
-            stdout=shrink_text(stdout),
+            stdout=shrink_text(result.stdout),
             workflow_stdout=workflow_stdout,
         )
 
@@ -76,6 +76,6 @@ def evaluate(
             user_prompt=user_prompt,
             init_kwargs_update_func=FeatureEvalFeedback.val_and_update_init_dict,
         )
-        fb.final_decision = fb.final_decision and ret_code == 0
+        fb.final_decision = fb.final_decision and result.ret_code == 0
 
         return fb

rdagent/components/coder/data_science/model/eval.py
Lines changed: 4 additions & 2 deletions

@@ -67,7 +67,9 @@ def evaluate(
             (DIRNAME / "eval_tests" / "model_test.txt").read_text().replace("model01", target_task.name)
         )  # only check the model changed this time
         implementation.inject_files(**{fname: test_code})
-        stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
+        result = implementation.run(env=env, entry=f"python {fname}")
+        stdout = result.stdout
+        ret_code = result.ret_code
 
         if stdout is None:
             raise CoderError(
@@ -113,6 +115,6 @@ def evaluate(
             user_prompt=user_prompt,
             init_kwargs_update_func=ModelSingleFeedback.val_and_update_init_dict,
         )
-        fb.final_decision = fb.final_decision and ret_code == 0
+        fb.final_decision = fb.final_decision and result.ret_code == 0
 
         return fb

rdagent/components/coder/data_science/pipeline/eval.py
Lines changed: 7 additions & 5 deletions

@@ -57,8 +57,10 @@ def evaluate(
 
         # Clean the scores.csv & submission.csv.
         implementation.execute(env=env, entry=get_clear_ws_cmd())
-        stdout, execute_ret_code = implementation.execute_ret_code(env=env, entry=f"python -m coverage run main.py")
-        stdout = remove_eda_part(stdout)
+        result = implementation.run(env=env, entry=f"python -m coverage run main.py")
+        implementation.running_info.running_time = result.running_time
+        execute_ret_code = result.ret_code
+        stdout = remove_eda_part(result.stdout)
         stdout += f"The code executed {'successfully' if execute_ret_code == 0 else 'failed'}."
 
         score_fp = implementation.workspace_path / "scores.csv"
@@ -105,9 +107,9 @@ def evaluate(
         base_check_code = T(".eval_tests.submission_format_test", ftype="txt").r()
         implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
         # stdout += "----Submission Check 1-----\n"
-        submission_check_out, submission_ret_code = implementation.execute_ret_code(
-            env=env, entry="python test/submission_format_test.py"
-        )
+        submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
+        submission_check_out = submission_result.stdout
+        submission_ret_code = submission_result.ret_code
         if DS_RD_SETTING.rule_base_eval:
             if execute_ret_code == 0 and score_ret_code == 0 and submission_ret_code == 0:
                 return PipelineSingleFeedback(

rdagent/components/coder/data_science/raw_data_loader/eval.py
Lines changed: 3 additions & 1 deletion

@@ -52,7 +52,9 @@ def evaluate(
         fname = "test/data_loader_test.py"
         test_code = (DIRNAME / "eval_tests" / "data_loader_test.txt").read_text()
         implementation.inject_files(**{fname: test_code})
-        stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
+        result = implementation.run(env=env, entry=f"python {fname}")
+        stdout = result.stdout
+        ret_code = result.ret_code
         match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===(.*)", stdout, re.DOTALL)
         stdout_part_1, eda_output, stdout_part_2 = match.groups() if match else (stdout, None, "")
         stdout = stdout_part_1 + stdout_part_2

rdagent/components/coder/data_science/workflow/eval.py
Lines changed: 3 additions & 3 deletions

@@ -121,9 +121,9 @@ def evaluate(
         base_check_code = T(".eval_tests.submission_format_test", ftype="txt").r()
         implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
         # stdout += "----Submission Check 1-----\n"
-        submission_check_out, submission_ret_code = implementation.execute_ret_code(
-            env=env, entry="python test/submission_format_test.py"
-        )
+        submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
+        submission_check_out = submission_result.stdout
+        submission_ret_code = submission_result.ret_code
         stdout += "\n" + submission_check_out
 
         system_prompt = T(".prompts:workflow_eval.system").r(

rdagent/core/experiment.py
Lines changed: 33 additions & 14 deletions

@@ -9,12 +9,17 @@
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from copy import deepcopy
+from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generic, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
 
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.evaluation import Feedback
 from rdagent.utils import filter_redundant_text
+
+if TYPE_CHECKING:
+    from rdagent.utils.env import EnvResult
+
 from rdagent.utils.fmt import shrink_text
 
 if typing.TYPE_CHECKING:
@@ -59,6 +64,12 @@ def __repr__(self) -> str:
 ASpecificFeedback = TypeVar("ASpecificFeedback", bound=Feedback)
 
 
+@dataclass
+class RunningInfo:
+    result: object = None  # The result of the experiment, can be different types in different scenarios.
+    running_time: float | None = None
+
+
 class Workspace(ABC, Generic[ASpecificTask, ASpecificFeedback]):
     """
     A workspace is a place to store the task implementation. It evolves as the developer implements the task.
@@ -68,6 +79,7 @@ class Workspace(ABC, Generic[ASpecificTask, ASpecificFeedback]):
     def __init__(self, target_task: ASpecificTask | None = None) -> None:
         self.target_task: ASpecificTask | None = target_task
         self.feedback: ASpecificFeedback | None = None
+        self.running_info: RunningInfo = RunningInfo()
 
     @abstractmethod
     def execute(self, *args: Any, **kwargs: Any) -> object | None:
@@ -250,26 +262,25 @@ def execute(self, env: Env, entry: str) -> str:
         """
         Before each execution, make sure to prepare and inject code.
         """
-        stdout, _ = self.execute_ret_code(env, entry)
-        return stdout
+        result = self.run(env, entry)
+        return result.stdout
 
-    def execute_ret_code(self, env: Env, entry: str) -> tuple[str, int]:
+    def run(self, env: Env, entry: str) -> EnvResult:
         """
-        Execute the code in the environment and return both the stdout and the exit code.
+        Execute the code in the environment and return an EnvResult object (stdout, exit_code, running_time).
 
         Before each execution, make sure to prepare and inject code.
         """
         self.prepare()
         self.inject_files(**self.file_dict)
-        stdout, return_code = env.run_ret_code(entry, str(self.workspace_path), env={"PYTHONPATH": "./"})
-        return (
-            shrink_text(
-                filter_redundant_text(stdout),
-                context_lines=RD_AGENT_SETTINGS.stdout_context_len,
-                line_len=RD_AGENT_SETTINGS.stdout_line_len,
-            ),
-            return_code,
+        result = env.run_ret_code(entry, str(self.workspace_path), env={"PYTHONPATH": "./"})
+        # result is EnvResult
+        result.stdout = shrink_text(
+            filter_redundant_text(result.stdout),
+            context_lines=RD_AGENT_SETTINGS.stdout_context_len,
+            line_len=RD_AGENT_SETTINGS.stdout_line_len,
         )
+        return result
 
     def __str__(self) -> str:
         return f"Workspace[{self.workspace_path=}" + (
@@ -319,14 +330,22 @@ def __init__(
         # NOTE: Assumption
         # - only runner will assign this variable
         # - We will always create a new Experiment without copying previous results when we goto the next new loop.
-        self.result: object = None  # The result of the experiment, can be different types in different scenarios.
+        self.running_info = RunningInfo()
         self.sub_results: dict[str, float] = (
             {}
         )  # TODO: in Kaggle, now sub results are all saved in self.result, remove this in the future.
 
         # For parallel multi-trace support
         self.local_selection: tuple[int, ...] | None = None
 
+    @property
+    def result(self) -> object:
+        return self.running_info.result
+
+    @result.setter
+    def result(self, value: object) -> None:
+        self.running_info.result = value
+
 
 ASpecificExp = TypeVar("ASpecificExp", bound=Experiment)
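The EnvResult returned by run() is defined in rdagent/utils/env.py and is not part of this diff. Judging only from the attributes used above (stdout, ret_code, running_time, with stdout reassigned after shrinking), its shape is roughly the following hypothetical sketch; the real definition may differ:

    from dataclasses import dataclass

    @dataclass
    class EnvResult:  # hypothetical sketch; the actual class lives in rdagent/utils/env.py
        stdout: str                        # captured process output (mutated in Workspace.run after shrinking)
        ret_code: int                      # process exit code
        running_time: float | None = None  # execution duration in seconds

The result property on Experiment keeps attribute-style access backward compatible: existing call sites such as exp.result = pd.read_csv(...) in the runner keep working, while the value is now stored on exp.running_info.result.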

rdagent/scenarios/data_science/dev/prompts.yaml
Lines changed: 3 additions & 2 deletions

@@ -98,8 +98,9 @@ exp_feedback:
     1. Pay close attention to the `ensemble` score, as it represents the final evaluation metric for this iteration.
     2. If any individual model significantly outperforms the ensemble, this may indicate an issue in the ensemble method. But if the final `ensemble` score surpasses the current SOTA, you should update the SOTA record. However, it seems that there are noticeable issues in the ensemble component, be sure to highlight them explicitly.
 
-    Below are the results for this experiment:
-    {{ cur_exp.result }}
+    Below are the results and running time for this experiment:
+    Running time: {{ cur_exp.running_info.running_time }} seconds.
+    Results: {{ cur_exp.result }}
 
     {% if cur_vs_sota_score is not none %}
     Below is the comparison of the current `ensemble` performance with the SOTA results:

rdagent/scenarios/data_science/dev/runner/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -130,6 +130,7 @@ def develop(self, exp):
             logger.error("Metrics file (scores.csv) is not generated.")
             raise RunnerError(f"Metrics file (scores.csv) is not generated")
         exp.result = pd.read_csv(score_fp, index_col=0)
+        exp.running_info.running_time = exp.experiment_workspace.running_info.running_time
 
         # 2) if mle-bench, then the submission format checking will be used.
         # DockerEnv for MLEBench submission validation
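Pieced together from the hunks above, the new timing value crosses three layers before it reaches the feedback prompt. An illustrative sketch only; variable names follow the diffs, and the actual calls are spread across the evaluator and the runner:

    # 1) Env.run_ret_code returns an EnvResult that already carries running_time (timing is done in the Env layer, not shown in this diff).
    result = exp.experiment_workspace.run(env=env, entry="python -m coverage run main.py")
    # 2) The evaluator records it on the workspace's RunningInfo.
    exp.experiment_workspace.running_info.running_time = result.running_time
    # 3) The runner's develop() copies it from the workspace onto the experiment.
    exp.running_info.running_time = exp.experiment_workspace.running_info.running_time
    # 4) prompts.yaml then renders "Running time: {{ cur_exp.running_info.running_time }} seconds."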

rdagent/scenarios/data_science/dev/runner/eval.py
Lines changed: 5 additions & 1 deletion

@@ -53,7 +53,11 @@ def evaluate(
         )  # Remove previous submission and scores files generated by worklfow.
 
         # execute workflow
-        stdout, execute_ret_code = implementation.execute_ret_code(env=env, entry="python -m coverage run main.py")
+        result = implementation.run(env=env, entry="python -m coverage run main.py")
+        stdout = result.stdout
+        execute_ret_code = result.ret_code
+        implementation.running_info.running_time = result.running_time
+
         match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===", stdout, re.DOTALL)
         eda_output = match.groups()[1] if match else None
         if eda_output is None:
