
Commit 030abd8

feat: added running time statistics for the DS scenario experiment (#1007)
* added running time statistics for the DS scenario experiment
* update execute_ret_code to return running_time
* fix
* fix
* update describe
* add EnvResult
* update corresponding calls
* add RunningInfo class
* fix
* fix
* fix
* fix ci
* rename function name
* fix ci
* fix
* refine running_time logic
* fix ci
1 parent 25a9612 commit 030abd8

File tree: 18 files changed (+143 / -91 lines)


rdagent/components/coder/data_science/ensemble/eval.py
Lines changed: 3 additions & 1 deletion

@@ -63,7 +63,9 @@ def evaluate(
         )
 
         implementation.inject_files(**{fname: test_code})
-        stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
+        result = implementation.run(env=env, entry=f"python {fname}")
+        stdout = result.stdout
+        ret_code = result.ret_code
 
         stdout += f"\nNOTE: the above scripts run with return code {ret_code}"
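The remaining coder evaluators below apply the same mechanical migration: the (stdout, ret_code) tuple from execute_ret_code is replaced by a single result object returned from run. A minimal caller-side sketch, assuming a prepared workspace implementation and environment env as in the evaluator above (the entry script name here is a placeholder):

    # before: stdout, ret_code = implementation.execute_ret_code(env=env, entry="python test/example_test.py")
    result = implementation.run(env=env, entry="python test/example_test.py")  # placeholder test entry
    stdout = result.stdout           # captured (shrunk/filtered) process output
    ret_code = result.ret_code       # process exit code
    elapsed = result.running_time    # running time in seconds, newly tracked by this commit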

rdagent/components/coder/data_science/feature/eval.py
Lines changed: 4 additions & 4 deletions

@@ -50,9 +50,9 @@ def evaluate(
         test_code = (DIRNAME / "eval_tests" / "feature_test.txt").read_text()
         implementation.inject_files(**{fname: test_code})
 
-        stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
+        result = implementation.run(env=env, entry=f"python {fname}")
 
-        if "main.py" in implementation.file_dict and ret_code == 0:
+        if "main.py" in implementation.file_dict and result.ret_code == 0:
             workflow_stdout = implementation.execute(env=env, entry="python main.py")
             workflow_stdout = remove_eda_part(workflow_stdout)
         else:
@@ -66,7 +66,7 @@ def evaluate(
             workflow_code=implementation.all_codes,
         )
         user_prompt = T(".prompts:feature_eval.user").r(
-            stdout=shrink_text(stdout),
+            stdout=shrink_text(result.stdout),
             workflow_stdout=workflow_stdout,
         )
 
@@ -76,6 +76,6 @@ def evaluate(
             user_prompt=user_prompt,
             init_kwargs_update_func=FeatureEvalFeedback.val_and_update_init_dict,
         )
-        fb.final_decision = fb.final_decision and ret_code == 0
+        fb.final_decision = fb.final_decision and result.ret_code == 0
 
         return fb

rdagent/components/coder/data_science/model/eval.py
Lines changed: 4 additions & 2 deletions

@@ -67,7 +67,9 @@ def evaluate(
             (DIRNAME / "eval_tests" / "model_test.txt").read_text().replace("model01", target_task.name)
         )  # only check the model changed this time
         implementation.inject_files(**{fname: test_code})
-        stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
+        result = implementation.run(env=env, entry=f"python {fname}")
+        stdout = result.stdout
+        ret_code = result.ret_code
 
         if stdout is None:
             raise CoderError(
@@ -113,6 +115,6 @@ def evaluate(
             user_prompt=user_prompt,
             init_kwargs_update_func=ModelSingleFeedback.val_and_update_init_dict,
         )
-        fb.final_decision = fb.final_decision and ret_code == 0
+        fb.final_decision = fb.final_decision and result.ret_code == 0
 
         return fb

rdagent/components/coder/data_science/pipeline/eval.py
Lines changed: 7 additions & 5 deletions

@@ -57,8 +57,10 @@ def evaluate(
 
         # Clean the scores.csv & submission.csv.
         implementation.execute(env=env, entry=get_clear_ws_cmd())
-        stdout, execute_ret_code = implementation.execute_ret_code(env=env, entry=f"python -m coverage run main.py")
-        stdout = remove_eda_part(stdout)
+        result = implementation.run(env=env, entry=f"python -m coverage run main.py")
+        implementation.running_info.running_time = result.running_time
+        execute_ret_code = result.ret_code
+        stdout = remove_eda_part(result.stdout)
         stdout += f"The code executed {'successfully' if execute_ret_code == 0 else 'failed'}."
 
         score_fp = implementation.workspace_path / "scores.csv"
@@ -105,9 +107,9 @@ def evaluate(
         base_check_code = T(".eval_tests.submission_format_test", ftype="txt").r()
         implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
         # stdout += "----Submission Check 1-----\n"
-        submission_check_out, submission_ret_code = implementation.execute_ret_code(
-            env=env, entry="python test/submission_format_test.py"
-        )
+        submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
+        submission_check_out = submission_result.stdout
+        submission_ret_code = submission_result.ret_code
         if DS_RD_SETTING.rule_base_eval:
             if execute_ret_code == 0 and score_ret_code == 0 and submission_ret_code == 0:
                 return PipelineSingleFeedback(

rdagent/components/coder/data_science/raw_data_loader/eval.py
Lines changed: 3 additions & 1 deletion

@@ -52,7 +52,9 @@ def evaluate(
         fname = "test/data_loader_test.py"
         test_code = (DIRNAME / "eval_tests" / "data_loader_test.txt").read_text()
         implementation.inject_files(**{fname: test_code})
-        stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
+        result = implementation.run(env=env, entry=f"python {fname}")
+        stdout = result.stdout
+        ret_code = result.ret_code
         match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===(.*)", stdout, re.DOTALL)
         stdout_part_1, eda_output, stdout_part_2 = match.groups() if match else (stdout, None, "")
         stdout = stdout_part_1 + stdout_part_2

rdagent/components/coder/data_science/workflow/eval.py
Lines changed: 3 additions & 3 deletions

@@ -121,9 +121,9 @@ def evaluate(
         base_check_code = T(".eval_tests.submission_format_test", ftype="txt").r()
         implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
         # stdout += "----Submission Check 1-----\n"
-        submission_check_out, submission_ret_code = implementation.execute_ret_code(
-            env=env, entry="python test/submission_format_test.py"
-        )
+        submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
+        submission_check_out = submission_result.stdout
+        submission_ret_code = submission_result.ret_code
         stdout += "\n" + submission_check_out
 
         system_prompt = T(".prompts:workflow_eval.system").r(

rdagent/core/experiment.py
Lines changed: 33 additions & 14 deletions

@@ -9,12 +9,17 @@
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from copy import deepcopy
+from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generic, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
 
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.evaluation import Feedback
 from rdagent.utils import filter_redundant_text
+
+if TYPE_CHECKING:
+    from rdagent.utils.env import EnvResult
+
 from rdagent.utils.fmt import shrink_text
 
 if typing.TYPE_CHECKING:
@@ -59,6 +64,12 @@ def __repr__(self) -> str:
 ASpecificFeedback = TypeVar("ASpecificFeedback", bound=Feedback)
 
 
+@dataclass
+class RunningInfo:
+    result: object = None  # The result of the experiment, can be different types in different scenarios.
+    running_time: float | None = None
+
+
 class Workspace(ABC, Generic[ASpecificTask, ASpecificFeedback]):
     """
     A workspace is a place to store the task implementation. It evolves as the developer implements the task.
@@ -68,6 +79,7 @@ class Workspace(ABC, Generic[ASpecificTask, ASpecificFeedback]):
     def __init__(self, target_task: ASpecificTask | None = None) -> None:
         self.target_task: ASpecificTask | None = target_task
         self.feedback: ASpecificFeedback | None = None
+        self.running_info: RunningInfo = RunningInfo()
 
     @abstractmethod
     def execute(self, *args: Any, **kwargs: Any) -> object | None:
@@ -250,26 +262,25 @@ def execute(self, env: Env, entry: str) -> str:
         """
         Before each execution, make sure to prepare and inject code.
         """
-        stdout, _ = self.execute_ret_code(env, entry)
-        return stdout
+        result = self.run(env, entry)
+        return result.stdout
 
-    def execute_ret_code(self, env: Env, entry: str) -> tuple[str, int]:
+    def run(self, env: Env, entry: str) -> EnvResult:
         """
-        Execute the code in the environment and return both the stdout and the exit code.
+        Execute the code in the environment and return an EnvResult object (stdout, exit_code, running_time).
 
         Before each execution, make sure to prepare and inject code.
         """
         self.prepare()
         self.inject_files(**self.file_dict)
-        stdout, return_code = env.run_ret_code(entry, str(self.workspace_path), env={"PYTHONPATH": "./"})
-        return (
-            shrink_text(
-                filter_redundant_text(stdout),
-                context_lines=RD_AGENT_SETTINGS.stdout_context_len,
-                line_len=RD_AGENT_SETTINGS.stdout_line_len,
-            ),
-            return_code,
+        result = env.run_ret_code(entry, str(self.workspace_path), env={"PYTHONPATH": "./"})
+        # result is EnvResult
+        result.stdout = shrink_text(
+            filter_redundant_text(result.stdout),
+            context_lines=RD_AGENT_SETTINGS.stdout_context_len,
+            line_len=RD_AGENT_SETTINGS.stdout_line_len,
         )
+        return result
 
     def __str__(self) -> str:
         return f"Workspace[{self.workspace_path=}" + (
@@ -319,14 +330,22 @@ def __init__(
         # NOTE: Assumption
         # - only runner will assign this variable
         # - We will always create a new Experiment without copying previous results when we goto the next new loop.
-        self.result: object = None  # The result of the experiment, can be different types in different scenarios.
+        self.running_info = RunningInfo()
         self.sub_results: dict[str, float] = (
             {}
         )  # TODO: in Kaggle, now sub results are all saved in self.result, remove this in the future.
 
         # For parallel multi-trace support
         self.local_selection: tuple[int, ...] | None = None
 
+    @property
+    def result(self) -> object:
+        return self.running_info.result
+
+    @result.setter
+    def result(self, value: object) -> None:
+        self.running_info.result = value
+
 
 ASpecificExp = TypeVar("ASpecificExp", bound=Experiment)
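The EnvResult returned by run() is defined in rdagent/utils/env.py and is not part of this diff. Judging only from the attributes used above (stdout, ret_code, running_time, with stdout reassigned after shrinking), its shape is roughly the following hypothetical sketch; the real definition may differ:

    from dataclasses import dataclass

    @dataclass
    class EnvResult:  # hypothetical sketch; the actual class lives in rdagent/utils/env.py
        stdout: str                        # captured process output (mutated in Workspace.run after shrinking)
        ret_code: int                      # process exit code
        running_time: float | None = None  # execution duration in seconds

The result property on Experiment keeps attribute-style access backward compatible: existing call sites such as exp.result = pd.read_csv(...) in the runner keep working, while the value is now stored on exp.running_info.result.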

rdagent/scenarios/data_science/dev/prompts.yaml
Lines changed: 3 additions & 2 deletions

@@ -98,8 +98,9 @@ exp_feedback:
     1. Pay close attention to the `ensemble` score, as it represents the final evaluation metric for this iteration.
     2. If any individual model significantly outperforms the ensemble, this may indicate an issue in the ensemble method. But if the final `ensemble` score surpasses the current SOTA, you should update the SOTA record. However, it seems that there are noticeable issues in the ensemble component, be sure to highlight them explicitly.
 
-    Below are the results for this experiment:
-    {{ cur_exp.result }}
+    Below are the results and running time for this experiment:
+    Running time: {{ cur_exp.running_info.running_time }} seconds.
+    Results: {{ cur_exp.result }}
 
     {% if cur_vs_sota_score is not none %}
     Below is the comparison of the current `ensemble` performance with the SOTA results:

rdagent/scenarios/data_science/dev/runner/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -130,6 +130,7 @@ def develop(self, exp):
             logger.error("Metrics file (scores.csv) is not generated.")
             raise RunnerError(f"Metrics file (scores.csv) is not generated")
         exp.result = pd.read_csv(score_fp, index_col=0)
+        exp.running_info.running_time = exp.experiment_workspace.running_info.running_time
 
         # 2) if mle-bench, then the submission format checking will be used.
         # DockerEnv for MLEBench submission validation
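Pieced together from the hunks above, the new timing value crosses three layers before it reaches the feedback prompt. An illustrative sketch only; variable names follow the diffs, and the actual calls are spread across the evaluator and the runner:

    # 1) Env.run_ret_code returns an EnvResult that already carries running_time (timing is done in the Env layer, not shown in this diff).
    result = exp.experiment_workspace.run(env=env, entry="python -m coverage run main.py")
    # 2) The evaluator records it on the workspace's RunningInfo.
    exp.experiment_workspace.running_info.running_time = result.running_time
    # 3) The runner's develop() copies it from the workspace onto the experiment.
    exp.running_info.running_time = exp.experiment_workspace.running_info.running_time
    # 4) prompts.yaml then renders "Running time: {{ cur_exp.running_info.running_time }} seconds."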

rdagent/scenarios/data_science/dev/runner/eval.py
Lines changed: 5 additions & 1 deletion

@@ -53,7 +53,11 @@ def evaluate(
         )  # Remove previous submission and scores files generated by worklfow.
 
         # execute workflow
-        stdout, execute_ret_code = implementation.execute_ret_code(env=env, entry="python -m coverage run main.py")
+        result = implementation.run(env=env, entry="python -m coverage run main.py")
+        stdout = result.stdout
+        execute_ret_code = result.ret_code
+        implementation.running_info.running_time = result.running_time
+
         match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===", stdout, re.DOTALL)
         eda_output = match.groups()[1] if match else None
         if eda_output is None:
