
Commit b49481e

feat: dump model (#776)
* feat: add model dump flag and multi-evaluator support
* tmp code
* refactor: update evaluator feedback and FBWorkspace types
* feat: add get_clear_ws_cmd and CPU count in Docker environment
* feat: Add model dump check level and enhance evaluator functionality; fix data type bug
* fix: Ensure required files exist before model dump evaluation
* refactor: streamline prompt and file checks in model dump evaluation
* fix: add assertions and reorder file reads in model dump evaluator
* feat: remove EDA part from evaluation output
* docs: update dump_model guidelines and eval prompt to include template
* style: reformat multiline dicts and lists in conf and eval files
* fix: add DOTALL flag to EDA removal regex
1 parent 52efc85 commit b49481e

File tree: 13 files changed (+271 lines, -25 lines)

rdagent/app/data_science/conf.py

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,5 @@
+from typing import Literal
+
 from pydantic_settings import SettingsConfigDict
 
 from rdagent.app.kaggle.conf import KaggleBasePropSetting
@@ -34,5 +36,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
 
     rule_base_eval: bool = False
 
+    ### model dump
+    enable_model_dump: bool = False
+    model_dump_check_level: Literal["medium", "high"] = "medium"
+
 
 DS_RD_SETTING = DataScienceBasePropSetting()
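Both new settings are plain feature flags; the rest of this commit branches on them. A minimal sketch of how they are read (names taken from the other diffs in this commit; the helper itself is illustrative only, not part of the change):

```python
from rdagent.app.data_science.conf import DS_RD_SETTING


def describe_model_dump_config() -> str:
    """Illustration only: how the two new flags are consumed downstream."""
    if not DS_RD_SETTING.enable_model_dump:
        # Default: the model-dump guideline, evaluator and checks are all skipped.
        return "model dump disabled"
    # "medium" relies on the LLM-based review of the rerun; "high" additionally
    # compares scores.csv / submission.csv between training and the rerun.
    return f"model dump enabled, check level = {DS_RD_SETTING.model_dump_check_level}"
```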

rdagent/components/coder/CoSTEER/evaluators.py

Lines changed: 42 additions & 15 deletions
@@ -1,4 +1,5 @@
 from abc import abstractmethod
+from copy import deepcopy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List
 
@@ -197,7 +198,7 @@ def evaluate(
 class CoSTEERMultiEvaluator(CoSTEEREvaluator):
     """This is for evaluation of experiment. Due to we have multiple tasks, so we will return a list of evaluation feebacks"""
 
-    def __init__(self, single_evaluator: CoSTEEREvaluator, *args, **kwargs) -> None:
+    def __init__(self, single_evaluator: CoSTEEREvaluator | list[CoSTEEREvaluator], *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.single_evaluator = single_evaluator
 
@@ -207,30 +208,56 @@ def evaluate(
         queried_knowledge: QueriedKnowledge = None,
         **kwargs,
     ) -> CoSTEERMultiFeedback:
-        multi_implementation_feedback = multiprocessing_wrapper(
-            [
-                (
-                    self.single_evaluator.evaluate,
+        eval_l = self.single_evaluator if isinstance(self.single_evaluator, list) else [self.single_evaluator]
+        task_li_feedback_li = []
+        for ev in eval_l:
+            multi_implementation_feedback = multiprocessing_wrapper(
+                [
                     (
-                        evo.sub_tasks[index],
-                        evo.sub_workspace_list[index],
-                        evo.sub_gt_implementations[index] if evo.sub_gt_implementations is not None else None,
-                        queried_knowledge,
+                        ev.evaluate,
+                        (
+                            evo.sub_tasks[index],
+                            evo.sub_workspace_list[index],
+                            evo.sub_gt_implementations[index] if evo.sub_gt_implementations is not None else None,
+                            queried_knowledge,
+                        ),
+                    )
+                    for index in range(len(evo.sub_tasks))
+                ],
+                n=RD_AGENT_SETTINGS.multi_proc_n,
+            )
+            task_li_feedback_li.append(multi_implementation_feedback)
+        # merge the feedbacks
+        merged_task_feedback = []
+        for task_id, fb in enumerate(task_li_feedback_li[0]):
+            fb = deepcopy(fb)  # deep copy to make it more robust
+
+            fb.final_decision = all(
+                task_li_feedback[task_id].final_decision for task_li_feedback in task_li_feedback_li
+            )
+            for attr in "execution", "return_checking", "code":
+                setattr(
+                    fb,
+                    attr,
+                    "\n\n".join(
+                        [
+                            getattr(task_li_feedback[task_id], attr)
+                            for task_li_feedback in task_li_feedback_li
+                            if getattr(task_li_feedback[task_id], attr) is not None
+                        ]
                     ),
                 )
-                for index in range(len(evo.sub_tasks))
-            ],
-            n=RD_AGENT_SETTINGS.multi_proc_n,
-        )
+            merged_task_feedback.append(fb)
 
         final_decision = [
             None if single_feedback is None else single_feedback.final_decision
-            for single_feedback in multi_implementation_feedback
+            for single_feedback in merged_task_feedback
         ]
         logger.info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}")
 
+        # TODO: this is to be compatible with factor_implementation;
         for index in range(len(evo.sub_tasks)):
             if final_decision[index]:
                 evo.sub_tasks[index].factor_implementation = True
 
-        return CoSTEERMultiFeedback(multi_implementation_feedback)
+        return CoSTEERMultiFeedback(merged_task_feedback)
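The merge rule is the core of this change and is easy to miss in the diff: a task passes only if every evaluator accepts it, and the free-text fields are concatenated so no evaluator's comments are lost. A self-contained sketch of that rule (the `Feedback` dataclass is a simplified stand-in for `CoSTEERSingleFeedback`, illustration only):

```python
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional


@dataclass
class Feedback:  # simplified stand-in for CoSTEERSingleFeedback
    execution: Optional[str]
    return_checking: Optional[str]
    code: Optional[str]
    final_decision: bool


def merge(per_evaluator: list[list[Feedback]]) -> list[Feedback]:
    """Merge the per-task feedback lists produced by several evaluators."""
    merged = []
    for task_id, fb in enumerate(per_evaluator[0]):
        fb = deepcopy(fb)
        # A task is accepted only if every evaluator accepted it.
        fb.final_decision = all(fbs[task_id].final_decision for fbs in per_evaluator)
        # Text fields are concatenated so nothing any evaluator said is dropped.
        for attr in ("execution", "return_checking", "code"):
            setattr(
                fb,
                attr,
                "\n\n".join(
                    getattr(fbs[task_id], attr)
                    for fbs in per_evaluator
                    if getattr(fbs[task_id], attr) is not None
                ),
            )
        merged.append(fb)
    return merged


# Evaluator A accepts task 0, evaluator B (e.g. the model-dump check) rejects it.
a = [Feedback("ran ok", "files ok", "looks fine", True)]
b = [Feedback("ran ok", "model not dumped", "missing dump step", False)]
assert merge([a, b])[0].final_decision is False
assert "model not dumped" in merge([a, b])[0].return_checking
```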

rdagent/components/coder/data_science/conf.py

Lines changed: 13 additions & 0 deletions
@@ -1,5 +1,6 @@
 from typing import Literal
 
+from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
 from rdagent.utils.env import (
     CondaConf,
@@ -48,3 +49,15 @@ def get_ds_env(conf_type: Literal["kaggle", "mlebench"] = "kaggle") -> Env:
     else:
         raise ValueError(f"Unknown env type: {conf.env_type}")
     return env
+
+
+def get_clear_ws_cmd(stage: Literal["before_training", "before_inference"] = "before_training") -> str:
+    """
+    Clean the files in workspace to a specific stage
+    """
+    assert stage in ["before_training", "before_inference"], f"Unknown stage: {stage}"
+    if DS_RD_SETTING.enable_model_dump and stage == "before_training":
+        cmd = "rm -r submission.csv scores.csv models"
+    else:
+        cmd = "rm submission.csv scores.csv"
+    return cmd
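A short usage sketch of the helper (behaviour exactly as defined above, shown with `enable_model_dump` switched on):

```python
from rdagent.components.coder.data_science.conf import get_clear_ws_cmd

# With DS_RD_SETTING.enable_model_dump = True:
print(get_clear_ws_cmd())                          # "rm -r submission.csv scores.csv models"  (fresh training run)
print(get_clear_ws_cmd(stage="before_inference"))  # "rm submission.csv scores.csv"            (keep models/, rerun inference)
```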

rdagent/components/coder/data_science/pipeline/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -48,6 +48,7 @@
     DataLoaderCoSTEEREvaluator,
 )
 from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
+from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
 from rdagent.core.exception import CoderError
 from rdagent.core.experiment import FBWorkspace
 from rdagent.core.scenario import Scenario
@@ -95,6 +96,7 @@ def implement_one_task(
             out_spec=PythonAgentOut.get_spec(),
             runtime_environment=runtime_environment,
             spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
+            enable_model_dump=DS_RD_SETTING.enable_model_dump,
         )
         user_prompt = T(".prompts:pipeline_coder.user").r(
             competition_info=competition_info,
@@ -146,8 +148,12 @@ def __init__(
         **kwargs,
     ) -> None:
         settings = DSCoderCoSTEERSettings()
+        eval_l = [PipelineCoSTEEREvaluator(scen=scen)]
+        if DS_RD_SETTING.enable_model_dump:
+            eval_l.append(ModelDumpEvaluator(scen=scen, data_type="sample"))
+
         eva = CoSTEERMultiEvaluator(
-            PipelineCoSTEEREvaluator(scen=scen), scen=scen
+            single_evaluator=eval_l, scen=scen
         )  # Please specify whether you agree running your eva in parallel or not
         es = PipelineMultiProcessEvolvingStrategy(scen=scen, settings=settings)

rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 from rdagent.components.coder.CoSTEER.knowledge_management import (
     CoSTEERQueriedKnowledgeV2,
 )
-from rdagent.components.coder.data_science.conf import get_ds_env
+from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
 from rdagent.core.experiment import FBWorkspace, Task
 from rdagent.utils.agent.tpl import T
 from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
@@ -55,7 +55,7 @@ def evaluate(
         env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
 
         # Clean the scores.csv & submission.csv.
-        implementation.execute(env=env, entry=f"rm submission.csv scores.csv")
+        implementation.execute(env=env, entry=get_clear_ws_cmd())
         stdout, execute_ret_code = implementation.execute_ret_code(env=env, entry=f"python main.py")
         stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", stdout)

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 9 additions & 2 deletions
@@ -58,6 +58,13 @@ pipeline_coder:
     - An evaluation agent will help to check whether the EDA part is added correctly.
     - During the EDA part, you should try to avoid any irrelevant information sending to the standard output.
 
+    {% if enable_model_dump %}
+    ## Model Dumping
+    {% include "components.coder.data_science.share.prompts:dump_model_coder.guideline" %}
+    {% endif %}
+
+
+
     ## Output Format
     {% if out_spec %}
     {{ out_spec }}
@@ -125,10 +132,10 @@ pipeline_eval:
       "final_decision": <true/false>
     }
    ```
-
+
   user: |-
     --------- code generated by user ---------
     {{ code }}

     --------- code running stdout ---------
-    {{ stdout }}
+    {{ stdout }}
rdagent/components/coder/data_science/share/eval.py

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
+from pathlib import Path
+from typing import Literal
+
+import pandas as pd
+
+from rdagent.app.data_science.conf import DS_RD_SETTING
+from rdagent.components.coder.CoSTEER import CoSTEERMultiFeedback
+from rdagent.components.coder.CoSTEER.evaluators import (
+    CoSTEEREvaluator,
+    CoSTEERSingleFeedback,
+)
+from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
+from rdagent.components.coder.data_science.utils import remove_eda_part
+from rdagent.core.experiment import FBWorkspace, Task
+from rdagent.core.scenario import Scenario
+from rdagent.utils.agent.tpl import T
+from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
+
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+PipelineSingleFeedback = CoSTEERSingleFeedback
+PipelineMultiFeedback = CoSTEERMultiFeedback
+
+
+class ModelDumpEvaluator(CoSTEEREvaluator):
+    """This evaluator assumes that it runs after the model"""
+
+    def __init__(self, scen: Scenario, data_type: Literal["sample", "full"]):
+        super().__init__(scen)
+        self.data_type = data_type
+
+    def evaluate(
+        self, target_task: Task, implementation: FBWorkspace, gt_implementation: FBWorkspace, *kargs, **kwargs
+    ) -> CoSTEERSingleFeedback:
+
+        model_folder = implementation.workspace_path / "models"
+        # 1) Check if the model_folder is not empty
+        if not model_folder.exists() or not any(model_folder.iterdir()):
+            err_msg = "Model folder (`models` sub folder) is empty or does not exist. The model is not dumped."
+            return CoSTEERSingleFeedback(
+                execution=err_msg,
+                return_checking=err_msg,
+                code=err_msg,
+                final_decision=False,
+            )
+        env = get_ds_env()
+        env.conf.extra_volumes = {
+            f"{DS_RD_SETTING.local_data_path}/{'sample/' if self.data_type == 'sample' else ''}{self.scen.competition}": "/kaggle/input"
+        }
+
+        # 2) check the result and stdout after reruning the model.
+
+        # Remove the files submission.csv and scores.csv
+        implementation.execute(env=env, entry=get_clear_ws_cmd(stage="before_inference"))
+
+        # Execute the main script
+        stdout = remove_eda_part(implementation.execute(env=env, entry="python main.py"))
+
+        # walk model_folder and list the files
+        model_folder_files = [
+            str(file.relative_to(implementation.workspace_path)) for file in model_folder.iterdir() if file.is_file()
+        ]
+
+        # this will assert the generation of necessary files
+        for f in ["submission.csv", "scores.csv"]:
+            if not (implementation.workspace_path / f).exists():
+                err_msg = f"{f} does not exist. The model is not dumped. Make sure that the required files, like submission.csv and scores.csv, are created even if you bypass the model training step by loading the saved model file directly."
+                return CoSTEERSingleFeedback(
+                    execution=err_msg,
+                    return_checking=err_msg,
+                    code=err_msg,
+                    final_decision=False,
+                )
+
+        # Read the content of files submission.csv and scores.csv before execution
+        submission_content_before = (
+            (implementation.workspace_path / "submission.csv").read_text()
+            if (implementation.workspace_path / "submission.csv").exists()
+            else None
+        )
+        scores_content_before = (
+            (implementation.workspace_path / "scores.csv").read_text()
+            if (implementation.workspace_path / "scores.csv").exists()
+            else None
+        )
+
+        assert submission_content_before is not None
+        assert scores_content_before is not None
+
+        submission_content_after = (implementation.workspace_path / "submission.csv").read_text()
+        scores_content_after = (implementation.workspace_path / "scores.csv").read_text()
+
+        system_prompt = T(".prompts:dump_model_eval.system").r()
+        user_prompt = T(".prompts:dump_model_eval.user").r(
+            stdout=stdout.strip(),
+            code=implementation.all_codes,
+            model_folder_files=model_folder_files,
+            scores_content_before=scores_content_before,
+            scores_content_after=scores_content_after,
+        )
+
+        csfb = build_cls_from_json_with_retry(
+            CoSTEERSingleFeedback,
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+        )
+
+        if DS_RD_SETTING.model_dump_check_level == "high":
+            # Read the content of files submission.csv and scores.csv after execution
+            # Check if the content has changed
+            # excactly same checking. But it will take more user's time
+            if scores_content_before != scores_content_after:
+                return_msg = "\n[Error] The content of scores.csv has changed. Please check the code to ensure that the model is dumped correctly, and rerun the code to use the model directly without retraining it."
+                return_msg += f"\nBefore:\n{scores_content_before}\nAfter:\n{scores_content_after}"
+                if submission_content_before != submission_content_after:
+                    # If the scores file changes, display the two contents and append it into the return_checking
+                    return_msg = "[Error] The content of submission.csv has changed. Please check the code to ensure that the model is dumped correctly, and rerun the code to use the model directly without retraining it."
+                csfb.return_checking = (csfb.return_checking or "") + return_msg
+        return csfb
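At `model_dump_check_level == "high"` the evaluator is meant to go beyond the LLM review and require the recorded outputs to stay identical between the training run and the reload-and-infer rerun. A simplified, self-contained restatement of that comparison (function name and messages are illustrative, not part of the commit):

```python
from typing import Optional


def outputs_changed(scores_before: str, scores_after: str,
                    submission_before: str, submission_after: str) -> Optional[str]:
    """Return an error message if the rerun changed scores.csv or submission.csv."""
    msgs = []
    if scores_before != scores_after:
        msgs.append("[Error] scores.csv changed between training and the reload rerun.")
    if submission_before != submission_after:
        msgs.append("[Error] submission.csv changed between training and the reload rerun.")
    return "\n".join(msgs) or None


assert outputs_changed("auc,0.91", "auc,0.91", "id,pred", "id,pred") is None
assert outputs_changed("auc,0.91", "auc,0.88", "id,pred", "id,pred") is not None
```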
rdagent/components/coder/data_science/share/prompts.yaml

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+dump_model_coder:
+  guideline: |-
+    Please dump the model in a "models/" subfolder in the first running, and the script rerun performs inference without needing to retrain the model.
+    If there are parameters generated from the training data that might be needed for inference on test data, please save them in the "models/" subfolder as well.
+    Make sure that the required files, like submission.csv and scores.csv, are created even if you bypass the model training step by loading the saved model file directly.
+
+dump_model_eval:
+  system: |-
+    You are a data scientist tasked with evaluating code generation. You've developed a Kaggle competition code that can produce a submission file.
+    The code should follow the guideline below:
+    {% include "components.coder.data_science.share.prompts:dump_model_coder.guideline" %}
+
+    You will receive the following information:
+    - The implemented code
+    - The stdout from running the code
+    - The file list in "models/" subfolder
+    - The scores.csv file generated during both training and inference (if it exists)
+
+    Focus on these aspects:
+    - Check if the code saves the model in the "models/" subfolder.
+    - Ensure that when the code is rerun, it skips the training process and loads the model from the "models/" subfolder for direct inference.
+    - Verify that there is no training activity in the output.
+    - Ensure that even if you skip the model training by loading saved models, the files like scores.csv and submission.csv are still correctly created.
+    - The model's performance should remain consistent and not vary unreasonably between training and inference.
+
+    Please respond with your feedback in the following JSON format and order
+    ```json
+    {
+        "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. Carefully check the stdout to ensure that when the code is rerun, it skips the training process and loads the model from the 'models/' subfolder for direct inference. Append the information that makes you think that the model is still being retrained when rerunning the code."
+        "return_checking": "Verify the generated files include necessary files. Make sure scores.csv file does not change unreasonably between training and inference",
+        "code": "The code has explicity dump the model into 'models/' subfolder; When the modes files are already in 'models/' subfolder, the code will explicity skip the training process.",
+        "final_decision": <true or false in boolean type; only return true when ensuring that the code saves the model in a 'models/' subfolder, and the script rerun performs inference without needing to retrain the model.>
+    }
+    ```
+
+  user: |-
+    ------------ The implemented code ------------
+    {{code}}
+
+    ------------ The stdout from running the code ------------
+    {{stdout}}
+
+    ------------ The file list in "models/" subfolder ------------
+    {% for f in model_folder_files %}
+    - {{ f }}
+    {% endfor %}
+
+    ------------ The scores.csv file generated ------------
+    # Training:
+    {{scores_content_before}}
+
+    # Inference:
+    {{scores_content_after}}
+
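To make the guideline concrete, here is a hypothetical `main.py` skeleton that would satisfy it and both check levels: the model (a toy least-squares fit here) is dumped to `models/` on the first run, reloaded on the rerun, and `submission.csv`/`scores.csv` are written either way. The data, model and column names below are made up for illustration and are not part of the commit:

```python
from pathlib import Path
import pickle

import numpy as np
import pandas as pd

MODELS = Path("models")
MODELS.mkdir(exist_ok=True)
model_path = MODELS / "model.pkl"

# Toy data standing in for the competition's train/test split.
rng = np.random.default_rng(0)
X_train, y_train = rng.normal(size=(100, 3)), rng.normal(size=100)
X_test = rng.normal(size=(20, 3))

if model_path.exists():
    # Rerun: load the dumped model and skip training entirely.
    coef = pickle.loads(model_path.read_bytes())
else:
    # First run: "train" (a least-squares fit) and dump the result to models/.
    coef, *_ = np.linalg.lstsq(X_train, y_train, rcond=None)
    model_path.write_bytes(pickle.dumps(coef))

# Required outputs must exist after both the first run and the rerun.
pd.DataFrame({"prediction": X_test @ coef}).to_csv("submission.csv", index=False)
train_mse = float(np.mean((X_train @ coef - y_train) ** 2))
pd.DataFrame({"metric": ["train_mse"], "score": [train_mse]}).to_csv("scores.csv", index=False)
```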
rdagent/components/coder/data_science/utils.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+import re
+
+
+def remove_eda_part(stdout: str) -> str:
+    """Data Science scenario have a LLM-based EDA feature. We can remove it when current task does not involve EDA"""
+    return re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", stdout, flags=re.DOTALL)
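The `re.DOTALL` flag is the point of this helper: without it, `.` does not match newlines and a multi-line EDA block would survive. A quick usage sketch (the stdout text is made up):

```python
from rdagent.components.coder.data_science.utils import remove_eda_part

stdout = (
    "training fold 1...\n"
    "=== Start of EDA part ===\n"
    "rows: 1000\n"
    "cols: 12\n"
    "=== End of EDA part ===\n"
    "best score: 0.42\n"
)
print(remove_eda_part(stdout))
# training fold 1...
#
# best score: 0.42
```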
