Skip to content

Commit 1edf881

Browse files
authored
feat: add model removal and adjust some framework logic (#681)
* prune model task * add component_description * add model removal logic to component, hypo, and task gen * fix ci * adjust coder to meet the requirement of model removal * fix and refine the logic of model removal * add model removal logic in model_eval * fix ci * fix ci * prune some unnecessary codes
1 parent e507fa0 commit 1edf881

File tree

9 files changed

+137
-79
lines changed

9 files changed

+137
-79
lines changed

rdagent/components/coder/data_science/model/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ def implement_one_task(
103103
f"{target_task.name}.py"
104104
] != workspace.file_dict.get(f"{target_task.name}.py"):
105105
break
106+
107+
# If the task involves model removal, assume it can only process one model at a time.
108+
if len(batch_edit) == 1 and batch_edit[f"{target_task.name}.py"] == "__DEL__":
109+
break
106110
else:
107111
raise CoderError("Failed to generate a new model code.")
108112

rdagent/components/coder/data_science/model/eval.py

Lines changed: 41 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -59,37 +59,55 @@ def evaluate(
5959
env = get_ds_env()
6060
env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
6161

62-
fname = "test/model_test.py"
63-
test_code = (
64-
(DIRNAME / "eval_tests" / "model_test.txt").read_text().replace("model01", target_task.name)
65-
) # only check the model changed this time
66-
implementation.inject_files(**{fname: test_code})
67-
stdout = implementation.execute(env=env, entry=f"python {fname}")
62+
if_model_removed = False
6863

69-
if stdout is None:
70-
raise CoderError(
71-
"The execution output contains too many progress bars and results in the LLM's token size exceeding the limit."
72-
)
64+
if f"{target_task.name}.py" in implementation.file_dict:
65+
fname = "test/model_test.py"
66+
test_code = (
67+
(DIRNAME / "eval_tests" / "model_test.txt").read_text().replace("model01", target_task.name)
68+
) # only check the model changed this time
69+
implementation.inject_files(**{fname: test_code})
70+
stdout = implementation.execute(env=env, entry=f"python {fname}")
71+
72+
if stdout is None:
73+
raise CoderError(
74+
"The execution output contains too many progress bars and results in the LLM's token size exceeding the limit."
75+
)
76+
else:
77+
if_model_removed = True
78+
stdout = f"Model {target_task.name} removal succeeded."
7379

7480
if "main.py" in implementation.file_dict:
7581
workflow_stdout = implementation.execute(env=env, entry="python main.py")
7682
workflow_stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", workflow_stdout)
7783
else:
7884
workflow_stdout = None
7985

80-
system_prompt = T(".prompts:model_eval.system").r(
81-
task_desc=target_task.get_task_information(),
82-
test_code=test_code,
83-
code=implementation.file_dict[f"{target_task.name}.py"],
84-
scenario=self.scen.get_scenario_all_desc(),
85-
spec=implementation.file_dict["spec/model.md"],
86-
workflow_stdout=workflow_stdout,
87-
workflow_code=implementation.all_codes,
88-
)
89-
user_prompt = T(".prompts:model_eval.user").r(
90-
stdout=stdout,
91-
workflow_stdout=workflow_stdout,
92-
)
86+
if if_model_removed:
87+
system_prompt = T(".prompts:model_eval_rm.system").r(
88+
task_desc=target_task.get_task_information(),
89+
workflow_stdout=workflow_stdout,
90+
workflow_code=implementation.all_codes,
91+
)
92+
user_prompt = T(".prompts:model_eval_rm.user").r(
93+
stdout=stdout,
94+
workflow_stdout=workflow_stdout,
95+
)
96+
else:
97+
system_prompt = T(".prompts:model_eval.system").r(
98+
task_desc=target_task.get_task_information(),
99+
test_code=test_code,
100+
code=implementation.file_dict[f"{target_task.name}.py"],
101+
scenario=self.scen.get_scenario_all_desc(),
102+
spec=implementation.file_dict["spec/model.md"],
103+
workflow_stdout=workflow_stdout,
104+
workflow_code=implementation.all_codes,
105+
)
106+
user_prompt = T(".prompts:model_eval.user").r(
107+
stdout=stdout,
108+
workflow_stdout=workflow_stdout,
109+
)
110+
93111
return build_cls_from_json_with_retry(
94112
ModelSingleFeedback,
95113
system_prompt=system_prompt,

rdagent/components/coder/data_science/model/exp.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,28 +9,13 @@ def __init__(
99
self,
1010
name: str,
1111
description: str,
12-
architecture: str = "",
1312
*args,
14-
hyperparameters: Dict[str, str] = {},
15-
model_type: Optional[str] = None,
1613
**kwargs,
1714
) -> None:
18-
self.architecture: str = architecture
19-
self.hyperparameters: str = hyperparameters
20-
self.model_type: str | None = (
21-
model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model
22-
# TODO: More Models Supported
23-
)
2415
super().__init__(name=name, description=description, *args, **kwargs)
2516

2617
def get_task_information(self):
2718
task_desc = f"""name: {self.name}
2819
description: {self.description}
2920
"""
30-
if self.architecture:
31-
task_desc += f"architecture: {self.architecture}\n"
32-
if self.hyperparameters:
33-
task_desc += f"hyperparameters: {self.hyperparameters}\n"
34-
if self.model_type:
35-
task_desc += f"model_type: {self.model_type}\n"
3621
return task_desc

rdagent/components/coder/data_science/model/prompts.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,43 @@ model_eval:
138138
--------- Whole workflow test stdout ---------
139139
{{ workflow_stdout }}
140140
{% endif %}
141+
142+
model_eval_rm:
143+
system: |-
144+
You are a data scientist responsible for evaluating the model removal process.
145+
146+
## Task Description
147+
{{ task_desc }}
148+
149+
{% if workflow_stdout is not none %}
150+
## Whole Workflow Consideration
151+
The model building code is part of the whole workflow. The user has executed the entire pipeline and provided additional stdout.
152+
153+
**Workflow Code:**
154+
```python
155+
{{ workflow_code }}
156+
```
157+
158+
You should evaluate both the model removal test results and the overall workflow results. **Approve the code only if both tests pass.**
159+
{% endif %}
160+
161+
## Evaluation Criteria
162+
You will be given the standard output (`stdout`) from the model removal test and, if applicable, the workflow test.
163+
164+
Please respond with your feedback in the following JSON format and order
165+
```json
166+
{
167+
"execution": "Describe how well the model removal executed, including any errors or issues encountered. Append all error messages and full traceback details without summarizing or omitting any information.",
168+
"return_checking": "Check the generated value, including whether the value is generated, and compare the shape of the model output with the requirement in spec.md.",
169+
"code": "Assess code quality, readability, and adherence to specifications.",
170+
"final_decision": <true/false>
171+
}
172+
```
173+
174+
user: |-
175+
--------- Model removal test stdout ---------
176+
{{ stdout }}
177+
{% if workflow_stdout is not none %}
178+
--------- Whole workflow test stdout ---------
179+
{{ workflow_stdout }}
180+
{% endif %}

rdagent/scenarios/data_science/proposal/exp_gen.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -61,16 +61,10 @@ def __str__(self) -> str:
6161
"task_class": FeatureTask,
6262
},
6363
"Model": {
64-
"target_name": "Building model",
64+
"target_name": "Model",
6565
"spec_file": "spec/model.md",
6666
"task_output_format": T(".prompts:output_format.model").r(),
6767
"task_class": ModelTask,
68-
"extra_params": {
69-
"model_type": "Model type not provided",
70-
"architecture": "Model architecture not provided",
71-
"hyperparameters": "Model hyperparameters not provided",
72-
},
73-
"extra_requirement": T(".prompts:extra_requirement.model").r(),
7468
},
7569
"Ensemble": {
7670
"target_name": "Ensemble",
@@ -259,10 +253,6 @@ def _handle_missing_component(
259253
task = task_cls(
260254
name=component if component != "Model" else resp_dict.pop("model_name"),
261255
description=resp_dict.get("description", f"{component} description not provided"),
262-
**{
263-
k: resp_dict.get("extra_params", {}).get(k, v)
264-
for k, v in COMPONENT_TASK_MAPPING[component].get("extra_params", {}).items()
265-
},
266256
)
267257

268258
exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=DSHypothesis(component))
@@ -350,6 +340,12 @@ def gen(self, trace: DSTrace) -> DSExperiment:
350340
scenario=scenario_desc,
351341
sota_exp_desc=sota_exp_desc,
352342
last_exp_diff=last_exp_diff,
343+
component_desc="\n".join(
344+
[
345+
f"[{key}] {value}"
346+
for key, value in T("scenarios.data_science.share:component_description").template.items()
347+
]
348+
),
353349
component_output_format=T(".prompts:output_format.component").r(),
354350
)
355351

@@ -396,12 +392,12 @@ def gen(self, trace: DSTrace) -> DSExperiment:
396392
hypothesis_output_format=T(".prompts:output_format.hypothesis").r(),
397393
task_specification=sota_exp.experiment_workspace.file_dict[component_info["spec_file"]],
398394
task_output_format=component_info["task_output_format"],
399-
extra_requirement=component_info.get("extra_requirement"),
400395
workflow_check=(not component == "Workflow"),
401396
)
402397

403398
user_prompt = T(".prompts:direct_exp_gen.user").r(
404399
targets=component_info["target_name"],
400+
sota_exp_desc=sota_exp_desc,
405401
sota_exp_and_feedback_list_desc=sota_exp_feedback_list_desc,
406402
failed_exp_and_feedback_list_desc=failed_exp_feedback_list_desc,
407403
last_exp_diff=last_exp_diff,

rdagent/scenarios/data_science/proposal/prompts.yaml

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,6 @@ direct_exp_gen:
213213
{% if workflow_check %}"workflow_update": [Partial Response Format 3], {% endif %}
214214
}
215215
216-
{% if extra_requirement %}
217-
{{ extra_requirement }}
218-
{% endif %}
219-
220216
user: |-
221217
# All former successful experiments and their feedbacks, the current SOTA solution is the combination of the best solutions of these trials:
222218
{{ sota_exp_and_feedback_list_desc }}
@@ -226,8 +222,23 @@ direct_exp_gen:
226222
The user has made several hypothesis on this scenario and did several evaluation on them.
227223
{{ failed_exp_and_feedback_list_desc }}
228224
229-
{% if targets == "Building model" %}
225+
{% if targets == "Model" %}
230226
Based on the feedback from previous experiment failures, if the failure was due to exceeding the time limit or memory constraints, start with the smallest model size or choose alternative algorithms or methods with significantly lower time or space complexity instead of using a neural network. You can then iteratively refine and optimize the model in later stages.
227+
228+
Here is the SOTA solution:
229+
{{ sota_exp_desc }}
230+
Pay attention to the **Results** section. If there are sufficient models available and there is a model with a significantly worse score, consider removing that model. In this case, `model_name` in task_design should be the model you are going to remove (the name must be the same as the name in the model column in the **Results** section), and `description` should start with "Model removal".
231+
Otherwise, if the number of available models is insufficient, your task is to first decide whether to:
232+
- Tune an existing model: Select one of the current models for further tuning and improvement.
233+
- Add a new model: Introduce a new model to expand the hypothesis space.
234+
235+
The information of the model is described by the code of workspace.
236+
237+
Make a decision and proceed accordingly:
238+
- If you decide to tune an existing model, select the existing model file and generate a new hypothesis.
239+
- If you decide to add a new model, specify the type of model you would add and generate a new hypothesis related to the new model.
240+
241+
When building the model, if the runtime permits, consider incorporating hyperparameter search methods to improve performance.
231242
{% endif %}
232243
233244
{% endif %}
@@ -258,13 +269,12 @@ component_gen:
258269
system: |-
259270
You are a Kaggle Grandmaster. You are going to provide a solution for a kaggle competition.
260271
261-
Here is the description of the competition scenario:
262-
```
272+
# Here is the description of the competition scenario:
263273
{{ scenario }}
264-
```
265274
266275
# Here is the current best version of implementation:
267276
{{ sota_exp_desc }}
277+
[Notice] Pay attention to the **Results** section. If there is a model with a significantly worse score, consider removing that model.
268278
269279
{% if last_exp_diff %}
270280
# Here are the differences between the latest version of implementation and the current best version of implementation
@@ -274,7 +284,9 @@ component_gen:
274284
275285
You will be provided the feedback for the latest implementation.
276286
277-
Please select the component you are going to improve the latest implementation or sota implementation.
287+
Please select the component you will use to improve the SOTA implementation.
288+
# Here is the brief description of the components you can select:
289+
{{ component_desc }}
278290
279291
Please generate the output in JSON format following the format below:
280292
{{ component_output_format }}
@@ -346,17 +358,7 @@ output_format:
346358
The output should follow JSON format. The schema is as follows:
347359
{
348360
"model_name": "model name, must start with 'model_' and only contain letters, numbers, and underscores",
349-
"description": "A precise and comprehensive description of the model",
350-
"extra_params":
351-
{
352-
"model_type": "The type of the model, e.g., neural network, tree-based model, etc.",
353-
"architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures",
354-
"hyperparameters": {
355-
"hyperparameter_name_1": "value of hyperparameter 1",
356-
"hyperparameter_name_2": "value of hyperparameter 2",
357-
"hyperparameter_name_3": "value of hyperparameter 3"
358-
},
359-
},
361+
"description": "A precise and comprehensive description of the model. Start with [Model building/tuning] or [Model removal].",
360362
}
361363
ensemble: |-
362364
Design a specific and detailed ensemble task based on the given hypothesis. The output should be detailed enough to directly implement the corresponding code.

rdagent/scenarios/data_science/scen/prompts.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
scenario_description: |-
22
------Background of the scenario------
3-
{{background}}
3+
{{ background }}
44
55
------ Guidelines for participating in the competition ----
66
Before submitting your results, we have numerous tests ready to check your code. Please ensure your submission is genuine and do not manipulate data or return values just to pass the tests, as this will not lead to successful final results.
77
88
------The expected output & submission format specifications------
9-
{{submission_specifications}}
9+
{{ submission_specifications }}
1010
1111
{% if evaluation is not none %}
1212
------Evaluation------
13-
{{evaluation}}
13+
{{ evaluation }}
1414
{% endif %}
1515
1616
The evaluation metrics used is directed as:
17-
{% if metric_direction %}The metric is better when it is bigger.
18-
{% else %}The metric is better when it is smaller.
17+
{% if metric_direction %} The metric is better when it is bigger.
18+
{% else %} The metric is better when it is smaller.
1919
{% endif %}
2020
2121
{% if eda_output is not none %}
@@ -57,7 +57,7 @@ competition_background: |-
5757
The data type used in this competition is {{ data_type }}.
5858
Briefly, the competition involves: {{ brief_description }}.
5959
The dataset used in this competition is: {{ dataset_description }}.
60-
Your goal in this competition is to: {{target_description }}.
60+
Your goal in this competition is to: {{ target_description }}.
6161
6262
rich_style_description: |-
6363
### {{ name }} Agent: Automated Feature Engineering & Model Tuning Evolution

rdagent/scenarios/data_science/share.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,16 @@ describe: # some template to describe some object
6060
Reason: {{ exp_and_feedback[1].reason }}
6161
{% endfor %}
6262
{% endif %}
63+
64+
65+
component_description:
66+
data_loader: |-
67+
Loads and preprocesses competition data, ensuring proper data types, handling missing values, and providing an exploratory data analysis summary.
68+
feature: |-
69+
Transforms raw data into meaningful features while maintaining shape consistency, avoiding data leakage, and optimizing for model performance.
70+
model: |-
71+
Perform one of three tasks: model building, which develops a model to address the problem; model tuning, which optimizes an existing model for better performance; or model removal, which discards models that do not contribute effectively.
72+
ensemble: |-
73+
Combines predictions from multiple models using ensemble strategies, evaluates their performance, and generates the final test predictions.
74+
workflow: |-
75+
Integrates all pipeline components, from data loading to ensemble prediction, ensuring efficient execution and correct output formatting.

rdagent/utils/agent/tpl.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ BatchEditOut: |-
1212
For example:
1313
Inject the code into the folder. Your file name should always contain the suffix. Your file name keys should be unique to avoid delete or replace conflicts.
1414
{
15-
<file name1>: "<code>", // indicate writing <code> into <file name1> (create new file or replace existing file)
15+
<file name1>: "<code>", // indicate writing <code> into <file name1> (create a new file or update an existing file)
1616
{% if with_del %}
17-
<file name2>: "__DEL__" // indicate removing file name2. When we want to replace a file to a new one, we usually use this
17+
<file name2>: "__DEL__" // indicate removing file name2. When we want to just remove a file or replace a file with a new one, we usually use this
1818
{% else %}
19-
<file name2>(optional): "<code>" // indicate writing <code> into <file name2> (create new file or replace existing file)
19+
<file name2> (optional): "<code>" // indicate writing <code> into <file name2> (create a new file or update an existing file)
2020
{% endif %}
2121
}

0 commit comments

Comments
 (0)