
Commit 76df96e

fix: refine prompt (#987)

* refactor: rename failed_exp_and_feedback_list to include _after_sota suffix
* refactor: merge prompts_v3 into prompts_v2 and update references

1 parent: 41d0290

File tree

4 files changed: +24 −85 lines changed

rdagent/scenarios/data_science/proposal/exp_gen/base.py

Lines changed: 8 additions & 8 deletions

@@ -182,27 +182,27 @@ def experiment_and_feedback_list_after_init(
         final_component = self.COMPLETE_ORDER[-1]
         has_final_component = True if DS_RD_SETTING.coder_on_whole_pipeline else False
         SOTA_exp_and_feedback_list = []
-        failed_exp_and_feedback_list = []
+        failed_exp_and_feedback_list_after_sota = []
         for exp, fb in search_list:
             if has_final_component:
                 if fb.decision:
                     SOTA_exp_and_feedback_list.append((exp, fb))
-                    failed_exp_and_feedback_list = []
+                    failed_exp_and_feedback_list_after_sota = []
                 else:
-                    failed_exp_and_feedback_list.append((exp, fb))
+                    failed_exp_and_feedback_list_after_sota.append((exp, fb))
             if exp.hypothesis.component == final_component and fb:
                 has_final_component = True
-        if max_retrieve_num is not None and (SOTA_exp_and_feedback_list or failed_exp_and_feedback_list):
+        if max_retrieve_num is not None and (SOTA_exp_and_feedback_list or failed_exp_and_feedback_list_after_sota):
             SOTA_exp_and_feedback_list = SOTA_exp_and_feedback_list[
                 -min(max_retrieve_num, len(SOTA_exp_and_feedback_list)) :
             ]
-            failed_exp_and_feedback_list = failed_exp_and_feedback_list[
-                -min(max_retrieve_num, len(failed_exp_and_feedback_list)) :
+            failed_exp_and_feedback_list_after_sota = failed_exp_and_feedback_list_after_sota[
+                -min(max_retrieve_num, len(failed_exp_and_feedback_list_after_sota)) :
             ]
         if return_type == "all":
-            return SOTA_exp_and_feedback_list + failed_exp_and_feedback_list
+            return SOTA_exp_and_feedback_list + failed_exp_and_feedback_list_after_sota
         elif return_type == "failed":
-            return failed_exp_and_feedback_list
+            return failed_exp_and_feedback_list_after_sota
         elif return_type == "sota":
             return SOTA_exp_and_feedback_list
         else:
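The rename clarifies the list's semantics: it accumulates only the failures recorded since the most recent SOTA experiment, resetting whenever a new SOTA lands. A minimal standalone sketch of that reset behavior (toy data; the variable names mirror the diff, but this harness is a simplified illustration, not the actual RD-Agent code, and it omits the `max_retrieve_num` truncation):

```python
# Sketch of the reset-on-SOTA behavior behind the rename: the failed list
# only keeps failures that happened *after* the latest SOTA experiment.
def split_history(search_list):
    """search_list: iterable of (exp, decision) pairs, ordered oldest to newest."""
    sota_exp_and_feedback_list = []
    failed_exp_and_feedback_list_after_sota = []
    for exp, decision in search_list:
        if decision:  # this experiment surpassed the current SOTA
            sota_exp_and_feedback_list.append(exp)
            failed_exp_and_feedback_list_after_sota = []  # reset on new SOTA
        else:
            failed_exp_and_feedback_list_after_sota.append(exp)
    return sota_exp_and_feedback_list, failed_exp_and_feedback_list_after_sota

history = [("e1", True), ("e2", False), ("e3", True), ("e4", False), ("e5", False)]
sota, failed_after_sota = split_history(history)
print(sota)               # ['e1', 'e3']
print(failed_after_sota)  # ['e4', 'e5'] -- 'e2' was discarded when 'e3' became SOTA
```

Under the old name, `failed_exp_and_feedback_list` read as the full failure history; the `_after_sota` suffix makes the reset explicit.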

rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml

Lines changed: 12 additions & 13 deletions

@@ -5,8 +5,7 @@ scenario_problem:
 
 You will be provided with:
 1. A detailed competition scenario description;
-2. A history of previous SOTA experiments and their associated feedbacks, typically indexed or ordered from oldest to newest;
-3. A history of previous failed experiments and their associated feedbacks, chronologically ordered, where each failed experiment did not surpass the SOTA that was current at the time of its execution;
+2. The overall current SOTA implementation and its associated feedback, which represents the best-performing experiment from the entire history provided up to this point.
 
 Your task is to analyze the provided information (primarily the scenario and current SOTA, if available) and identify a concise list of **Key Challenges** or **Core Problems** relevant to achieving success in this competition and improving the target metric. Aim for **FEWER BUT BETTER** challenges (e.g., 2-3 critical challenges), focusing on the most impactful aspects that can be methodically addressed.

@@ -46,8 +45,8 @@ feedback_problem:
 
 You will be provided with:
 1. A detailed competition scenario description;
-2. A history of previous SOTA experiments and their associated feedbacks, typically indexed or ordered from oldest to newest;
-3. A history of previous failed experiments and their associated feedbacks, chronologically ordered, where each failed experiment did not surpass the SOTA that was current at the time of its execution;
+2. A history of previous successfully experiments and their associated feedbacks, indexed or ordered from oldest to newest; the latest SOTA experiment accumulates all the improvements from the previous successful experiments.
+3. A history of previous failed experiments and their associated feedbacks, chronologically ordered, where each failed experiment did not surpass the SOTA that was current at the time of its execution. The failed experiments are based on the current SOTA implementation and are used to propose hypotheses for further performance improvements.
 4. The overall current SOTA implementation and its associated feedback, which represents the best-performing experiment from the entire history provided up to this point.
 
 Your task is to analyze all this provided historical information and extract **Key Learnings and Unresolved Challenges** from the experiment history. These should guide concrete improvements in subsequent iterations.

@@ -99,7 +98,7 @@ feedback_problem:
 user: |-
 # Scenario Description
 {{ scenario_desc }}
-
+
 # Previous Experiments and Feedbacks
 {{ exp_and_feedback_list_desc }}
 

@@ -155,8 +154,8 @@ hypothesis_gen:
 The user is iteratively improving a Kaggle competition implementation. Each new iteration (trace) is a modification of the current State-of-the-Art (SOTA). If a new trace surpasses the current SOTA, it becomes the new SOTA. Otherwise, it's a failed experiment.
 You will be provided with:
 1. A detailed competition scenario description.
-2. Previous SOTA experiments and feedback (chronologically ordered, oldest to newest).
-3. Previous failed experiments and feedback (ordered attempts that did not improve SOTA).
+2. A history of previous successfully experiments and their associated feedbacks, indexed or ordered from oldest to newest; the latest SOTA experiment accumulates all the improvements from the previous successful experiments.
+3. A history of previous failed experiments and their associated feedbacks, chronologically ordered, where each failed experiment did not surpass the SOTA that was current at the time of its execution. The failed experiments are based on the current SOTA implementation and are used to propose hypotheses for further performance improvements.
 4. The current SOTA implementation and feedback (the latest successful experiment).
 5. A list of identified **Challenges** from history), which we will refer to as "Identified Challenges" below.

@@ -275,10 +274,9 @@ task_gen:
 
 You will be provided with the following inputs:
 1. **Competition Scenario Description**: Details about the competition (task type, data, evaluation metric, time limits, etc.).
-2. **Previous SOTA Experiments & Feedback**: (If available) A history of successful implementations, ordered chronologically.
-3. **Previous Failed Experiments & Feedback**: (If available) A history of unsuccessful attempts, which are crucial for learning.
-4. **Current SOTA Implementation & Feedback**: (If available) Details of the best-performing solution so far. **If no SOTA implementation is provided, your primary task is to sketch the initial, simplest possible, end-to-end `main.py` workflow.**
-5. **Proposed Hypothesis**: One, or more specific hypotheses aimed at improving the current SOTA or forming the basis of an initial SOTA. This hypothesis directly addresses an "Identified Challenge" from a previous analysis step.
+2. **Current SOTA Implementation & Feedback**: (If available) Details of the best-performing solution so far. **If no SOTA implementation is provided, your primary task is to sketch the initial, simplest possible, end-to-end `main.py` workflow.**
+3. **Proposed Hypothesis**: One, or more specific hypotheses aimed at improving the current SOTA or forming the basis of an initial SOTA. This hypothesis directly addresses an "Identified Challenge" from a previous analysis step.
+4. **Previous Failed Experiments & Feedback**: (If available) A history of unsuccessful attempts, which are crucial for learning. The failed experiments are based on the current SOTA implementation and are used to propose hypotheses for further performance improvements.
 
 Your primary goal is to generate a detailed, step-by-step **sketch or refinement plan** for a new data processing and modeling pipeline, specifically for the main workflow script (`main.py`), that effectively implements the `Proposed Hypothesis`. This sketch will guide a developer to write the code correctly.

@@ -381,7 +379,7 @@ task_gen:
 # Data Folder Structure (All files are under {% include "scenarios.data_science.share:scen.input_path" %})
 {{ data_folder_info }}
 
-# Current SOTA Implementation
+# Current SOTA Implementation & Feedback
 {{ sota_exp_desc }}
 
 # Proposed Hypothesis

@@ -393,7 +391,8 @@ task_gen:
 **Hypothesis:** {{ hypothesis.hypothesis }}
 
 {% endfor %}
-# Feedback from Previous Failed Experiments (e.g., experiments that did not pass evaluation, encountered bugs, or failed to surpass SOTA performance):
+# Previous Failed Experiments & Feedback (e.g., experiments that did not pass evaluation, encountered bugs, or failed to surpass SOTA performance)
+
 {{ failed_exp_and_feedback_list_desc }}
 
 idea_sample:

rdagent/scenarios/data_science/proposal/exp_gen/prompts_v3.yaml

Lines changed: 2 additions & 62 deletions

@@ -87,47 +87,7 @@ feedback_problem:
 {{ sota_exp_desc }}
 
 scenario_description: |-
-{% if use_raw_description -%}
-====== Background ======
-{{ raw_description }}
-
-{% else %}
-====== Background ======
-{{ background }}
-
-{% if eda_output is not none %}
-====== Data Overview (EDA) ======
-{{ eda_output }}
-{% endif %}
-
-====== Submission Format ======
-Please ensure your submission adheres to the following specifications:
-{{ submission_specifications }}
-
-====== Important Guidelines ======
-Before submitting your results, please note the following:
-- We have numerous tests in place to check your code.
-- Ensure your submission is genuine.
-- Do not manipulate data or return values solely to pass preliminary tests, as this will not lead to successful final evaluation.
-
-{% endif %}
-
-====== Evaluation ======
-{% if not use_raw_description and metric_name %}
-The primary evaluation metric for this task is: **{{ metric_name }}**.
-{% endif %}
-This metric is considered better when it is **{% if metric_direction %}larger{% else %}smaller{% endif %}**.
-
-{% if evaluation is not none %}
-Additional Evaluation Details:
-{{ evaluation }}
-{% endif %}
-
-{% if time_limit %}
-====== Time Limit ======
-Your code's execution is limited to **{{ time_limit }}**.
-Please optimize your model and parameters to ensure your code runs within this specified time constraint.
-{% endif %}
+{% include "scenarios.data_science.proposal.exp_gen.prompts_v2:scenario_description" %}
 
 hypothesis_gen:
 system: |-

@@ -320,24 +280,4 @@ task_gen:
 - Double-check that validation scores are saved correctly to `scores.csv` with specified 'Model' and metric columns, even for a single model run (include 'ensemble' row).
 
 user: |-
-# Competition Scenario Description
-{{ scenario_desc }}
-
-# Data Folder Structure (All files are under {% include "scenarios.data_science.share:scen.input_path" %})
-{{ data_folder_info }}
-
-# Current SOTA Implementation
-{{ sota_exp_desc }}
-
-# Proposed Hypothesis
-This sketch should implement the following hypotheses:
-
-{% for hypothesis in hypotheses %}
-## {{ hypothesis.problem_name }}
-**Why:** {{ hypothesis.problem_desc }}
-**Hypothesis:** {{ hypothesis.hypothesis }}
-
-{% endfor %}
-# Feedback from Previous Failed Experiments (e.g., experiments that did not pass evaluation, encountered bugs, or failed to surpass SOTA performance)
-
-{{ failed_exp_and_feedback_list_desc }}
+{% include "scenarios.data_science.proposal.exp_gen.prompts_v2:task_gen.user" %}
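The merge replaces duplicated template bodies in prompts_v3 with `{% include %}` directives pointing at the prompts_v2 copies. The `"module.path:key"` include syntax is resolved by RD-Agent's own template loader; in plain Jinja2 the same de-duplication can be sketched with a `DictLoader` (template names and bodies below are illustrative, not the real prompt files):

```python
from jinja2 import DictLoader, Environment

# Two prompt files where v3 reuses v2's body via include, mirroring the diff's
# de-duplication. The loader keys here are hypothetical stand-ins for RD-Agent's
# "module.path:key" template addressing.
templates = {
    "prompts_v2/scenario_description": "====== Background ======\n{{ background }}",
    "prompts_v3/scenario_description": "{% include 'prompts_v2/scenario_description' %}",
}
env = Environment(loader=DictLoader(templates))

# Rendering the v3 template transparently pulls in the v2 body.
rendered = env.get_template("prompts_v3/scenario_description").render(background="Kaggle")
print(rendered)  # ====== Background ======\nKaggle
```

Keeping a single source of truth for each prompt body means later wording fixes (like those in this commit) only need to land in prompts_v2.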

rdagent/scenarios/data_science/proposal/exp_gen/proposal.py

Lines changed: 2 additions & 2 deletions

@@ -731,7 +731,7 @@ def task_gen(
             component_desc=component_desc,
             workflow_check=not pipeline and hypotheses[0].component != "Workflow",
         )
-        user_prompt = T(".prompts_v3:task_gen.user").r(
+        user_prompt = T(".prompts_v2:task_gen.user").r(
             scenario_desc=scenario_desc,
             data_folder_info=data_folder_info,
             sota_exp_desc=sota_exp_desc,

@@ -774,7 +774,7 @@
         return exp

     def get_scenario_all_desc(self, trace: DSTrace, eda_output=None) -> str:
-        return T(".prompts_v3:scenario_description").r(
+        return T(".prompts_v2:scenario_description").r(
             background=trace.scen.background,
             submission_specifications=trace.scen.submission_specifications,
             evaluation=trace.scen.metric_description,
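The call sites follow RD-Agent's `T(path).r(**context)` pattern: look up a prompt template by path, then render it with keyword context. A hypothetical, simplified stand-in (the real `T` resolves dotted module paths and YAML keys; here a plain dict plays that role):

```python
from jinja2 import Template

# Hypothetical prompt registry standing in for RD-Agent's template resolution;
# the key and body are illustrative, not the actual prompts_v2.yaml contents.
PROMPTS = {
    ".prompts_v2:task_gen.user": "# Current SOTA Implementation & Feedback\n{{ sota_exp_desc }}",
}

class T:
    """Simplified sketch of the T(path).r(**context) helper used in proposal.py."""

    def __init__(self, path: str):
        self.source = PROMPTS[path]  # real T resolves "module.path:key" addresses

    def r(self, **context) -> str:
        return Template(self.source).render(**context)

out = T(".prompts_v2:task_gen.user").r(sota_exp_desc="baseline CV 0.81")
print(out)  # # Current SOTA Implementation & Feedback\nbaseline CV 0.81
```

Because the prompt is addressed by path at the call site, the commit's switch from prompts_v3 to prompts_v2 is a one-token change in each caller.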
