Commit f7b5258

Authored by TPLin22, peteryang1, and peteryangms
feat: reanalyze competition info & pipeline coding evaluator prompt (#837)
* update the coding evaluator prompt to be more similar to feedback
* reanalyze the competition description after three consecutive coding failures
* update the reanalyzing-competition implementation
* fix bug
* update prompts and reanalyze
* fix bugs
* ci issue
* improve some code
* fix CI

Co-authored-by: Xu Yang <[email protected]>
1 parent efedddf commit f7b5258

File tree

4 files changed: +50 −18 lines

- rdagent/app/data_science/conf.py
- rdagent/app/data_science/loop.py
- rdagent/components/coder/data_science/pipeline/prompts.yaml
- rdagent/scenarios/data_science/scen/__init__.py

rdagent/app/data_science/conf.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -16,6 +16,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     ## Workflow Related
     consecutive_errors: int = 5
 
+    ## Coding Related
+    coding_fail_reanalyze_threshold: int = 3
+
     debug_timeout: int = 600
     """The timeout limit for running on debugging data"""
     full_timeout: int = 3600
```
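
As a quick illustration, here is a minimal sketch of tuning the new threshold at runtime. The import path for the module-level `DS_RD_SETTING` instance is an assumption; the loop.py diff below only shows the name:

```python
# Minimal sketch, assuming DS_RD_SETTING is the module-level instance of
# DataScienceBasePropSetting exposed by this conf module (the import path is
# an assumption; loop.py below only shows the name DS_RD_SETTING).
from rdagent.app.data_science.conf import DS_RD_SETTING

# Default: reanalyze the competition after 3 consecutive coding failures.
print(DS_RD_SETTING.coding_fail_reanalyze_threshold)  # 3

# Hypothetical override: be more patient and wait for 5 failures.
DS_RD_SETTING.coding_fail_reanalyze_threshold = 5
```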

rdagent/app/data_science/loop.py

Lines changed: 25 additions & 15 deletions

```diff
@@ -137,7 +137,6 @@ def feedback(self, prev_out: dict[str, Any]) -> ExperimentFeedback:
         return feedback
 
     def record(self, prev_out: dict[str, Any]):
-
         # set the DAG parent for the trace
         self.trace.sync_dag_parent_and_hist()
 
@@ -151,19 +150,31 @@ def record(self, prev_out: dict[str, Any]):
                     ExperimentFeedback.from_exception(e),
                 )
             )
-        if (
-            self.trace.sota_experiment() is None
-            and len(self.trace.hist) >= DS_RD_SETTING.consecutive_errors
-            and not DS_RD_SETTING.coder_on_whole_pipeline
-        ):
-            # if {in initial/drafting stage} and {tried enough times}
-            for _, fb in self.trace.hist[-DS_RD_SETTING.consecutive_errors :]:
-                if fb:
-                    break  # any success will stop restarting.
-            else:  # otherwise restart it
-                logger.error("Consecutive errors reached the limit. Dumping trace.")
-                logger.log_object(self.trace, tag="trace before restart")
-                self.trace = DSTrace(scen=self.trace.scen, knowledge_base=self.trace.knowledge_base)
+        if self.trace.sota_experiment() is None:
+            if DS_RD_SETTING.coder_on_whole_pipeline:
+                # check if feedback is not generated
+                if len(self.trace.hist) >= DS_RD_SETTING.coding_fail_reanalyze_threshold:
+                    recent_hist = self.trace.hist[-DS_RD_SETTING.coding_fail_reanalyze_threshold :]
+                    if all(isinstance(fb.exception, (CoderError, RunnerError)) for _, fb in recent_hist):
+                        new_scen = self.trace.scen
+                        if hasattr(new_scen, "reanalyze_competition_description"):
+                            logger.info(
+                                "Reanalyzing the competition description after three consecutive coding failures."
+                            )
+                            new_scen.reanalyze_competition_description()
+                            self.trace = DSTrace(scen=new_scen, knowledge_base=self.trace.knowledge_base)
+                        else:
+                            logger.info("Cannot reanalyze the competition description.")
+            elif len(self.trace.hist) >= DS_RD_SETTING.consecutive_errors:
+                # if {in initial/drafting stage} and {tried enough times}
+                for _, fb in self.trace.hist[-DS_RD_SETTING.consecutive_errors :]:
+                    if fb:
+                        break  # any success will stop restarting.
+                else:  # otherwise restart it
+                    logger.error("Consecutive errors reached the limit. Dumping trace.")
+                    logger.log_object(self.trace, tag="trace before restart")
+                    self.trace = DSTrace(scen=self.trace.scen, knowledge_base=self.trace.knowledge_base)
+
         logger.log_object(self.trace, tag="trace")
         logger.log_object(self.trace.sota_experiment(), tag="SOTA experiment")
 
@@ -294,7 +305,6 @@ def main(
     DS_RD_SETTING.competition = competition
 
     if DS_RD_SETTING.competition:
-
         if DS_RD_SETTING.scen.endswith("KaggleScen"):
            download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING)
        else:
```
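
The new branch leans on two subtle Python idioms: the truthiness of feedback objects and the `for ... else` clause. Here is a self-contained sketch with dummy stand-ins for `CoderError`, `RunnerError`, and `ExperimentFeedback` (all invented for illustration):

```python
from dataclasses import dataclass


class CoderError(Exception):
    """Dummy stand-in for rdagent's CoderError."""


class RunnerError(Exception):
    """Dummy stand-in for rdagent's RunnerError."""


@dataclass
class Feedback:
    exception: Exception | None = None

    def __bool__(self) -> bool:
        # Truthy feedback means the step succeeded, mirroring ExperimentFeedback.
        return self.exception is None


hist = [
    (None, Feedback(CoderError("draft failed"))),
    (None, Feedback(RunnerError("run failed"))),
    (None, Feedback(CoderError("fix failed"))),
]

# Reanalyze trigger: every step in the recent window failed with a coding/runner error.
recent = hist[-3:]
print(all(isinstance(fb.exception, (CoderError, RunnerError)) for _, fb in recent))  # True

# Restart trigger: `for ... else` runs the else-branch only if the loop was
# never broken out of, i.e. no recent feedback was truthy (successful).
for _, fb in recent:
    if fb:
        break  # any success stops the restart
else:
    print("no success in the window -> restart the trace")
```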

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 16 additions & 3 deletions

````diff
@@ -114,8 +114,21 @@ pipeline_eval:
 
     ## Evaluation Scope
     Your focus is to check whether the workflow code:
-    1. Executes successfully, correctly generating a final submission.
-    2. Generates predictions in the correct format, ensuring they align with the submission structure!
+    Step 1: Executes successfully, correctly generating a final submission.
+
+    Step 2: Generates predictions in the correct format, ensuring that they align with the submission structure, the index and column names match the sample, and the content is not empty or apparently incorrect.
+
+    Step 3: Aligns with the competition requirements. This includes:
+    - CAREFULLY ANALYZE WHETHER THE EXPERIMENTAL SETUP AND CODE MAY CAUSE MISALIGNMENT BETWEEN VALIDATION AND TEST PERFORMANCE.
+    - Confirm strict adherence to the competition's evaluation rules listed in `scenario`:
+      - Exact match between the metric's implementation code and the scenario's requirements. The metric number is not the focus.
+      - Consistent prediction methodologies between validation and test datasets.
+      - No shortcuts or fold-specific strategies applied inconsistently.
+      - Rigorous checks for corner-case consistency.
+    - If such discrepancies or risks are found:
+      - Clearly document these issues in `code`.
+      - Begin your `code` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
+    - If no issues are found, begin your `code` with `[Code analysis]`, providing a detailed analysis of the code quality, readability, and adherence to specifications.
 
     ## Evaluation Criteria
     You will be given the execution output (`stdout`) to determine correctness.
@@ -129,7 +142,7 @@ pipeline_eval:
     {
         "execution": "Describe whether the code executed successfully, correctly integrating all components and generating the final submission. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information.",
         "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission, checking the index, column names, and CSV content.",
-        "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
+        "code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide feedback on code quality, readability, adherence to the given specifications, and alignment with competition requirements.",
         "final_decision": <true/false>
     }
     ```
````
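
For illustration, a hypothetical evaluator response following the updated format (every field value below is invented):

```python
# Hypothetical evaluator output illustrating the new prefix convention in the
# `code` field; all values are invented for illustration.
evaluator_response = {
    "execution": "The pipeline ran end to end and wrote submission.csv without errors.",
    "return_checking": "submission.csv matches the sample submission's index and column names.",
    "code": (
        "[Evaluation error] Validation is scored with macro-averaged F1, but the "
        "scenario requires micro-averaged F1, so validation and test performance may diverge."
    ),
    "final_decision": False,
}
```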

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -48,6 +48,12 @@ def __init__(self, competition: str) -> None:
             self._get_direction()
         )  # True indicates higher is better, False indicates lower is better
 
+    def reanalyze_competition_description(self):
+        self.raw_description = self._get_description()
+        self.processed_data_folder_description = self._get_data_folder_description()
+        self._analysis_competition_description()
+        self.metric_direction: bool = self._get_direction()
+
     def _get_description(self):
         if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}/description.md")).exists():
             return fp.read_text()
```
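
A minimal usage sketch mirroring the `hasattr` guard that loop.py now applies before calling the new method (`maybe_reanalyze` is a hypothetical helper, not part of the commit):

```python
def maybe_reanalyze(scen) -> bool:
    """Re-run the competition analysis if the scenario supports it (hypothetical helper)."""
    if hasattr(scen, "reanalyze_competition_description"):
        # Refreshes raw_description, processed_data_folder_description, the
        # competition analysis, and metric_direction, as added above.
        scen.reanalyze_competition_description()
        return True
    return False
```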
