Commit f7b5258

Authored by TPLin22, peteryang1, and peteryangms
feat: reanalyze competition info & pipeline coding evaluator prompt (#837)
* update the coding evaluator prompt to be more similar to feedback
* reanalyze the competition description after three consecutive coding failures
* update the reanalyzing-competition implementation
* fix bug
* update prompts and reanalyze
* fix bugs
* ci issue
* improve some code
* fix CI

Co-authored-by: Xu Yang <[email protected]>
1 parent efedddf commit f7b5258

File tree

4 files changed: +50 −18 lines

- rdagent/app/data_science/conf.py
- rdagent/app/data_science/loop.py
- rdagent/components/coder/data_science/pipeline/prompts.yaml
- rdagent/scenarios/data_science/scen/__init__.py

rdagent/app/data_science/conf.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -16,6 +16,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     ## Workflow Related
     consecutive_errors: int = 5
 
+    ## Coding Related
+    coding_fail_reanalyze_threshold: int = 3
+
     debug_timeout: int = 600
     """The timeout limit for running on debugging data"""
     full_timeout: int = 3600
```
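
As a quick illustration, here is a minimal sketch of tuning the new threshold at runtime. The import path for the module-level `DS_RD_SETTING` instance is an assumption; the loop.py diff below only shows the name:

```python
# Minimal sketch, assuming DS_RD_SETTING is the module-level instance of
# DataScienceBasePropSetting exposed by this conf module (the import path is
# an assumption; loop.py below only shows the name DS_RD_SETTING).
from rdagent.app.data_science.conf import DS_RD_SETTING

# Default: reanalyze the competition after 3 consecutive coding failures.
print(DS_RD_SETTING.coding_fail_reanalyze_threshold)  # 3

# Hypothetical override: be more patient and wait for 5 failures.
DS_RD_SETTING.coding_fail_reanalyze_threshold = 5
```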

rdagent/app/data_science/loop.py

Lines changed: 25 additions & 15 deletions

```diff
@@ -137,7 +137,6 @@ def feedback(self, prev_out: dict[str, Any]) -> ExperimentFeedback:
         return feedback
 
     def record(self, prev_out: dict[str, Any]):
-
         # set the DAG parent for the trace
         self.trace.sync_dag_parent_and_hist()
 
@@ -151,19 +150,31 @@ def record(self, prev_out: dict[str, Any]):
                     ExperimentFeedback.from_exception(e),
                 )
             )
-        if (
-            self.trace.sota_experiment() is None
-            and len(self.trace.hist) >= DS_RD_SETTING.consecutive_errors
-            and not DS_RD_SETTING.coder_on_whole_pipeline
-        ):
-            # if {in initial/drafting stage} and {tried enough times}
-            for _, fb in self.trace.hist[-DS_RD_SETTING.consecutive_errors :]:
-                if fb:
-                    break  # any success will stop restarting.
-            else:  # otherwise restart it
-                logger.error("Consecutive errors reached the limit. Dumping trace.")
-                logger.log_object(self.trace, tag="trace before restart")
-                self.trace = DSTrace(scen=self.trace.scen, knowledge_base=self.trace.knowledge_base)
+        if self.trace.sota_experiment() is None:
+            if DS_RD_SETTING.coder_on_whole_pipeline:
+                # check if feedback is not generated
+                if len(self.trace.hist) >= DS_RD_SETTING.coding_fail_reanalyze_threshold:
+                    recent_hist = self.trace.hist[-DS_RD_SETTING.coding_fail_reanalyze_threshold :]
+                    if all(isinstance(fb.exception, (CoderError, RunnerError)) for _, fb in recent_hist):
+                        new_scen = self.trace.scen
+                        if hasattr(new_scen, "reanalyze_competition_description"):
+                            logger.info(
+                                "Reanalyzing the competition description after three consecutive coding failures."
+                            )
+                            new_scen.reanalyze_competition_description()
+                            self.trace = DSTrace(scen=new_scen, knowledge_base=self.trace.knowledge_base)
+                        else:
+                            logger.info("Cannot reanalyze the competition description.")
+            elif len(self.trace.hist) >= DS_RD_SETTING.consecutive_errors:
+                # if {in initial/drafting stage} and {tried enough times}
+                for _, fb in self.trace.hist[-DS_RD_SETTING.consecutive_errors :]:
+                    if fb:
+                        break  # any success will stop restarting.
+                else:  # otherwise restart it
+                    logger.error("Consecutive errors reached the limit. Dumping trace.")
+                    logger.log_object(self.trace, tag="trace before restart")
+                    self.trace = DSTrace(scen=self.trace.scen, knowledge_base=self.trace.knowledge_base)
+
         logger.log_object(self.trace, tag="trace")
         logger.log_object(self.trace.sota_experiment(), tag="SOTA experiment")
 
@@ -294,7 +305,6 @@ def main(
     DS_RD_SETTING.competition = competition
 
     if DS_RD_SETTING.competition:
-
         if DS_RD_SETTING.scen.endswith("KaggleScen"):
            download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING)
        else:
```
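
The new branch leans on two subtle Python idioms: the truthiness of feedback objects and the `for ... else` clause. Here is a self-contained sketch with dummy stand-ins for `CoderError`, `RunnerError`, and `ExperimentFeedback` (all invented for illustration):

```python
from dataclasses import dataclass


class CoderError(Exception):
    """Dummy stand-in for rdagent's CoderError."""


class RunnerError(Exception):
    """Dummy stand-in for rdagent's RunnerError."""


@dataclass
class Feedback:
    exception: Exception | None = None

    def __bool__(self) -> bool:
        # Truthy feedback means the step succeeded, mirroring ExperimentFeedback.
        return self.exception is None


hist = [
    (None, Feedback(CoderError("draft failed"))),
    (None, Feedback(RunnerError("run failed"))),
    (None, Feedback(CoderError("fix failed"))),
]

# Reanalyze trigger: every step in the recent window failed with a coding/runner error.
recent = hist[-3:]
print(all(isinstance(fb.exception, (CoderError, RunnerError)) for _, fb in recent))  # True

# Restart trigger: `for ... else` runs the else-branch only if the loop was
# never broken out of, i.e. no recent feedback was truthy (successful).
for _, fb in recent:
    if fb:
        break  # any success stops the restart
else:
    print("no success in the window -> restart the trace")
```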

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 16 additions & 3 deletions

````diff
@@ -114,8 +114,21 @@ pipeline_eval:
 
     ## Evaluation Scope
     Your focus is to check whether the workflow code:
-    1. Executes successfully, correctly generating a final submission.
-    2. Generates predictions in the correct format, ensuring they align with the submission structure!
+    Step 1: Executes successfully, correctly generating a final submission.
+
+    Step 2: Generates predictions in the correct format, ensuring that they align with the submission structure, the index and column names match the sample, and the content is not empty or apparently incorrect.
+
+    Step 3: Aligns with the competition requirements. This includes:
+    - CAREFULLY ANALYZE WHETHER THE EXPERIMENTAL SETUP AND CODE MAY CAUSE MISALIGNMENT BETWEEN VALIDATION AND TEST PERFORMANCE.
+    - Confirm strict adherence to the competition's evaluation rules listed in `scenario`:
+      - Exact match between the metric's implementation code and the scenario's requirements. The metric number is not the focus.
+      - Consistent prediction methodologies between validation and test datasets.
+      - No shortcuts or fold-specific strategies applied inconsistently.
+      - Rigorous checks for corner-case consistency.
+    - If such discrepancies or risks are found:
+      - Clearly document these issues in `code`.
+      - Begin your `code` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
+    - If no issues are found, begin your `code` with `[Code analysis]`, providing a detailed analysis of the code quality, readability, and adherence to specifications.
 
     ## Evaluation Criteria
     You will be given the execution output (`stdout`) to determine correctness.
@@ -129,7 +142,7 @@ pipeline_eval:
     {
         "execution": "Describe whether the code executed successfully, correctly integrating all components and generating the final submission. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information.",
         "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission, checking the index, column names, and CSV content.",
-        "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
+        "code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide feedback on code quality, readability, adherence to the given specifications, and alignment with competition requirements.",
         "final_decision": <true/false>
     }
     ```
````
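
For illustration, a hypothetical evaluator response following the updated format (every field value below is invented):

```python
# Hypothetical evaluator output illustrating the new prefix convention in the
# `code` field; all values are invented for illustration.
evaluator_response = {
    "execution": "The pipeline ran end to end and wrote submission.csv without errors.",
    "return_checking": "submission.csv matches the sample submission's index and column names.",
    "code": (
        "[Evaluation error] Validation is scored with macro-averaged F1, but the "
        "scenario requires micro-averaged F1, so validation and test performance may diverge."
    ),
    "final_decision": False,
}
```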

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -48,6 +48,12 @@ def __init__(self, competition: str) -> None:
             self._get_direction()
         )  # True indicates higher is better, False indicates lower is better
 
+    def reanalyze_competition_description(self):
+        self.raw_description = self._get_description()
+        self.processed_data_folder_description = self._get_data_folder_description()
+        self._analysis_competition_description()
+        self.metric_direction: bool = self._get_direction()
+
     def _get_description(self):
         if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}/description.md")).exists():
             return fp.read_text()
```
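
A minimal usage sketch mirroring the `hasattr` guard that loop.py now applies before calling the new method (`maybe_reanalyze` is a hypothetical helper, not part of the commit):

```python
def maybe_reanalyze(scen) -> bool:
    """Re-run the competition analysis if the scenario supports it (hypothetical helper)."""
    if hasattr(scen, "reanalyze_competition_description"):
        # Refreshes raw_description, processed_data_folder_description, the
        # competition analysis, and metric_direction, as added above.
        scen.reanalyze_competition_description()
        return True
    return False
```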
