
Commit b49481e

feat: dump model (#776)
* feat: add model dump flag and multi-evaluator support
* tmp code
* refactor: update evaluator feedback and FBWorkspace types
* feat: add get_clear_ws_cmd and CPU count in Docker environment
* feat: Add model dump check level and enhance evaluator functionality; fix data type bug
* fix: Ensure required files exist before model dump evaluation
* refactor: streamline prompt and file checks in model dump evaluation
* fix: add assertions and reorder file reads in model dump evaluator
* feat: remove EDA part from evaluation output
* docs: update dump_model guidelines and eval prompt to include template
* style: reformat multiline dicts and lists in conf and eval files
* fix: add DOTALL flag to EDA removal regex
1 parent 52efc85 commit b49481e

File tree: 13 files changed (+271 lines, -25 lines)

rdagent/app/data_science/conf.py

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,5 @@
+from typing import Literal
+
 from pydantic_settings import SettingsConfigDict
 
 from rdagent.app.kaggle.conf import KaggleBasePropSetting
@@ -34,5 +36,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
 
     rule_base_eval: bool = False
 
+    ### model dump
+    enable_model_dump: bool = False
+    model_dump_check_level: Literal["medium", "high"] = "medium"
+
 
 DS_RD_SETTING = DataScienceBasePropSetting()
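Both new settings are plain feature flags; the rest of this commit branches on them. A minimal sketch of how they are read (names taken from the other diffs in this commit; the helper itself is illustrative only, not part of the change):

```python
from rdagent.app.data_science.conf import DS_RD_SETTING


def describe_model_dump_config() -> str:
    """Illustration only: how the two new flags are consumed downstream."""
    if not DS_RD_SETTING.enable_model_dump:
        # Default: the model-dump guideline, evaluator and checks are all skipped.
        return "model dump disabled"
    # "medium" relies on the LLM-based review of the rerun; "high" additionally
    # compares scores.csv / submission.csv between training and the rerun.
    return f"model dump enabled, check level = {DS_RD_SETTING.model_dump_check_level}"
```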

rdagent/components/coder/CoSTEER/evaluators.py

Lines changed: 42 additions & 15 deletions
@@ -1,4 +1,5 @@
 from abc import abstractmethod
+from copy import deepcopy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List
 
@@ -197,7 +198,7 @@ def evaluate(
 class CoSTEERMultiEvaluator(CoSTEEREvaluator):
     """This is for evaluation of experiment. Due to we have multiple tasks, so we will return a list of evaluation feebacks"""
 
-    def __init__(self, single_evaluator: CoSTEEREvaluator, *args, **kwargs) -> None:
+    def __init__(self, single_evaluator: CoSTEEREvaluator | list[CoSTEEREvaluator], *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.single_evaluator = single_evaluator
 
@@ -207,30 +208,56 @@ def evaluate(
         queried_knowledge: QueriedKnowledge = None,
         **kwargs,
     ) -> CoSTEERMultiFeedback:
-        multi_implementation_feedback = multiprocessing_wrapper(
-            [
-                (
-                    self.single_evaluator.evaluate,
+        eval_l = self.single_evaluator if isinstance(self.single_evaluator, list) else [self.single_evaluator]
+        task_li_feedback_li = []
+        for ev in eval_l:
+            multi_implementation_feedback = multiprocessing_wrapper(
+                [
                     (
-                        evo.sub_tasks[index],
-                        evo.sub_workspace_list[index],
-                        evo.sub_gt_implementations[index] if evo.sub_gt_implementations is not None else None,
-                        queried_knowledge,
+                        ev.evaluate,
+                        (
+                            evo.sub_tasks[index],
+                            evo.sub_workspace_list[index],
+                            evo.sub_gt_implementations[index] if evo.sub_gt_implementations is not None else None,
+                            queried_knowledge,
+                        ),
+                    )
+                    for index in range(len(evo.sub_tasks))
+                ],
+                n=RD_AGENT_SETTINGS.multi_proc_n,
+            )
+            task_li_feedback_li.append(multi_implementation_feedback)
+        # merge the feedbacks
+        merged_task_feedback = []
+        for task_id, fb in enumerate(task_li_feedback_li[0]):
+            fb = deepcopy(fb)  # deep copy to make it more robust
+
+            fb.final_decision = all(
+                task_li_feedback[task_id].final_decision for task_li_feedback in task_li_feedback_li
+            )
+            for attr in "execution", "return_checking", "code":
+                setattr(
+                    fb,
+                    attr,
+                    "\n\n".join(
+                        [
+                            getattr(task_li_feedback[task_id], attr)
+                            for task_li_feedback in task_li_feedback_li
+                            if getattr(task_li_feedback[task_id], attr) is not None
+                        ]
                     ),
                 )
-                for index in range(len(evo.sub_tasks))
-            ],
-            n=RD_AGENT_SETTINGS.multi_proc_n,
-        )
+            merged_task_feedback.append(fb)
 
         final_decision = [
             None if single_feedback is None else single_feedback.final_decision
-            for single_feedback in multi_implementation_feedback
+            for single_feedback in merged_task_feedback
         ]
         logger.info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}")
 
+        # TODO: this is to be compatible with factor_implementation;
         for index in range(len(evo.sub_tasks)):
             if final_decision[index]:
                 evo.sub_tasks[index].factor_implementation = True
 
-        return CoSTEERMultiFeedback(multi_implementation_feedback)
+        return CoSTEERMultiFeedback(merged_task_feedback)
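The merge rule is the core of this change and is easy to miss in the diff: a task passes only if every evaluator accepts it, and the free-text fields are concatenated so no evaluator's comments are lost. A self-contained sketch of that rule (the `Feedback` dataclass is a simplified stand-in for `CoSTEERSingleFeedback`, illustration only):

```python
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional


@dataclass
class Feedback:  # simplified stand-in for CoSTEERSingleFeedback
    execution: Optional[str]
    return_checking: Optional[str]
    code: Optional[str]
    final_decision: bool


def merge(per_evaluator: list[list[Feedback]]) -> list[Feedback]:
    """Merge the per-task feedback lists produced by several evaluators."""
    merged = []
    for task_id, fb in enumerate(per_evaluator[0]):
        fb = deepcopy(fb)
        # A task is accepted only if every evaluator accepted it.
        fb.final_decision = all(fbs[task_id].final_decision for fbs in per_evaluator)
        # Text fields are concatenated so nothing any evaluator said is dropped.
        for attr in ("execution", "return_checking", "code"):
            setattr(
                fb,
                attr,
                "\n\n".join(
                    getattr(fbs[task_id], attr)
                    for fbs in per_evaluator
                    if getattr(fbs[task_id], attr) is not None
                ),
            )
        merged.append(fb)
    return merged


# Evaluator A accepts task 0, evaluator B (e.g. the model-dump check) rejects it.
a = [Feedback("ran ok", "files ok", "looks fine", True)]
b = [Feedback("ran ok", "model not dumped", "missing dump step", False)]
assert merge([a, b])[0].final_decision is False
assert "model not dumped" in merge([a, b])[0].return_checking
```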

rdagent/components/coder/data_science/conf.py

Lines changed: 13 additions & 0 deletions
@@ -1,5 +1,6 @@
 from typing import Literal
 
+from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
 from rdagent.utils.env import (
     CondaConf,
@@ -48,3 +49,15 @@ def get_ds_env(conf_type: Literal["kaggle", "mlebench"] = "kaggle") -> Env:
     else:
         raise ValueError(f"Unknown env type: {conf.env_type}")
     return env
+
+
+def get_clear_ws_cmd(stage: Literal["before_training", "before_inference"] = "before_training") -> str:
+    """
+    Clean the files in workspace to a specific stage
+    """
+    assert stage in ["before_training", "before_inference"], f"Unknown stage: {stage}"
+    if DS_RD_SETTING.enable_model_dump and stage == "before_training":
+        cmd = "rm -r submission.csv scores.csv models"
+    else:
+        cmd = "rm submission.csv scores.csv"
+    return cmd
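A short usage sketch of the helper (behaviour exactly as defined above, shown with `enable_model_dump` switched on):

```python
from rdagent.components.coder.data_science.conf import get_clear_ws_cmd

# With DS_RD_SETTING.enable_model_dump = True:
print(get_clear_ws_cmd())                          # "rm -r submission.csv scores.csv models"  (fresh training run)
print(get_clear_ws_cmd(stage="before_inference"))  # "rm submission.csv scores.csv"            (keep models/, rerun inference)
```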

rdagent/components/coder/data_science/pipeline/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -48,6 +48,7 @@
     DataLoaderCoSTEEREvaluator,
 )
 from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
+from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
 from rdagent.core.exception import CoderError
 from rdagent.core.experiment import FBWorkspace
 from rdagent.core.scenario import Scenario
@@ -95,6 +96,7 @@ def implement_one_task(
             out_spec=PythonAgentOut.get_spec(),
             runtime_environment=runtime_environment,
             spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
+            enable_model_dump=DS_RD_SETTING.enable_model_dump,
         )
         user_prompt = T(".prompts:pipeline_coder.user").r(
             competition_info=competition_info,
@@ -146,8 +148,12 @@ def __init__(
         **kwargs,
     ) -> None:
         settings = DSCoderCoSTEERSettings()
+        eval_l = [PipelineCoSTEEREvaluator(scen=scen)]
+        if DS_RD_SETTING.enable_model_dump:
+            eval_l.append(ModelDumpEvaluator(scen=scen, data_type="sample"))
+
         eva = CoSTEERMultiEvaluator(
-            PipelineCoSTEEREvaluator(scen=scen), scen=scen
+            single_evaluator=eval_l, scen=scen
         )  # Please specify whether you agree running your eva in parallel or not
         es = PipelineMultiProcessEvolvingStrategy(scen=scen, settings=settings)

rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 from rdagent.components.coder.CoSTEER.knowledge_management import (
     CoSTEERQueriedKnowledgeV2,
 )
-from rdagent.components.coder.data_science.conf import get_ds_env
+from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
 from rdagent.core.experiment import FBWorkspace, Task
 from rdagent.utils.agent.tpl import T
 from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
@@ -55,7 +55,7 @@ def evaluate(
         env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
 
         # Clean the scores.csv & submission.csv.
-        implementation.execute(env=env, entry=f"rm submission.csv scores.csv")
+        implementation.execute(env=env, entry=get_clear_ws_cmd())
         stdout, execute_ret_code = implementation.execute_ret_code(env=env, entry=f"python main.py")
         stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", stdout)

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 9 additions & 2 deletions
@@ -58,6 +58,13 @@ pipeline_coder:
     - An evaluation agent will help to check whether the EDA part is added correctly.
     - During the EDA part, you should try to avoid any irrelevant information sending to the standard output.
 
+    {% if enable_model_dump %}
+    ## Model Dumping
+    {% include "components.coder.data_science.share.prompts:dump_model_coder.guideline" %}
+    {% endif %}
+
+
+
     ## Output Format
     {% if out_spec %}
     {{ out_spec }}
@@ -125,10 +132,10 @@ pipeline_eval:
       "final_decision": <true/false>
     }
    ```
-
+
   user: |-
     --------- code generated by user ---------
     {{ code }}

     --------- code running stdout ---------
-    {{ stdout }}
+    {{ stdout }}
rdagent/components/coder/data_science/share/eval.py

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
+from pathlib import Path
+from typing import Literal
+
+import pandas as pd
+
+from rdagent.app.data_science.conf import DS_RD_SETTING
+from rdagent.components.coder.CoSTEER import CoSTEERMultiFeedback
+from rdagent.components.coder.CoSTEER.evaluators import (
+    CoSTEEREvaluator,
+    CoSTEERSingleFeedback,
+)
+from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
+from rdagent.components.coder.data_science.utils import remove_eda_part
+from rdagent.core.experiment import FBWorkspace, Task
+from rdagent.core.scenario import Scenario
+from rdagent.utils.agent.tpl import T
+from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
+
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+PipelineSingleFeedback = CoSTEERSingleFeedback
+PipelineMultiFeedback = CoSTEERMultiFeedback
+
+
+class ModelDumpEvaluator(CoSTEEREvaluator):
+    """This evaluator assumes that it runs after the model"""
+
+    def __init__(self, scen: Scenario, data_type: Literal["sample", "full"]):
+        super().__init__(scen)
+        self.data_type = data_type
+
+    def evaluate(
+        self, target_task: Task, implementation: FBWorkspace, gt_implementation: FBWorkspace, *kargs, **kwargs
+    ) -> CoSTEERSingleFeedback:
+
+        model_folder = implementation.workspace_path / "models"
+        # 1) Check if the model_folder is not empty
+        if not model_folder.exists() or not any(model_folder.iterdir()):
+            err_msg = "Model folder (`models` sub folder) is empty or does not exist. The model is not dumped."
+            return CoSTEERSingleFeedback(
+                execution=err_msg,
+                return_checking=err_msg,
+                code=err_msg,
+                final_decision=False,
+            )
+        env = get_ds_env()
+        env.conf.extra_volumes = {
+            f"{DS_RD_SETTING.local_data_path}/{'sample/' if self.data_type == 'sample' else ''}{self.scen.competition}": "/kaggle/input"
+        }
+
+        # 2) check the result and stdout after reruning the model.
+
+        # Remove the files submission.csv and scores.csv
+        implementation.execute(env=env, entry=get_clear_ws_cmd(stage="before_inference"))
+
+        # Execute the main script
+        stdout = remove_eda_part(implementation.execute(env=env, entry="python main.py"))
+
+        # walk model_folder and list the files
+        model_folder_files = [
+            str(file.relative_to(implementation.workspace_path)) for file in model_folder.iterdir() if file.is_file()
+        ]
+
+        # this will assert the generation of necessary files
+        for f in ["submission.csv", "scores.csv"]:
+            if not (implementation.workspace_path / f).exists():
+                err_msg = f"{f} does not exist. The model is not dumped. Make sure that the required files, like submission.csv and scores.csv, are created even if you bypass the model training step by loading the saved model file directly."
+                return CoSTEERSingleFeedback(
+                    execution=err_msg,
+                    return_checking=err_msg,
+                    code=err_msg,
+                    final_decision=False,
+                )
+
+        # Read the content of files submission.csv and scores.csv before execution
+        submission_content_before = (
+            (implementation.workspace_path / "submission.csv").read_text()
+            if (implementation.workspace_path / "submission.csv").exists()
+            else None
+        )
+        scores_content_before = (
+            (implementation.workspace_path / "scores.csv").read_text()
+            if (implementation.workspace_path / "scores.csv").exists()
+            else None
+        )
+
+        assert submission_content_before is not None
+        assert scores_content_before is not None
+
+        submission_content_after = (implementation.workspace_path / "submission.csv").read_text()
+        scores_content_after = (implementation.workspace_path / "scores.csv").read_text()
+
+        system_prompt = T(".prompts:dump_model_eval.system").r()
+        user_prompt = T(".prompts:dump_model_eval.user").r(
+            stdout=stdout.strip(),
+            code=implementation.all_codes,
+            model_folder_files=model_folder_files,
+            scores_content_before=scores_content_before,
+            scores_content_after=scores_content_after,
+        )
+
+        csfb = build_cls_from_json_with_retry(
+            CoSTEERSingleFeedback,
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+        )
+
+        if DS_RD_SETTING.model_dump_check_level == "high":
+            # Read the content of files submission.csv and scores.csv after execution
+            # Check if the content has changed
+            # excactly same checking. But it will take more user's time
+            if scores_content_before != scores_content_after:
+                return_msg = "\n[Error] The content of scores.csv has changed. Please check the code to ensure that the model is dumped correctly, and rerun the code to use the model directly without retraining it."
+                return_msg += f"\nBefore:\n{scores_content_before}\nAfter:\n{scores_content_after}"
+                if submission_content_before != submission_content_after:
+                    # If the scores file changes, display the two contents and append it into the return_checking
+                    return_msg = "[Error] The content of submission.csv has changed. Please check the code to ensure that the model is dumped correctly, and rerun the code to use the model directly without retraining it."
+                csfb.return_checking = (csfb.return_checking or "") + return_msg
+        return csfb
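At `model_dump_check_level == "high"` the evaluator is meant to go beyond the LLM review and require the recorded outputs to stay identical between the training run and the reload-and-infer rerun. A simplified, self-contained restatement of that comparison (function name and messages are illustrative, not part of the commit):

```python
from typing import Optional


def outputs_changed(scores_before: str, scores_after: str,
                    submission_before: str, submission_after: str) -> Optional[str]:
    """Return an error message if the rerun changed scores.csv or submission.csv."""
    msgs = []
    if scores_before != scores_after:
        msgs.append("[Error] scores.csv changed between training and the reload rerun.")
    if submission_before != submission_after:
        msgs.append("[Error] submission.csv changed between training and the reload rerun.")
    return "\n".join(msgs) or None


assert outputs_changed("auc,0.91", "auc,0.91", "id,pred", "id,pred") is None
assert outputs_changed("auc,0.91", "auc,0.88", "id,pred", "id,pred") is not None
```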
rdagent/components/coder/data_science/share/prompts.yaml

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+dump_model_coder:
+  guideline: |-
+    Please dump the model in a "models/" subfolder in the first running, and the script rerun performs inference without needing to retrain the model.
+    If there are parameters generated from the training data that might be needed for inference on test data, please save them in the "models/" subfolder as well.
+    Make sure that the required files, like submission.csv and scores.csv, are created even if you bypass the model training step by loading the saved model file directly.
+
+dump_model_eval:
+  system: |-
+    You are a data scientist tasked with evaluating code generation. You've developed a Kaggle competition code that can produce a submission file.
+    The code should follow the guideline below:
+    {% include "components.coder.data_science.share.prompts:dump_model_coder.guideline" %}
+
+    You will receive the following information:
+    - The implemented code
+    - The stdout from running the code
+    - The file list in "models/" subfolder
+    - The scores.csv file generated during both training and inference (if it exists)
+
+    Focus on these aspects:
+    - Check if the code saves the model in the "models/" subfolder.
+    - Ensure that when the code is rerun, it skips the training process and loads the model from the "models/" subfolder for direct inference.
+    - Verify that there is no training activity in the output.
+    - Ensure that even if you skip the model training by loading saved models, the files like scores.csv and submission.csv are still correctly created.
+    - The model's performance should remain consistent and not vary unreasonably between training and inference.
+
+    Please respond with your feedback in the following JSON format and order
+    ```json
+    {
+        "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. Carefully check the stdout to ensure that when the code is rerun, it skips the training process and loads the model from the 'models/' subfolder for direct inference. Append the information that makes you think that the model is still being retrained when rerunning the code."
+        "return_checking": "Verify the generated files include necessary files. Make sure scores.csv file does not change unreasonably between training and inference",
+        "code": "The code has explicity dump the model into 'models/' subfolder; When the modes files are already in 'models/' subfolder, the code will explicity skip the training process.",
+        "final_decision": <true or false in boolean type; only return true when ensuring that the code saves the model in a 'models/' subfolder, and the script rerun performs inference without needing to retrain the model.>
+    }
+    ```
+
+  user: |-
+    ------------ The implemented code ------------
+    {{code}}
+
+    ------------ The stdout from running the code ------------
+    {{stdout}}
+
+    ------------ The file list in "models/" subfolder ------------
+    {% for f in model_folder_files %}
+    - {{ f }}
+    {% endfor %}
+
+    ------------ The scores.csv file generated ------------
+    # Training:
+    {{scores_content_before}}
+
+    # Inference:
+    {{scores_content_after}}
+
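To make the guideline concrete, here is a hypothetical `main.py` skeleton that would satisfy it and both check levels: the model (a toy least-squares fit here) is dumped to `models/` on the first run, reloaded on the rerun, and `submission.csv`/`scores.csv` are written either way. The data, model and column names below are made up for illustration and are not part of the commit:

```python
from pathlib import Path
import pickle

import numpy as np
import pandas as pd

MODELS = Path("models")
MODELS.mkdir(exist_ok=True)
model_path = MODELS / "model.pkl"

# Toy data standing in for the competition's train/test split.
rng = np.random.default_rng(0)
X_train, y_train = rng.normal(size=(100, 3)), rng.normal(size=100)
X_test = rng.normal(size=(20, 3))

if model_path.exists():
    # Rerun: load the dumped model and skip training entirely.
    coef = pickle.loads(model_path.read_bytes())
else:
    # First run: "train" (a least-squares fit) and dump the result to models/.
    coef, *_ = np.linalg.lstsq(X_train, y_train, rcond=None)
    model_path.write_bytes(pickle.dumps(coef))

# Required outputs must exist after both the first run and the rerun.
pd.DataFrame({"prediction": X_test @ coef}).to_csv("submission.csv", index=False)
train_mse = float(np.mean((X_train @ coef - y_train) ** 2))
pd.DataFrame({"metric": ["train_mse"], "score": [train_mse]}).to_csv("scores.csv", index=False)
```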
rdagent/components/coder/data_science/utils.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+import re
+
+
+def remove_eda_part(stdout: str) -> str:
+    """Data Science scenario have a LLM-based EDA feature. We can remove it when current task does not involve EDA"""
+    return re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", stdout, flags=re.DOTALL)
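The `re.DOTALL` flag is the point of this helper: without it, `.` does not match newlines and a multi-line EDA block would survive. A quick usage sketch (the stdout text is made up):

```python
from rdagent.components.coder.data_science.utils import remove_eda_part

stdout = (
    "training fold 1...\n"
    "=== Start of EDA part ===\n"
    "rows: 1000\n"
    "cols: 12\n"
    "=== End of EDA part ===\n"
    "best score: 0.42\n"
)
print(remove_eda_part(stdout))
# training fold 1...
#
# best score: 0.42
```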
