Skip to content

Commit a16b70f

Browse files
XianBWyou-n-gWinstonLiyt
authored
feat: make spec optional (#719)
* feat: Add spec_enabled configuration for data science settings * make spec alternative * change spec logic in exp_gen * remove some general texts * align --------- Co-authored-by: Young <[email protected]> Co-authored-by: yuanteli <[email protected]>
1 parent 8e50793 commit a16b70f

File tree

20 files changed

+334
-92
lines changed

20 files changed

+334
-92
lines changed

rdagent/app/data_science/conf.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,15 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
1919
full_timeout: int = 3600
2020
"""The timeout limit for running on full data"""
2121

22+
### specific feature
23+
24+
#### enable specification
25+
spec_enabled: bool = True
26+
# - [ ] rdagent/components/coder/data_science/raw_data_loader/__init__.py: make spec implementation optional
27+
# - [ ] move spec responsibility into rdagent/scenarios/data_science/share.yaml
28+
# - [ ] make all spec.md optional; but replace it with the test & responsibility. "spec/.*\.md".
29+
# - [ ] replace yaml render with target test. "spec > .yaml data_science !out_spec !task_spec model_spec"
30+
# - [ ] At the head of all tests, emphasize the function to be tested.
31+
2232

2333
DS_RD_SETTING = DataScienceBasePropSetting()

rdagent/components/coder/data_science/ensemble/__init__.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,12 @@
1212
"""
1313

1414
import json
15+
from pathlib import Path
1516
from typing import Dict
1617

18+
from jinja2 import Environment, StrictUndefined
19+
20+
from rdagent.app.data_science.conf import DS_RD_SETTING
1721
from rdagent.components.coder.CoSTEER import CoSTEER
1822
from rdagent.components.coder.CoSTEER.evaluators import (
1923
CoSTEERMultiEvaluator,
@@ -35,6 +39,8 @@
3539
from rdagent.utils.agent.ret import PythonAgentOut
3640
from rdagent.utils.agent.tpl import T
3741

42+
DIRNAME = Path(__file__).absolute().resolve().parent
43+
3844

3945
class EnsembleMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
4046
def implement_one_task(
@@ -79,8 +85,24 @@ def implement_one_task(
7985
all_code=workspace.all_codes,
8086
out_spec=PythonAgentOut.get_spec(),
8187
)
88+
89+
if DS_RD_SETTING.spec_enabled:
90+
code_spec = workspace.file_dict["spec/ensemble.md"]
91+
else:
92+
test_code = (
93+
Environment(undefined=StrictUndefined)
94+
.from_string((DIRNAME / "eval_tests" / "ensemble_test.txt").read_text())
95+
.render(
96+
model_names=[
97+
fn[:-3] for fn in workspace.file_dict.keys() if fn.startswith("model_") and "test" not in fn
98+
]
99+
)
100+
)
101+
code_spec = T("scenarios.data_science.share:component_spec.general").r(
102+
spec=T("scenarios.data_science.share:component_spec.Ensemble").r(), test_code=test_code
103+
)
82104
user_prompt = T(".prompts:ensemble_coder.user").r(
83-
ensemble_spec=workspace.file_dict["spec/ensemble.md"],
105+
code_spec=code_spec,
84106
latest_code=workspace.file_dict.get("ensemble.py"),
85107
latest_code_feedback=prev_task_feedback,
86108
)

rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
"""
2-
A qualified ensemble implementation should:
3-
- Successfully run
2+
Tests for `ensemble_workflow` in ensemble.py
3+
4+
A qualified ensemble_workflow implementation should:
45
- Return predictions
56
- Have correct shapes for inputs and outputs
67
- Use validation data appropriately
8+
- Generate a scores.csv file
79
"""
810

911
import numpy as np

rdagent/components/coder/data_science/ensemble/prompts.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ ensemble_coder:
5151
{% endif %}
5252
5353
user: |-
54-
--------- Ensemble Specification ---------
55-
{{ ensemble_spec }}
54+
--------- Code Specification ---------
55+
{{ code_spec }}
5656
5757
{% if latest_code %}
5858
--------- Former code ---------

rdagent/components/coder/data_science/feature/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import json
2+
from pathlib import Path
23
from typing import Dict
34

5+
from rdagent.app.data_science.conf import DS_RD_SETTING
46
from rdagent.components.coder.CoSTEER import CoSTEER
57
from rdagent.components.coder.CoSTEER.evaluators import (
68
CoSTEERMultiEvaluator,
@@ -22,6 +24,8 @@
2224
from rdagent.utils.agent.ret import PythonAgentOut
2325
from rdagent.utils.agent.tpl import T
2426

27+
DIRNAME = Path(__file__).absolute().resolve().parent
28+
2529

2630
class FeatureMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
2731
def implement_one_task(
@@ -64,8 +68,16 @@ def implement_one_task(
6468
queried_former_failed_knowledge=queried_former_failed_knowledge[0],
6569
out_spec=PythonAgentOut.get_spec(),
6670
)
71+
code_spec = (
72+
workspace.file_dict["spec/feature.md"]
73+
if DS_RD_SETTING.spec_enabled
74+
else T("scenarios.data_science.share:component_spec.general").r(
75+
spec=T("scenarios.data_science.share:component_spec.FeatureEng").r(),
76+
test_code=(DIRNAME / "eval_tests" / "feature_test.txt").read_text(),
77+
)
78+
)
6779
user_prompt = T(".prompts:feature_coder.user").r(
68-
feature_spec=workspace.file_dict["spec/feature.md"],
80+
code_spec=code_spec,
6981
latest_code=workspace.file_dict.get("feature.py"),
7082
latest_code_feedback=prev_task_feedback,
7183
)

rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,5 @@
11
"""
2-
A qualified data loader should support following features
3-
- successfully run
4-
- len(test) == len(test_ids) == submission length
5-
- len(train) == len(y)
6-
7-
Please make sure the stdout is rich enough to support informative feedback
2+
Tests for `feat_eng` in feature.py
83
"""
94

105
import pickle

rdagent/components/coder/data_science/feature/prompts.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ feature_coder:
5555
{% endif %}
5656
5757
user: |-
58-
--------- Feature Processing Specification ---------
59-
{{ feature_spec }}
58+
--------- Code Specification ---------
59+
{{ code_spec }}
6060
6161
{% if latest_code %}
6262
--------- Former code ---------

rdagent/components/coder/data_science/model/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
from pathlib import Path
12
from typing import Dict
23

4+
from rdagent.app.data_science.conf import DS_RD_SETTING
35
from rdagent.components.coder.CoSTEER import CoSTEER
46
from rdagent.components.coder.CoSTEER.evaluators import (
57
CoSTEERMultiEvaluator,
@@ -23,6 +25,8 @@
2325
from rdagent.utils.agent.ret import PythonBatchEditOut
2426
from rdagent.utils.agent.tpl import T
2527

28+
DIRNAME = Path(__file__).absolute().resolve().parent
29+
2630

2731
class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
2832
def implement_one_task(
@@ -71,8 +75,16 @@ def implement_one_task(
7175
# latest_code=workspace.file_dict.get(f"{target_task.name}.py", None),
7276
# )
7377
# We want to use a simpler way to
78+
code_spec = (
79+
workspace.file_dict["spec/model.md"]
80+
if DS_RD_SETTING.spec_enabled
81+
else T("scenarios.data_science.share:component_spec.general").r(
82+
spec=T("scenarios.data_science.share:component_spec.Model").r(),
83+
test_code=(DIRNAME / "eval_tests" / "model_test.txt").read_text().replace("model01", target_task.name),
84+
)
85+
)
7486
user_prompt = T(".prompts:model_coder.user_general").r(
75-
model_spec=workspace.file_dict["spec/model.md"],
87+
code_spec=code_spec,
7688
latest_model_code=workspace.get_codes(
7789
r"^model_(?!test)\w+\.py$"
7890
), # TODO: If we have high failure rate here, we should clean this step with less information.

rdagent/components/coder/data_science/model/eval.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,6 @@ def evaluate(
9898
task_desc=target_task.get_task_information(),
9999
test_code=test_code,
100100
code=implementation.file_dict[f"{target_task.name}.py"],
101-
scenario=self.scen.get_scenario_all_desc(),
102-
spec=implementation.file_dict["spec/model.md"],
103101
workflow_stdout=workflow_stdout,
104102
workflow_code=implementation.all_codes,
105103
)

rdagent/components/coder/data_science/model/eval_tests/model_test.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
"""
2+
Tests for `model_workflow` in model01.py
3+
"""
14
import time
25

36
from feature import feat_eng

0 commit comments

Comments
 (0)