Skip to content

Commit 084dd6d

Browse files
you-n-g and peteryang1 authored
feat: condaenv & full docker env (#668)
* use conda to run kaggle and mlebench code
* refactor: Simplify environment configuration and execution logic
* add setting to use local env in ds
* refine dockerfile
* fix: Move MLEBDockerEnv initialization inside conditionals & fix condaenv
* refactor: reformat code for better readability and consistency
* feat: add conda env to all envs.
* fix: fix bugs when run loop
* refactor: Simplify DockerEnv configuration in mle_summary.py
* fix image bug
* style: reformat code for better readability and consistency
* change commit
* feat: Add entrypoint script for sing_docker scenario in rdagent
* refactor: add Any type hints and comments for clarity in env.py
* feat: Create log directory if it doesn't exist in entrypoint script
* feat: Add debug mode and list root directory in entrypoint script
* fix: Remove specific branch checkout in Dockerfile for RD-Agent
* fix: Add competition argument to loop.py script execution
* fix: Correct directory navigation and dependency installation in entrypoint.sh
* fix: Correct user ownership assignment in entrypoint script
* refactor: Comment out redundant log copying to RD_OUTPUT_DIR
* fix: Unset LOG_TRACE_PATH to prevent log contamination in entrypoint.sh

---------

Co-authored-by: Xu Yang <[email protected]>
1 parent 13f7922 commit 084dd6d

File tree

20 files changed

+834
-246
lines changed

20 files changed

+834
-246
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,6 @@ mlruns/
170170

171171
# shell script
172172
*.out
173-
*.sh
173+
/*.sh
174174
.aider*
175175
rdagent/app/benchmark/factor/example.json

rdagent/components/coder/data_science/conf.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,15 @@
1+
from typing import Literal
2+
13
from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
4+
from rdagent.utils.env import (
5+
CondaConf,
6+
DockerEnv,
7+
DSDockerConf,
8+
Env,
9+
LocalEnv,
10+
MLEBDockerConf,
11+
MLECondaConf,
12+
)
213

314

415
class DSCoderCoSTEERSettings(CoSTEERSettings):
@@ -8,3 +19,32 @@ class Config:
819
env_prefix = "DS_Coder_CoSTEER_"
920

1021
max_seconds: int = 2400
22+
env_type: str = "docker"
23+
# TODO: extract a function for env and conf.
24+
25+
26+
def get_ds_env(conf_type: Literal["kaggle", "mlebench"] = "kaggle") -> Env:
27+
"""
28+
Retrieve the appropriate environment configuration based on the env_type setting.
29+
30+
Returns:
31+
Env: An instance of the environment configured either as DockerEnv or LocalEnv.
32+
33+
Raises:
34+
ValueError: If the env_type is not recognized.
35+
"""
36+
conf = DSCoderCoSTEERSettings()
37+
assert conf_type in ["kaggle", "mlebench"], f"Unknown conf_type: {conf_type}"
38+
39+
if conf.env_type == "docker":
40+
env_conf = DSDockerConf() if conf_type == "kaggle" else MLEBDockerConf()
41+
env = DockerEnv(conf=env_conf)
42+
elif conf.env_type == "conda":
43+
env = LocalEnv(
44+
conf=(
45+
CondaConf(conda_env_name=conf_type) if conf_type == "kaggle" else MLECondaConf(conda_env_name=conf_type)
46+
)
47+
)
48+
else:
49+
raise ValueError(f"Unknown env type: {conf.env_type}")
50+
return env

rdagent/components/coder/data_science/ensemble/eval.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@
99
CoSTEEREvaluator,
1010
CoSTEERSingleFeedback,
1111
)
12+
from rdagent.components.coder.data_science.conf import get_ds_env
1213
from rdagent.core.evolving_framework import QueriedKnowledge
1314
from rdagent.core.experiment import FBWorkspace, Task
14-
from rdagent.oai.llm_utils import APIBackend
1515
from rdagent.utils.agent.tpl import T
1616
from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
17-
from rdagent.utils.env import DockerEnv, DSDockerConf
1817

1918
DIRNAME = Path(__file__).absolute().resolve().parent
2019

@@ -45,11 +44,8 @@ def evaluate(
4544
final_decision=False,
4645
)
4746

48-
ds_docker_conf = DSDockerConf()
49-
ds_docker_conf.extra_volumes = {
50-
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"
51-
}
52-
de = DockerEnv(conf=ds_docker_conf)
47+
env = get_ds_env()
48+
env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
5349

5450
fname = "test/ensemble_test.txt"
5551
test_code = (DIRNAME / "eval_tests" / "ensemble_test.txt").read_text()
@@ -64,12 +60,12 @@ def evaluate(
6460
)
6561

6662
implementation.inject_files(**{fname: test_code})
67-
stdout, ret_code = implementation.execute_ret_code(env=de, entry=f"python {fname}")
63+
stdout, ret_code = implementation.execute_ret_code(env=env, entry=f"python {fname}")
6864

6965
stdout += f"\nNOTE: the above scripts run with return code {ret_code}"
7066

7167
if "main.py" in implementation.file_dict:
72-
workflow_stdout = implementation.execute(env=de, entry="python main.py")
68+
workflow_stdout = implementation.execute(env=env, entry="python main.py")
7369
workflow_stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", workflow_stdout)
7470
else:
7571
workflow_stdout = None

rdagent/components/coder/data_science/feature/eval.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,11 @@
77
CoSTEEREvaluator,
88
CoSTEERSingleFeedback,
99
)
10+
from rdagent.components.coder.data_science.conf import get_ds_env
1011
from rdagent.core.evolving_framework import QueriedKnowledge
1112
from rdagent.core.experiment import FBWorkspace, Task
12-
from rdagent.oai.llm_utils import APIBackend
1313
from rdagent.utils.agent.tpl import T
1414
from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
15-
from rdagent.utils.env import DockerEnv, DSDockerConf
1615
from rdagent.utils.fmt import shrink_text
1716

1817
DIRNAME = Path(__file__).absolute().resolve().parent
@@ -45,22 +44,18 @@ def evaluate(
4544
final_decision=False,
4645
)
4746

48-
ds_docker_conf = DSDockerConf()
49-
# TODO: we should /= 20 for the timeout period on debug component
50-
ds_docker_conf.extra_volumes = {
51-
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"
52-
}
53-
de = DockerEnv(conf=ds_docker_conf)
47+
env = get_ds_env()
48+
env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
5449

5550
# TODO: do we need to clean the generated temporary content?
5651
fname = "test/feature_test.py"
5752
test_code = (DIRNAME / "eval_tests" / "feature_test.txt").read_text()
5853
implementation.inject_files(**{fname: test_code})
5954

60-
stdout = implementation.execute(env=de, entry=f"python {fname}")
55+
stdout = implementation.execute(env=env, entry=f"python {fname}")
6156

6257
if "main.py" in implementation.file_dict:
63-
workflow_stdout = implementation.execute(env=de, entry="python main.py")
58+
workflow_stdout = implementation.execute(env=env, entry="python main.py")
6459
workflow_stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", workflow_stdout)
6560
else:
6661
workflow_stdout = None

rdagent/components/coder/data_science/model/eval.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@
1212
CoSTEEREvaluator,
1313
CoSTEERSingleFeedback,
1414
)
15+
from rdagent.components.coder.data_science.conf import get_ds_env
1516
from rdagent.core.evolving_framework import QueriedKnowledge
1617
from rdagent.core.exception import CoderError
1718
from rdagent.core.experiment import FBWorkspace, Task
1819
from rdagent.oai.llm_utils import APIBackend
1920
from rdagent.utils.agent.tpl import T
2021
from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
21-
from rdagent.utils.env import DockerEnv, DSDockerConf
2222

2323
DIRNAME = Path(__file__).absolute().resolve().parent
2424
ModelSingleFeedback = CoSTEERSingleFeedback
@@ -56,26 +56,23 @@ def evaluate(
5656
final_decision=False,
5757
)
5858

59-
ds_docker_conf = DSDockerConf()
60-
ds_docker_conf.extra_volumes = {
61-
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"
62-
}
63-
de = DockerEnv(conf=ds_docker_conf)
59+
env = get_ds_env()
60+
env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
6461

6562
fname = "test/model_test.py"
6663
test_code = (
6764
(DIRNAME / "eval_tests" / "model_test.txt").read_text().replace("model01", target_task.name)
6865
) # only check the model changed this time
6966
implementation.inject_files(**{fname: test_code})
70-
stdout = implementation.execute(env=de, entry=f"python {fname}")
67+
stdout = implementation.execute(env=env, entry=f"python {fname}")
7168

7269
if stdout is None:
7370
raise CoderError(
7471
"The execution output contains too many progress bars and results in the LLM's token size exceeding the limit."
7572
)
7673

7774
if "main.py" in implementation.file_dict:
78-
workflow_stdout = implementation.execute(env=de, entry="python main.py")
75+
workflow_stdout = implementation.execute(env=env, entry="python main.py")
7976
workflow_stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", workflow_stdout)
8077
else:
8178
workflow_stdout = None

rdagent/components/coder/data_science/model/exp.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,6 @@
1-
import pickle
2-
import site
3-
import traceback
4-
from pathlib import Path
51
from typing import Dict, Optional
62

73
from rdagent.components.coder.CoSTEER.task import CoSTEERTask
8-
from rdagent.core.experiment import Experiment, FBWorkspace
9-
from rdagent.core.utils import cache_with_pickle
10-
from rdagent.oai.llm_utils import md5_hash
11-
from rdagent.utils.env import DockerEnv, DSDockerConf
124

135

146
# Because we use isinstance to distinguish between different types of tasks, we need to use sub classes to represent different types of tasks

rdagent/components/coder/data_science/raw_data_loader/__init__.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@
3838
from rdagent.components.coder.CoSTEER.knowledge_management import (
3939
CoSTEERQueriedKnowledge,
4040
)
41-
from rdagent.components.coder.data_science.conf import DSCoderCoSTEERSettings
41+
from rdagent.components.coder.data_science.conf import (
42+
DSCoderCoSTEERSettings,
43+
get_ds_env,
44+
)
4245
from rdagent.components.coder.data_science.raw_data_loader.eval import (
4346
DataLoaderCoSTEEREvaluator,
4447
)
@@ -48,7 +51,6 @@
4851
from rdagent.core.scenario import Scenario
4952
from rdagent.oai.llm_utils import APIBackend
5053
from rdagent.utils.agent.tpl import T
51-
from rdagent.utils.env import DockerEnv, DSDockerConf
5254

5355

5456
class DataLoaderMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
@@ -214,10 +216,10 @@ def __init__(
214216
def develop(self, exp):
215217
new_exp = super().develop(exp)
216218

217-
ds_docker_conf = DSDockerConf()
218-
ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"}
219-
de = DockerEnv(conf=ds_docker_conf)
220-
stdout = new_exp.experiment_workspace.execute(env=de, entry=f"python test/data_loader_test.py")
219+
env = get_ds_env()
220+
env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"}
221+
222+
stdout = new_exp.experiment_workspace.execute(env=env, entry=f"python test/data_loader_test.py")
221223
match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===", stdout, re.DOTALL)
222224
eda_output = match.groups()[1] if match else None
223225
self.scen.eda_output = eda_output

rdagent/components/coder/data_science/raw_data_loader/eval.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@
1212
from rdagent.components.coder.CoSTEER.knowledge_management import (
1313
CoSTEERQueriedKnowledgeV2,
1414
)
15+
from rdagent.components.coder.data_science.conf import get_ds_env
1516
from rdagent.core.experiment import FBWorkspace, Task
16-
from rdagent.oai.llm_utils import APIBackend
1717
from rdagent.utils.agent.tpl import T
1818
from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
19-
from rdagent.utils.env import DockerEnv, DSDockerConf
2019

2120
DIRNAME = Path(__file__).absolute().resolve().parent
2221

@@ -48,25 +47,22 @@ def evaluate(
4847
final_decision=False,
4948
)
5049

51-
ds_docker_conf = DSDockerConf()
52-
ds_docker_conf.extra_volumes = {
53-
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"
54-
}
55-
de = DockerEnv(conf=ds_docker_conf)
50+
env = get_ds_env()
51+
env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
5652

5753
# TODO: do we need to clean the generated temporary content?
5854
fname = "test/data_loader_test.py"
5955
test_code = (DIRNAME / "eval_tests" / "data_loader_test.txt").read_text()
6056
implementation.inject_files(**{fname: test_code})
61-
stdout = implementation.execute(env=de, entry=f"python {fname}")
57+
stdout = implementation.execute(env=env, entry=f"python {fname}")
6258
match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===(.*)", stdout, re.DOTALL)
6359
stdout_part_1, eda_output, stdout_part_2 = match.groups() if match else (stdout, None, "")
6460
stdout = stdout_part_1 + stdout_part_2
6561
if eda_output is not None and len(eda_output.split(" ")) > 10000:
6662
eda_output += "Length of EDA output is too long, truncated. Please reject this implementation and motivate it to reduce the length of EDA output."
6763

6864
if "main.py" in implementation.file_dict:
69-
workflow_stdout = implementation.execute(env=de, entry="python main.py")
65+
workflow_stdout = implementation.execute(env=env, entry="python main.py")
7066
workflow_stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", workflow_stdout)
7167
else:
7268
workflow_stdout = None

rdagent/components/coder/data_science/raw_data_loader/exp.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,4 @@
1-
import pickle
2-
import site
3-
import traceback
4-
from pathlib import Path
5-
from typing import Dict, Optional
6-
71
from rdagent.components.coder.CoSTEER.task import CoSTEERTask
8-
from rdagent.core.experiment import Experiment, FBWorkspace
9-
from rdagent.core.utils import cache_with_pickle
10-
from rdagent.oai.llm_utils import md5_hash
11-
from rdagent.utils.agent.tpl import T
12-
from rdagent.utils.env import DockerEnv, DSDockerConf
132

143

154
# Because we use isinstance to distinguish between different types of tasks, we need to use sub classes to represent different types of tasks

rdagent/components/coder/data_science/workflow/eval.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,11 @@
1010
CoSTEERMultiFeedback,
1111
CoSTEERSingleFeedback,
1212
)
13+
from rdagent.components.coder.data_science.conf import get_ds_env
1314
from rdagent.core.evolving_framework import QueriedKnowledge
1415
from rdagent.core.experiment import FBWorkspace, Task
15-
from rdagent.oai.llm_utils import APIBackend
1616
from rdagent.utils.agent.tpl import T
1717
from rdagent.utils.agent.workflow import build_cls_from_json_with_retry
18-
from rdagent.utils.env import DockerEnv, DSDockerConf, MLEBDockerConf
1918

2019
DIRNAME = Path(__file__).absolute().resolve().parent
2120

@@ -54,12 +53,8 @@ def evaluate(
5453
final_decision=False,
5554
)
5655

57-
# DockerEnv for Kaggle Competition
58-
ds_docker_conf = DSDockerConf()
59-
ds_docker_conf.extra_volumes = {
60-
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"
61-
}
62-
de = DockerEnv(conf=ds_docker_conf)
56+
env = get_ds_env()
57+
env.conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
6358

6459
# # DockerEnv for MLEBench submission validation
6560
# mle_de_conf = MLEBDockerConf()
@@ -70,9 +65,9 @@ def evaluate(
7065
# mde.prepare()
7166

7267
# Clean the scores.csv & submission.csv.
73-
implementation.execute(env=de, entry=f"rm submission.csv scores.csv")
68+
implementation.execute(env=env, entry=f"rm submission.csv scores.csv")
7469

75-
stdout = implementation.execute(env=de, entry=f"python main.py")
70+
stdout = implementation.execute(env=env, entry=f"python main.py")
7671
stdout = re.sub(r"=== Start of EDA part ===(.*)=== End of EDA part ===", "", stdout)
7772

7873
# Check score file
@@ -102,7 +97,7 @@ def evaluate(
10297
implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
10398
# stdout += "----Submission Check 1-----\n"
10499
submission_check_out, submission_ret_code = implementation.execute_ret_code(
105-
env=de, entry="python test/submission_format_test.py"
100+
env=env, entry="python test/submission_format_test.py"
106101
)
107102
stdout += "\n" + submission_check_out
108103

0 commit comments

Comments
 (0)