Commit 6a840d8

feat: add sample submission file check (#1053)
* check sample submission & add package constraint
* add trace.log into clear
* change default
* simplify
* clear CI workspace before running
* move to CI
* use sudo to clean workspace
* move prepare out of global var

---------

Co-authored-by: Xu Yang <[email protected]>
1 parent 3ea2dab commit 6a840d8
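
In short, the evaluator now runs the candidate pipeline under strace, writes the file-syscall trace to trace.log in the workspace, and rejects the implementation if that trace shows the code opening the competition's sample submission file. A rough, condensed sketch of the check (names follow the eval.py diff below; this is not the verbatim implementation):

    from pathlib import Path

    def sample_submission_opened(workspace: Path, sample_submission_name: str) -> bool:
        """Return True if trace.log records an openat call on the sample submission file."""
        trace_file = workspace / "trace.log"  # produced by: strace -e trace=file -f -o trace.log python main.py
        if not trace_file.exists():
            return False
        return any(
            "openat" in line and sample_submission_name in line
            for line in trace_file.read_text().splitlines()
        )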

File tree

11 files changed: +109 -36 lines changed

rdagent/app/data_science/conf.py

Lines changed: 2 additions & 2 deletions
@@ -48,9 +48,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     max_trace_hist: int = 3

     coder_max_loop: int = 10
-    runner_max_loop: int = 1
+    runner_max_loop: int = 3

-    sample_data_by_LLM: bool = False
+    sample_data_by_LLM: bool = True
     use_raw_description: bool = False
     show_nan_columns: bool = False

rdagent/components/coder/data_science/conf.py

Lines changed: 3 additions & 2 deletions
@@ -54,6 +54,7 @@ def get_ds_env(
         raise ValueError(f"Unknown env type: {conf.env_type}")
     env.conf.extra_volumes = extra_volumes
     env.conf.running_timeout_period = running_timeout_period
+    env.prepare()
     return env


@@ -63,7 +64,7 @@ def get_clear_ws_cmd(stage: Literal["before_training", "before_inference"] = "be
     """
     assert stage in ["before_training", "before_inference"], f"Unknown stage: {stage}"
     if DS_RD_SETTING.enable_model_dump and stage == "before_training":
-        cmd = "rm -r submission.csv scores.csv models"
+        cmd = "rm -r submission.csv scores.csv models trace.log"
     else:
-        cmd = "rm submission.csv scores.csv"
+        cmd = "rm submission.csv scores.csv trace.log"
     return cmd

rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 28 additions & 5 deletions
@@ -59,9 +59,28 @@ def evaluate(
         implementation.execute(env=env, entry=get_clear_ws_cmd())
         if DS_RD_SETTING.sample_data_by_LLM:
             # Because coder runs on full data, we need to run debug mode in advance to save time
-            result = implementation.run(env=env, entry=f"python -m coverage run main.py --debug")
+            result = implementation.run(
+                env=env, entry=f"strace -e trace=file -f -o trace.log python -m coverage run main.py --debug"
+            )
         else:
-            result = implementation.run(env=env, entry=f"python -m coverage run main.py")
+            result = implementation.run(
+                env=env, entry=f"strace -e trace=file -f -o trace.log python -m coverage run main.py"
+            )
+
+        sample_submission_check = True
+        test_eval = get_test_eval()
+        if (sample_submission_file_name := test_eval.get_sample_submission_name(self.scen.competition)) is not None:
+            # check whether code ever opens the sample submission file
+            if (implementation.workspace_path / "trace.log").exists():
+                opened_trace_lines = [
+                    line
+                    for line in (implementation.workspace_path / "trace.log").read_text().splitlines()
+                    if "openat" in line and sample_submission_file_name in line
+                ]
+                if len(opened_trace_lines) > 0:
+                    stdout += f"Code opened the sample submission file '{sample_submission_file_name}' during execution.\n Reject the implementation!\n"
+                    sample_submission_check = False
+
         result.stdout = remove_eda_part(result.stdout)
         if result.exit_code != 0:
             stdout += f"Code failed to run. Please check the stdout:\n Following the stdout of the debug mode run:\n{result.stdout.strip()}\n"

@@ -114,7 +133,6 @@ def evaluate(
             score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"
             score_ret_code = 1

-        test_eval = get_test_eval()
         if not test_eval.is_sub_enabled(self.scen.competition):
             submission_ret_code = 0
         else:

@@ -149,10 +167,15 @@ def evaluate(
             user_prompt=user_prompt,
             init_kwargs_update_func=PipelineSingleFeedback.val_and_update_init_dict,
         )
-        if score_ret_code != 0:
+        if score_ret_code != 0 and wfb.final_decision is True:
             wfb.final_decision = False
             wfb.return_checking += "\n" + score_check_text
-        if submission_ret_code != 0:
+        if submission_ret_code != 0 and wfb.final_decision is True:
             wfb.final_decision = False
             wfb.return_checking += "\nSubmission file check failed."
+        if sample_submission_check is False and wfb.final_decision is True:
+            wfb.final_decision = False
+            wfb.return_checking += (
+                "\nSample submission file check failed. Code should not open the sample submission file."
+            )
         return wfb
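
For context, the new entry command simply wraps the usual coverage run in strace: -e trace=file restricts tracing to syscalls that take a filename (such as openat), -f follows child processes, and -o trace.log writes the trace into the workspace instead of stderr. A small illustrative helper that assembles the same command strings used above (the diff hard-codes them; this sketch only documents the flags):

    def build_entry(debug: bool) -> str:
        """Compose the traced entry command used in the hunk above (illustrative)."""
        base = "python -m coverage run main.py" + (" --debug" if debug else "")
        # -e trace=file : trace only file-related syscalls (openat, stat, ...)
        # -f            : also follow forked / child processes
        # -o trace.log  : write the trace to trace.log instead of stderr
        return f"strace -e trace=file -f -o trace.log {base}"

    assert build_entry(True) == "strace -e trace=file -f -o trace.log python -m coverage run main.py --debug"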

rdagent/components/coder/data_science/pipeline/eval_tests/submission_format_test.txt

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ find . | grep -i sample | grep -i submission | grep -v sample_submission.csv | g
 input_dir = Path('{% include "scenarios.data_science.share:scen.input_path" %}')
 sample_submission_files = list(input_dir.glob("*sample_submission*.csv")) + list(
     input_dir.glob("*sampleSubmission*.csv")
-)
+) + list(input_dir.glob("*randomPredictions*.tsv"))

 if not sample_submission_files:
     print(f'Error: No sample submission file found in {% include "scenarios.data_science.share:scen.input_path" %}')

rdagent/log/mle_summary.py

Lines changed: 6 additions & 4 deletions
@@ -20,12 +20,11 @@
 from rdagent.scenarios.kaggle.kaggle_crawler import score_rank
 from rdagent.utils.workflow import LoopBase

-test_eval = get_test_eval()
-
-is_mle = isinstance(test_eval, MLETestEval)
-

 def save_grade_info(log_trace_path: Path):
+    test_eval = get_test_eval()
+
+    is_mle = isinstance(test_eval, MLETestEval)
     trace_storage = FileStorage(log_trace_path)
     for msg in trace_storage.iter_msg():
         if "competition" in msg.tag:

@@ -74,6 +73,9 @@ def _get_loop_and_fn_after_hours(log_folder: Path, hours: int):


 def summarize_folder(log_folder: Path, hours: int | None = None):
+    test_eval = get_test_eval()
+
+    is_mle = isinstance(test_eval, MLETestEval)
     """
     Summarize the log folder and save the summary as a pickle file.
     Args:

rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 15 additions & 2 deletions
@@ -14,6 +14,7 @@
     MultiProcessEvolvingStrategy,
 )
 from rdagent.components.coder.CoSTEER.task import CoSTEERTask
+from rdagent.components.coder.data_science.conf import DSCoderCoSTEERSettings
 from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
 from rdagent.core.exception import RunnerError
 from rdagent.core.scenario import Scenario

@@ -24,6 +25,17 @@
 from rdagent.utils.agent.tpl import T


+class DSRunnerCoSTEERSettings(DSCoderCoSTEERSettings):
+    """Data Science CoSTEER settings"""
+
+    class Config:
+        env_prefix = "DS_Runner_CoSTEER_"
+
+    max_seconds: int = 3600
+    env_type: str = "docker"
+    # TODO: extract a function for env and conf.
+
+
 class DSRunnerMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
     def implement_one_task(
         self,

@@ -97,12 +109,13 @@ def __init__(
         eva = CoSTEERMultiEvaluator(
             single_evaluator=eval_l, scen=scen
         )  # Please specify whether you agree running your eva in parallel or not
-        es = DSRunnerMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS)
+        settings = DSRunnerCoSTEERSettings()
+        es = DSRunnerMultiProcessEvolvingStrategy(scen=scen, settings=settings)

         # In runner, we don't need very big loops, so we set max_loop to 3
         super().__init__(
             *args,
-            settings=CoSTEER_SETTINGS,
+            settings=settings,
             eva=eva,
             es=es,
             evolving_version=2,
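
The new DSRunnerCoSTEERSettings only changes defaults (max_seconds, env_type) and the env_prefix, so runner-specific values can be overridden through environment variables without touching the coder settings. Assuming the settings base class follows the usual env-prefix convention, the lookup behaves roughly like this hypothetical stand-in (resolve_setting is not part of the codebase):

    import os

    def resolve_setting(name: str, default, prefix: str = "DS_Runner_CoSTEER_"):
        """Hypothetical stand-in: a field is overridden by an env var named <prefix><field>."""
        raw = os.environ.get(prefix + name)
        return type(default)(raw) if raw is not None else default

    os.environ["DS_Runner_CoSTEER_max_seconds"] = "7200"
    print(resolve_setting("max_seconds", 3600))   # 7200 (overridden)
    print(resolve_setting("env_type", "docker"))  # docker (default kept)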

rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml

Lines changed: 6 additions & 0 deletions
@@ -306,6 +306,12 @@ task_gen:
       - When only one model is used, its score should be present, and an "ensemble" score (which would be the same as the single model's score in this case) must also be recorded.
       - Ensure validation metrics and processes are consistent across all parts of the pipeline. Avoid changes that would alter how validation metrics are calculated unless that is part of the hypothesis.
     8. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed by `sample_submission.csv` in the `Competition Scenario Description`. This is a critical step.
+    9. **Preferred Packages Notes**:
+      - You can choose the most proper packages for the task to best achieve the hypothesis.
+      - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
+      - For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise.
+      - For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
+      - For neural networks, prefer fine-tuning pre-trained models over training from scratch.

     # Guidelines for Sketching the `main.py` Workflow

rdagent/scenarios/data_science/share.yaml

Lines changed: 16 additions & 12 deletions
@@ -264,44 +264,41 @@ component_spec:
     {% endraw %}

   Pipeline: |-
-    0. Program Execution:
+    1. Program Execution:
       - The workflow will be executed by running `python main.py` with no command-line arguments. Ensure that `main.py` does not require or expect any parameters.
       - The working directory will only contain `main.py`. Any additional files required for execution must be downloaded or generated by `main.py` itself.

-    1. File Handling:
+    2. File Handling:
       - Handle file encoding and delimiters appropriately.
       - Combine or process multiple files if necessary.
       - Avoid using the sample submission file to infer test indices. If a dedicated test index file is available, use that. If not, use the order in the test file as the test index.
       - Ensure you load the actual data from the files, not just the filenames or paths. Do not postpone data loading to later steps.

-    2. Data Preprocessing:
+    3. Data Preprocessing:
       - Convert data types correctly (e.g., numeric, categorical, date parsing).
       - Optimize memory usage for large datasets using techniques like downcasting or reading data in chunks if necessary.
       - Domain-Specific Handling:
         - Apply competition-specific preprocessing steps as needed (e.g., text tokenization, image resizing).

-    3. Code Standards:
+    4. Code Standards:
       - DO NOT use progress bars (e.g., `tqdm`).
       - DO NOT use the sample submission file to extract test index information.
       - DO NOT exclude features inadvertently during this process.

-    4. NOTES
+    5. NOTES
       - Never use sample submission as the test index, as it may not be the same as the test data. Use the test index file or test data source to get the test index.
-      - For neural network models, use pytorch rather than tensorflow as the backend if possible.
-      - For decision tree models, use xgboost or RandomForest rather than lightgbm as the backend if possible.
-      - For neural network models, it's always better to firstly try from a pretrained model and then fine-tune it rather than training from scratch.

-    5. General Considerations:
+    6. General Considerations:
       - Ensure scalability for large datasets.
       - Handle missing values and outliers appropriately (e.g., impute, remove, or replace).
       - Ensure consistency between feature data types and transformations.
       - Prevent data leakage: Do not use information derived from the test set when transforming training data.
       - Sampling a subset of the training data for efficiency (e.g., randomly selecting a portion of the data) is discouraged unless it demonstrably improves performance (e.g., removing irrelevant or outlier samples).

-    6. Notes:
+    7. Notes:
       - GPU and multiprocessing are available and are encouraged to use for accelerating transformations.

-    7. Metric Calculation and Storage:
+    8. Metric Calculation and Storage:
       - Calculate the metric (mentioned in the evaluation section of the competition information) for each model and ensemble strategy on valid, and save the results in `scores.csv`
       - The evaluation should be based on k-fold cross-validation but only if that's an appropriate evaluation for the task at hand. Store the mean validation score of k-fold cross-validation in `scores.csv` on each model. Refer to the hyperparameter specification for rules to set the CV folds.
       - Even if only one model is present, compute the ensemble score and store it under `"ensemble"`.

@@ -311,9 +308,16 @@ component_spec:
         - <metric_name>: The calculated metric value for that model or ensemble strategy. The metric name can be found in the scenario description. The metric name should be exactly the same as the one in the scenario description since user will use it to check the result.
       - Validation metrics should be aligned across all ideas and implementations. Avoid proposing ideas that might affect the validation metrics and modifying the related code.

-    8. Submission File:
+    9. Submission File:
       - Save the final predictions as `submission.csv`, ensuring the format matches the competition requirements (refer to `sample_submission` in the Folder Description for the correct structure).
       - Present the required submission format explicitly and ensure the output adheres to it.
+
+    10. Preferred Packages:
+      - You can choose the most proper packages to achieve the task.
+      - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
+      - For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise.
+      - For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
+      - For neural networks, prefer fine-tuning pre-trained models over training from scratch.

 guidelines:
   coding: |-

rdagent/scenarios/data_science/test_eval.py

Lines changed: 20 additions & 6 deletions
@@ -25,10 +25,28 @@ def valid(self, competition: str, workspace: FBWorkspace) -> tuple[str, int]:
     def enabled(self, competition) -> bool:
         """able to eval or not"""

+    @abstractmethod
+    def get_sample_submission_name(self, competition: str) -> str:
+        """
+        Get the sample submission file name for the given competition.
+
+        This is used to determine the file name for the submission file.
+        """
+        input_dir = Path(f"{DS_RD_SETTING.local_data_path}/{competition}")
+        sample_submission_files = (
+            list(input_dir.glob("*sample_submission*.csv"))
+            + list(input_dir.glob("*sampleSubmission*.csv"))
+            + list(input_dir.glob("*randomPredictions*.tsv"))
+        )
+        if len(sample_submission_files) == 0:
+            return None
+        else:
+            return sample_submission_files[0].name
+
     @abstractmethod
     def is_sub_enabled(self, competition: str) -> bool:
         """
-        Is subsmiossion file enabled
+        Is submission file enabled

         If a file like <sample submission csv> is provided; then we think inference from test data to submission file is enabled.
         According test will be enabled as well.

@@ -38,11 +56,7 @@ def is_sub_enabled(self, competition: str) -> bool:
         2. We proivde a sample submission. But we don't proivde strict evaluation.

         """
-        input_dir = Path(f"{DS_RD_SETTING.local_data_path}/{competition}")
-        sample_submission_files = list(input_dir.glob("*sample_submission*.csv")) + list(
-            input_dir.glob("*sampleSubmission*.csv")
-        )
-        return len(sample_submission_files) > 0
+        return self.get_sample_submission_name(competition) is not None


 class TestEval(TestEvalBase):
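
The new get_sample_submission_name helper centralises the sample-submission lookup, so is_sub_enabled and the new trace check in eval.py share one set of globs. A standalone sketch of that lookup (re-implemented here for illustration; the real method resolves the directory from DS_RD_SETTING.local_data_path):

    import tempfile
    from pathlib import Path

    def find_sample_submission_name(competition_dir: Path) -> str | None:
        """Return the first sample-submission-like file name in the folder, or None."""
        candidates = (
            list(competition_dir.glob("*sample_submission*.csv"))
            + list(competition_dir.glob("*sampleSubmission*.csv"))
            + list(competition_dir.glob("*randomPredictions*.tsv"))
        )
        return candidates[0].name if candidates else None

    # Tiny usage example with a throwaway competition folder
    with tempfile.TemporaryDirectory() as tmp:
        comp = Path(tmp)
        (comp / "sample_submission.csv").touch()
        print(find_sample_submission_name(comp))              # sample_submission.csv
        print(find_sample_submission_name(comp) is not None)  # equivalent of is_sub_enabled -> True
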
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+FROM gcr.io/kaggle-gpu-images/python:latest
+
+RUN apt-get clean && apt-get update && apt-get install -y \
+    curl \
+    vim \
+    git \
+    build-essential \
+    strace \
+    && rm -rf /var/lib/apt/lists/*
