Commit 6a840d8

feat: add sample submission file check (#1053)
* check sample submission & add package constraint
* add trace.log into clear
* change default
* simplify
* clear CI workspace before running
* move to CI
* use sudo to clean workspace
* move prepare out of global var

---------

Co-authored-by: Xu Yang <[email protected]>
1 parent 3ea2dab commit 6a840d8
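
In short, the evaluator now runs the candidate pipeline under strace, writes the file-syscall trace to trace.log in the workspace, and rejects the implementation if that trace shows the code opening the competition's sample submission file. A rough, condensed sketch of the check (names follow the eval.py diff below; this is not the verbatim implementation):

    from pathlib import Path

    def sample_submission_opened(workspace: Path, sample_submission_name: str) -> bool:
        """Return True if trace.log records an openat call on the sample submission file."""
        trace_file = workspace / "trace.log"  # produced by: strace -e trace=file -f -o trace.log python main.py
        if not trace_file.exists():
            return False
        return any(
            "openat" in line and sample_submission_name in line
            for line in trace_file.read_text().splitlines()
        )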

File tree

11 files changed: +109 -36 lines changed

rdagent/app/data_science/conf.py

Lines changed: 2 additions & 2 deletions
@@ -48,9 +48,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     max_trace_hist: int = 3

     coder_max_loop: int = 10
-    runner_max_loop: int = 1
+    runner_max_loop: int = 3

-    sample_data_by_LLM: bool = False
+    sample_data_by_LLM: bool = True
     use_raw_description: bool = False
     show_nan_columns: bool = False

rdagent/components/coder/data_science/conf.py

Lines changed: 3 additions & 2 deletions
@@ -54,6 +54,7 @@ def get_ds_env(
         raise ValueError(f"Unknown env type: {conf.env_type}")
     env.conf.extra_volumes = extra_volumes
     env.conf.running_timeout_period = running_timeout_period
+    env.prepare()
     return env


@@ -63,7 +64,7 @@ def get_clear_ws_cmd(stage: Literal["before_training", "before_inference"] = "be
     """
     assert stage in ["before_training", "before_inference"], f"Unknown stage: {stage}"
     if DS_RD_SETTING.enable_model_dump and stage == "before_training":
-        cmd = "rm -r submission.csv scores.csv models"
+        cmd = "rm -r submission.csv scores.csv models trace.log"
     else:
-        cmd = "rm submission.csv scores.csv"
+        cmd = "rm submission.csv scores.csv trace.log"
     return cmd

rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 28 additions & 5 deletions
@@ -59,9 +59,28 @@ def evaluate(
         implementation.execute(env=env, entry=get_clear_ws_cmd())
         if DS_RD_SETTING.sample_data_by_LLM:
             # Because coder runs on full data, we need to run debug mode in advance to save time
-            result = implementation.run(env=env, entry=f"python -m coverage run main.py --debug")
+            result = implementation.run(
+                env=env, entry=f"strace -e trace=file -f -o trace.log python -m coverage run main.py --debug"
+            )
         else:
-            result = implementation.run(env=env, entry=f"python -m coverage run main.py")
+            result = implementation.run(
+                env=env, entry=f"strace -e trace=file -f -o trace.log python -m coverage run main.py"
+            )
+
+        sample_submission_check = True
+        test_eval = get_test_eval()
+        if (sample_submission_file_name := test_eval.get_sample_submission_name(self.scen.competition)) is not None:
+            # check whether code ever opens the sample submission file
+            if (implementation.workspace_path / "trace.log").exists():
+                opened_trace_lines = [
+                    line
+                    for line in (implementation.workspace_path / "trace.log").read_text().splitlines()
+                    if "openat" in line and sample_submission_file_name in line
+                ]
+                if len(opened_trace_lines) > 0:
+                    stdout += f"Code opened the sample submission file '{sample_submission_file_name}' during execution.\n Reject the implementation!\n"
+                    sample_submission_check = False
+
         result.stdout = remove_eda_part(result.stdout)
         if result.exit_code != 0:
             stdout += f"Code failed to run. Please check the stdout:\n Following the stdout of the debug mode run:\n{result.stdout.strip()}\n"

@@ -114,7 +133,6 @@ def evaluate(
             score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"
             score_ret_code = 1

-        test_eval = get_test_eval()
         if not test_eval.is_sub_enabled(self.scen.competition):
             submission_ret_code = 0
         else:

@@ -149,10 +167,15 @@ def evaluate(
             user_prompt=user_prompt,
             init_kwargs_update_func=PipelineSingleFeedback.val_and_update_init_dict,
         )
-        if score_ret_code != 0:
+        if score_ret_code != 0 and wfb.final_decision is True:
             wfb.final_decision = False
             wfb.return_checking += "\n" + score_check_text
-        if submission_ret_code != 0:
+        if submission_ret_code != 0 and wfb.final_decision is True:
             wfb.final_decision = False
             wfb.return_checking += "\nSubmission file check failed."
+        if sample_submission_check is False and wfb.final_decision is True:
+            wfb.final_decision = False
+            wfb.return_checking += (
+                "\nSample submission file check failed. Code should not open the sample submission file."
+            )
         return wfb
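
For context, the new entry command simply wraps the usual coverage run in strace: -e trace=file restricts tracing to syscalls that take a filename (such as openat), -f follows child processes, and -o trace.log writes the trace into the workspace instead of stderr. A small illustrative helper that assembles the same command strings used above (the diff hard-codes them; this sketch only documents the flags):

    def build_entry(debug: bool) -> str:
        """Compose the traced entry command used in the hunk above (illustrative)."""
        base = "python -m coverage run main.py" + (" --debug" if debug else "")
        # -e trace=file : trace only file-related syscalls (openat, stat, ...)
        # -f            : also follow forked / child processes
        # -o trace.log  : write the trace to trace.log instead of stderr
        return f"strace -e trace=file -f -o trace.log {base}"

    assert build_entry(True) == "strace -e trace=file -f -o trace.log python -m coverage run main.py --debug"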

rdagent/components/coder/data_science/pipeline/eval_tests/submission_format_test.txt

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ find . | grep -i sample | grep -i submission | grep -v sample_submission.csv | g
 input_dir = Path('{% include "scenarios.data_science.share:scen.input_path" %}')
 sample_submission_files = list(input_dir.glob("*sample_submission*.csv")) + list(
     input_dir.glob("*sampleSubmission*.csv")
-)
+) + list(input_dir.glob("*randomPredictions*.tsv"))

 if not sample_submission_files:
     print(f'Error: No sample submission file found in {% include "scenarios.data_science.share:scen.input_path" %}')

rdagent/log/mle_summary.py

Lines changed: 6 additions & 4 deletions
@@ -20,12 +20,11 @@
 from rdagent.scenarios.kaggle.kaggle_crawler import score_rank
 from rdagent.utils.workflow import LoopBase

-test_eval = get_test_eval()
-
-is_mle = isinstance(test_eval, MLETestEval)
-

 def save_grade_info(log_trace_path: Path):
+    test_eval = get_test_eval()
+
+    is_mle = isinstance(test_eval, MLETestEval)
     trace_storage = FileStorage(log_trace_path)
     for msg in trace_storage.iter_msg():
         if "competition" in msg.tag:

@@ -74,6 +73,9 @@ def _get_loop_and_fn_after_hours(log_folder: Path, hours: int):


 def summarize_folder(log_folder: Path, hours: int | None = None):
+    test_eval = get_test_eval()
+
+    is_mle = isinstance(test_eval, MLETestEval)
     """
     Summarize the log folder and save the summary as a pickle file.
     Args:

rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 15 additions & 2 deletions
@@ -14,6 +14,7 @@
     MultiProcessEvolvingStrategy,
 )
 from rdagent.components.coder.CoSTEER.task import CoSTEERTask
+from rdagent.components.coder.data_science.conf import DSCoderCoSTEERSettings
 from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
 from rdagent.core.exception import RunnerError
 from rdagent.core.scenario import Scenario

@@ -24,6 +25,17 @@
 from rdagent.utils.agent.tpl import T


+class DSRunnerCoSTEERSettings(DSCoderCoSTEERSettings):
+    """Data Science CoSTEER settings"""
+
+    class Config:
+        env_prefix = "DS_Runner_CoSTEER_"
+
+    max_seconds: int = 3600
+    env_type: str = "docker"
+    # TODO: extract a function for env and conf.
+
+
 class DSRunnerMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
     def implement_one_task(
         self,

@@ -97,12 +109,13 @@ def __init__(
         eva = CoSTEERMultiEvaluator(
             single_evaluator=eval_l, scen=scen
         )  # Please specify whether you agree running your eva in parallel or not
-        es = DSRunnerMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS)
+        settings = DSRunnerCoSTEERSettings()
+        es = DSRunnerMultiProcessEvolvingStrategy(scen=scen, settings=settings)

         # In runner, we don't need very big loops, so we set max_loop to 3
         super().__init__(
             *args,
-            settings=CoSTEER_SETTINGS,
+            settings=settings,
             eva=eva,
             es=es,
             evolving_version=2,
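
The new DSRunnerCoSTEERSettings only changes defaults (max_seconds, env_type) and the env_prefix, so runner-specific values can be overridden through environment variables without touching the coder settings. Assuming the settings base class follows the usual env-prefix convention, the lookup behaves roughly like this hypothetical stand-in (resolve_setting is not part of the codebase):

    import os

    def resolve_setting(name: str, default, prefix: str = "DS_Runner_CoSTEER_"):
        """Hypothetical stand-in: a field is overridden by an env var named <prefix><field>."""
        raw = os.environ.get(prefix + name)
        return type(default)(raw) if raw is not None else default

    os.environ["DS_Runner_CoSTEER_max_seconds"] = "7200"
    print(resolve_setting("max_seconds", 3600))   # 7200 (overridden)
    print(resolve_setting("env_type", "docker"))  # docker (default kept)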

rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml

Lines changed: 6 additions & 0 deletions
@@ -306,6 +306,12 @@ task_gen:
       - When only one model is used, its score should be present, and an "ensemble" score (which would be the same as the single model's score in this case) must also be recorded.
       - Ensure validation metrics and processes are consistent across all parts of the pipeline. Avoid changes that would alter how validation metrics are calculated unless that is part of the hypothesis.
     8. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed by `sample_submission.csv` in the `Competition Scenario Description`. This is a critical step.
+    9. **Preferred Packages Notes**:
+      - You can choose the most proper packages for the task to best achieve the hypothesis.
+      - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
+      - For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise.
+      - For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
+      - For neural networks, prefer fine-tuning pre-trained models over training from scratch.

     # Guidelines for Sketching the `main.py` Workflow

rdagent/scenarios/data_science/share.yaml

Lines changed: 16 additions & 12 deletions
@@ -264,44 +264,41 @@ component_spec:
     {% endraw %}

   Pipeline: |-
-    0. Program Execution:
+    1. Program Execution:
       - The workflow will be executed by running `python main.py` with no command-line arguments. Ensure that `main.py` does not require or expect any parameters.
       - The working directory will only contain `main.py`. Any additional files required for execution must be downloaded or generated by `main.py` itself.

-    1. File Handling:
+    2. File Handling:
       - Handle file encoding and delimiters appropriately.
       - Combine or process multiple files if necessary.
       - Avoid using the sample submission file to infer test indices. If a dedicated test index file is available, use that. If not, use the order in the test file as the test index.
       - Ensure you load the actual data from the files, not just the filenames or paths. Do not postpone data loading to later steps.

-    2. Data Preprocessing:
+    3. Data Preprocessing:
       - Convert data types correctly (e.g., numeric, categorical, date parsing).
       - Optimize memory usage for large datasets using techniques like downcasting or reading data in chunks if necessary.
       - Domain-Specific Handling:
         - Apply competition-specific preprocessing steps as needed (e.g., text tokenization, image resizing).

-    3. Code Standards:
+    4. Code Standards:
       - DO NOT use progress bars (e.g., `tqdm`).
       - DO NOT use the sample submission file to extract test index information.
       - DO NOT exclude features inadvertently during this process.

-    4. NOTES
+    5. NOTES
       - Never use sample submission as the test index, as it may not be the same as the test data. Use the test index file or test data source to get the test index.
-      - For neural network models, use pytorch rather than tensorflow as the backend if possible.
-      - For decision tree models, use xgboost or RandomForest rather than lightgbm as the backend if possible.
-      - For neural network models, it's always better to firstly try from a pretrained model and then fine-tune it rather than training from scratch.

-    5. General Considerations:
+    6. General Considerations:
       - Ensure scalability for large datasets.
       - Handle missing values and outliers appropriately (e.g., impute, remove, or replace).
       - Ensure consistency between feature data types and transformations.
       - Prevent data leakage: Do not use information derived from the test set when transforming training data.
       - Sampling a subset of the training data for efficiency (e.g., randomly selecting a portion of the data) is discouraged unless it demonstrably improves performance (e.g., removing irrelevant or outlier samples).

-    6. Notes:
+    7. Notes:
       - GPU and multiprocessing are available and are encouraged to use for accelerating transformations.

-    7. Metric Calculation and Storage:
+    8. Metric Calculation and Storage:
       - Calculate the metric (mentioned in the evaluation section of the competition information) for each model and ensemble strategy on valid, and save the results in `scores.csv`
       - The evaluation should be based on k-fold cross-validation but only if that's an appropriate evaluation for the task at hand. Store the mean validation score of k-fold cross-validation in `scores.csv` on each model. Refer to the hyperparameter specification for rules to set the CV folds.
       - Even if only one model is present, compute the ensemble score and store it under `"ensemble"`.

@@ -311,9 +308,16 @@ component_spec:
         - <metric_name>: The calculated metric value for that model or ensemble strategy. The metric name can be found in the scenario description. The metric name should be exactly the same as the one in the scenario description since user will use it to check the result.
       - Validation metrics should be aligned across all ideas and implementations. Avoid proposing ideas that might affect the validation metrics and modifying the related code.

-    8. Submission File:
+    9. Submission File:
       - Save the final predictions as `submission.csv`, ensuring the format matches the competition requirements (refer to `sample_submission` in the Folder Description for the correct structure).
       - Present the required submission format explicitly and ensure the output adheres to it.
+
+    10. Preferred Packages:
+      - You can choose the most proper packages to achieve the task.
+      - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
+      - For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise.
+      - For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
+      - For neural networks, prefer fine-tuning pre-trained models over training from scratch.

 guidelines:
   coding: |-

rdagent/scenarios/data_science/test_eval.py

Lines changed: 20 additions & 6 deletions
@@ -25,10 +25,28 @@ def valid(self, competition: str, workspace: FBWorkspace) -> tuple[str, int]:
     def enabled(self, competition) -> bool:
         """able to eval or not"""

+    @abstractmethod
+    def get_sample_submission_name(self, competition: str) -> str:
+        """
+        Get the sample submission file name for the given competition.
+
+        This is used to determine the file name for the submission file.
+        """
+        input_dir = Path(f"{DS_RD_SETTING.local_data_path}/{competition}")
+        sample_submission_files = (
+            list(input_dir.glob("*sample_submission*.csv"))
+            + list(input_dir.glob("*sampleSubmission*.csv"))
+            + list(input_dir.glob("*randomPredictions*.tsv"))
+        )
+        if len(sample_submission_files) == 0:
+            return None
+        else:
+            return sample_submission_files[0].name
+
     @abstractmethod
     def is_sub_enabled(self, competition: str) -> bool:
         """
-        Is subsmiossion file enabled
+        Is submission file enabled

         If a file like <sample submission csv> is provided; then we think inference from test data to submission file is enabled.
         According test will be enabled as well.

@@ -38,11 +56,7 @@ def is_sub_enabled(self, competition: str) -> bool:
         2. We proivde a sample submission. But we don't proivde strict evaluation.

         """
-        input_dir = Path(f"{DS_RD_SETTING.local_data_path}/{competition}")
-        sample_submission_files = list(input_dir.glob("*sample_submission*.csv")) + list(
-            input_dir.glob("*sampleSubmission*.csv")
-        )
-        return len(sample_submission_files) > 0
+        return self.get_sample_submission_name(competition) is not None


 class TestEval(TestEvalBase):
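
The new get_sample_submission_name helper centralises the sample-submission lookup, so is_sub_enabled and the new trace check in eval.py share one set of globs. A standalone sketch of that lookup (re-implemented here for illustration; the real method resolves the directory from DS_RD_SETTING.local_data_path):

    import tempfile
    from pathlib import Path

    def find_sample_submission_name(competition_dir: Path) -> str | None:
        """Return the first sample-submission-like file name in the folder, or None."""
        candidates = (
            list(competition_dir.glob("*sample_submission*.csv"))
            + list(competition_dir.glob("*sampleSubmission*.csv"))
            + list(competition_dir.glob("*randomPredictions*.tsv"))
        )
        return candidates[0].name if candidates else None

    # Tiny usage example with a throwaway competition folder
    with tempfile.TemporaryDirectory() as tmp:
        comp = Path(tmp)
        (comp / "sample_submission.csv").touch()
        print(find_sample_submission_name(comp))              # sample_submission.csv
        print(find_sample_submission_name(comp) is not None)  # equivalent of is_sub_enabled -> True
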
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+FROM gcr.io/kaggle-gpu-images/python:latest
+
+RUN apt-get clean && apt-get update && apt-get install -y \
+    curl \
+    vim \
+    git \
+    build-essential \
+    strace \
+    && rm -rf /var/lib/apt/lists/*
