
Commit 1ba7548

fix: merge datascience v3 and v2 (#974)
* add coder version
* merge coder and feedback prompts
* align v2 and v3 proposal prompts
* fix a small bug
* fix a bug
* fix another bug
* support both function calling and json mode in v2 proposal
* fix minor bug
* reformat
* remove proposal v3
* fix a small bug in json mode
* fix CI
* remove tmp file
* remove v3 check

---------

Co-authored-by: Xu Yang <[email protected]>
1 parent 923a326 commit 1ba7548

File tree: 11 files changed (+428 additions, -589 deletions)

rdagent/components/coder/data_science/pipeline/__init__.py

Lines changed: 6 additions & 15 deletions
@@ -98,21 +98,12 @@ def implement_one_task(
             spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
             enable_model_dump=DS_RD_SETTING.enable_model_dump,
         )
-        if DS_RD_SETTING.proposal_version == "v3":
-            # FIXME: A temporary patch for BUILD
-            user_prompt = T(".prompts:pipeline_coder.user_v3").r(
-                competition_info=competition_info,
-                folder_spec=data_folder_info,
-                latest_code=workspace.file_dict.get("main.py"),
-                latest_code_feedback=prev_task_feedback,
-            )
-        else:
-            user_prompt = T(".prompts:pipeline_coder.user").r(
-                competition_info=competition_info,
-                folder_spec=data_folder_info,
-                latest_code=workspace.file_dict.get("main.py"),
-                latest_code_feedback=prev_task_feedback,
-            )
+        user_prompt = T(".prompts:pipeline_coder.user").r(
+            competition_info=competition_info,
+            folder_spec=data_folder_info,
+            latest_code=workspace.file_dict.get("main.py"),
+            latest_code_feedback=prev_task_feedback,
+        )

         for _ in range(5):
             pipeline_code = PythonAgentOut.extract_output(

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 0 additions & 21 deletions
@@ -83,27 +83,6 @@ pipeline_coder:
     --------- Data Folder Description (All path are relative to the data folder) ---------
     {{ folder_spec }}

-    {% if latest_code %}
-    --------- Former code ---------
-    {{ latest_code }}
-    {% if latest_code_feedback is not none %}
-    --------- Feedback to former code ---------
-    {{ latest_code_feedback }}
-    The former code contains errors. You should correct the code based on the provided information, ensuring you do not repeat the same mistakes.
-    {% else %}
-    The former code is correct. You should try to improve the code based on the provided task while not changing the irrelevant parts.
-    {% endif %}
-    {% endif %}
-
-    You should strictly follow the code specifications provided by the specification to implement the function.
-
-  user_v3: |-
-    --------- Competition Information ---------
-    {{ competition_info }}
-
-    --------- Data Folder Description (All path are relative to the data folder) ---------
-    {{ folder_spec }}
-
     {% if latest_code %}
     --------- Former code ---------
     {{ latest_code }}
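After this merge, the former `user_v3` body simply becomes the tail of the shared `pipeline_coder.user` template: on a first attempt `latest_code` is None, so the former-code block renders to nothing. A standalone sketch of that branching, assuming the `T(...).r(...)` helper renders Jinja2 (which the `{% if %}` syntax suggests):

from jinja2 import Template

# reduced stand-in for the template tail shown above
tpl = Template(
    "{% if latest_code %}Former code: {{ latest_code }}"
    "{% if latest_code_feedback is not none %} / Feedback: {{ latest_code_feedback }}{% endif %}"
    "{% endif %}"
)
print(tpl.render(latest_code=None))  # first attempt: renders nothing
print(tpl.render(latest_code="print(1)", latest_code_feedback="IndexError on line 1"))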

rdagent/oai/backend/base.py

Lines changed: 7 additions & 0 deletions
@@ -499,6 +499,13 @@ def _create_embedding_with_cache(
         self.cache.embedding_set(content_to_embedding_dict)
         return [content_to_embedding_dict[content] for content in input_content_list]  # type: ignore[misc]

+    @abstractmethod
+    def support_function_calling(self) -> bool:
+        """
+        Check if the backend supports function calling
+        """
+        raise NotImplementedError("Subclasses must implement this method")
+
     @abstractmethod
     def _calculate_token_from_messages(self, messages: list[dict[str, Any]]) -> int:
         """

rdagent/oai/backend/deprec.py

Lines changed: 7 additions & 0 deletions
@@ -261,6 +261,13 @@ def _azure_patch(model: str) -> str:
             raise
         return encoding

+    def support_function_calling(self) -> bool:
+        """
+        Check if the backend supports function calling.
+        Currently, deprec backend does not support function calling so it returns False. #FIXME: maybe a mapping to the backend class is needed.
+        """
+        return False
+
     def _create_embedding_inner_function(  # type: ignore[no-untyped-def]
         self, input_content_list: list[str], *args, **kwargs
     ) -> list[list[float]]:  # noqa: ARG002
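The FIXME hints at replacing the hard-coded False with a per-backend lookup. A hypothetical sketch of that idea (class names are assumptions, not from this commit):

# hypothetical capability table keyed by backend class name
_SUPPORTS_FUNCTION_CALLING: dict[str, bool] = {
    "DeprecBackend": False,     # assumed name for this module's backend
    "LiteLLMAPIBackend": True,  # still gated by the model at runtime, see litellm.py
}

class DeprecBackend:  # stand-in so the sketch runs on its own
    def support_function_calling(self) -> bool:
        return _SUPPORTS_FUNCTION_CALLING.get(type(self).__name__, False)

assert DeprecBackend().support_function_calling() is False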

rdagent/oai/backend/litellm.py

Lines changed: 13 additions & 0 deletions
@@ -7,6 +7,7 @@
     completion,
     completion_cost,
     embedding,
+    supports_function_calling,
     supports_response_schema,
     token_counter,
 )
@@ -93,6 +94,12 @@ def _create_chat_completion_inner_function(  # type: ignore[no-untyped-def] # no
         """
         if json_mode and supports_response_schema(model=LITELLM_SETTINGS.chat_model):
             kwargs["response_format"] = {"type": "json_object"}
+        elif not supports_response_schema(model=LITELLM_SETTINGS.chat_model) and "response_format" in kwargs:
+            logger.warning(
+                f"{LogColors.RED}Model {LITELLM_SETTINGS.chat_model} does not support response schema, ignoring response_format argument.{LogColors.END}",
+                tag="llm_messages",
+            )
+            kwargs.pop("response_format")

         if LITELLM_SETTINGS.log_llm_chat_content:
             logger.info(self._build_log_messages(messages), tag="llm_messages")
@@ -183,3 +190,9 @@ def _create_chat_completion_inner_function(  # type: ignore[no-untyped-def] # no
                 tag="token_cost",
             )
         return content, finish_reason
+
+    def support_function_calling(self) -> bool:
+        """
+        Check if the backend supports function calling
+        """
+        return supports_function_calling(model=LITELLM_SETTINGS.chat_model) and LITELLM_SETTINGS.enable_function_call
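With the probe in place, a caller can pick between function calling and JSON mode at runtime, which is how the commit message describes the v2 proposal working ("support both function calling and json mode in v2 proposal"). A hedged sketch; the helper below is illustrative, not the proposal's actual code:

from rdagent.oai.backend.base import APIBackend  # used this way in feedback.py

def choose_structured_output_mode(backend: APIBackend) -> str:
    # prefer function calling when both the model and the settings allow it,
    # otherwise fall back to JSON mode
    return "function_calling" if backend.support_function_calling() else "json_mode"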

rdagent/oai/llm_conf.py

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,9 @@ class LLMSettings(ExtendedBaseSettings):
     embedding_model: str = "text-embedding-3-small"

     reasoning_effort: Literal["low", "medium", "high"] | None = None
+    enable_function_call: bool = (
+        True  # Whether to enable function calling in chat models. may not work for models that do not support it.
+    )

     # Handling format
     reasoning_think_rm: bool = False
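The new flag defaults to on, and litellm.py above ANDs it with the model's own capability. A minimal sketch of overriding it in code, assuming LLMSettings can be instantiated directly and its other fields have defaults or env values (env-var overrides via pydantic-settings should also work, but the exact variable prefix is not shown in this diff):

from rdagent.oai.llm_conf import LLMSettings

settings = LLMSettings(enable_function_call=False)  # force the JSON-mode fallback
assert settings.enable_function_call is False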

rdagent/scenarios/data_science/dev/feedback.py

Lines changed: 10 additions & 23 deletions
@@ -87,29 +87,16 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeedback
         )

         eda_output = exp.experiment_workspace.file_dict.get("EDA.md", None)
-        if DS_RD_SETTING.proposal_version == "v3":
-            # FIXME: Some minor changes. Did not have time to test the full.
-            system_prompt = T(".prompts:exp_feedback_v3.system").r(
-                scenario=self.scen.get_scenario_all_desc(eda_output=eda_output)
-            )
-            user_prompt = T(".prompts:exp_feedback_v3.user").r(
-                sota_desc=sota_desc,
-                cur_exp=exp,
-                diff_edition=diff_edition,
-                feedback_desc=feedback_desc,
-                cur_vs_sota_score=cur_vs_sota_score,
-            )
-        else:
-            system_prompt = T(".prompts:exp_feedback.system").r(
-                scenario=self.scen.get_scenario_all_desc(eda_output=eda_output)
-            )
-            user_prompt = T(".prompts:exp_feedback.user").r(
-                sota_desc=sota_desc,
-                cur_exp=exp,
-                diff_edition=diff_edition,
-                feedback_desc=feedback_desc,
-                cur_vs_sota_score=cur_vs_sota_score,
-            )
+        system_prompt = T(".prompts:exp_feedback.system").r(
+            scenario=self.scen.get_scenario_all_desc(eda_output=eda_output)
+        )
+        user_prompt = T(".prompts:exp_feedback.user").r(
+            sota_desc=sota_desc,
+            cur_exp=exp,
+            diff_edition=diff_edition,
+            feedback_desc=feedback_desc,
+            cur_vs_sota_score=cur_vs_sota_score,
+        )

         resp_dict = json.loads(
             APIBackend().build_messages_and_create_chat_completion(
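The hunk ends mid-call; a hedged sketch of how such a call typically completes in RD-Agent (keyword names are assumptions, not read from this diff):

resp_dict = json.loads(
    APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,      # keyword names assumed, not shown in this hunk
        system_prompt=system_prompt,
        json_mode=True,               # assumed; json_mode handling is visible in litellm.py
    )
)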

rdagent/scenarios/data_science/dev/prompts.yaml

Lines changed: 7 additions & 3 deletions
@@ -15,7 +15,9 @@ exp_feedback:
       - Recommend corrective actions explicitly.
       - Set `"Replace Best Result": "no"`.
       - Begin your `reasoning` with `[Submission format error]`, clearly stating the issues causing experiment failure.
-      - If submission passes, proceed to Step 2.
+      - If submission passes the submission format check:
+        - If this is the first valid submission ever, set `"Replace Best Result": "yes"`.
+        - Otherwise, proceed to Step 2.

     Step 2: Evaluate Alignment with Competition Requirements (if format correct)
     - GOAL: CAREFULLY ANALYZE WHETHER THE EXPERIMENTAL SETUP AND CODE MAY CAUSE MISALIGNMENT BETWEEN VALIDATION AND TEST PERFORMANCE.
@@ -59,6 +61,8 @@ exp_feedback:
     Provide detailed and constructive feedback structured as follows:
     Example JSON Structure for Result Analysis:
     {
+      "Submission Format Check": "yes or no",
+      "First Valid Submission": "yes or no",
       "Observations": "Clearly summarize current and SOTA ensemble results with exact scores and notable patterns. Limit to no more than three concise, data-focused sentences. Your observation must be grounded by explicit evidence from scenario description or code implementation, not just validation scores.",
       "Feedback for Hypothesis": "Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
       "Evaluation Aligned With Task": "yes or no",
@@ -110,11 +114,11 @@ exp_feedback:
     {{ cur_exp.experiment_workspace.all_codes }}

     ## Feedback of past experiments
-    {{ feedback_desc }}
+    {{ feedback_desc or "There has not been any experiments yet." }}
     Please refer to these hypotheses and feedback to help you recommend new experiment and hypothesis

     Tips:
-    - Step 1: If submission format has issues, prioritize fixing them before proceeding.
+    - Step 1: If submission format has issues, prioritize fixing them before proceeding. If the format is correct and it's the first valid submission ever (there has never been valid submissions in the past), set `"Replace Best Result": "yes"`. If the format is correct and this is not the first valid submission, proceed to Step 2.
     - Step 2: If evaluation alignment issues are identified (validation approach does not follow competition requirements), address these methodological discrepancies immediately.
     - Step 3: If new results significantly worse than SOTA, or repeated hyperparameter adjustments yield no improvement, it might be time to rethink or shift focus.
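The two new keys land in the same JSON reply that feedback.py decodes. A hedged sketch of consuming them (key names come from the template above; the surrounding logic is an assumption):

# resp_dict is the decoded feedback reply (see feedback.py above)
format_ok = resp_dict.get("Submission Format Check") == "yes"
first_valid = resp_dict.get("First Valid Submission") == "yes"
if format_ok and first_valid:
    # per the updated Step 1, the first valid submission always becomes the best result
    assert resp_dict.get("Replace Best Result") == "yes"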

rdagent/scenarios/data_science/loop.py

Lines changed: 1 addition & 4 deletions
@@ -90,14 +90,11 @@ def _get_exp_gen(class_uri: str, scen: Scenario):
     from rdagent.scenarios.data_science.proposal.exp_gen.proposal import (
         DSProposalV1ExpGen,
         DSProposalV2ExpGen,
-        DSProposalV3ExpGen,
     )

     if class_uri == "rdagent.scenarios.data_science.proposal.exp_gen.DSExpGen":
-        if DS_RD_SETTING.proposal_version not in ["v1", "v2", "v3"]:
+        if DS_RD_SETTING.proposal_version not in ["v1", "v2"]:
             return import_class(DS_RD_SETTING.proposal_version)(scen=scen)
-        if DS_RD_SETTING.proposal_version == "v3":
-            return DSProposalV3ExpGen(scen=scen)
         if DS_RD_SETTING.proposal_version == "v1":
             return DSProposalV1ExpGen(scen=scen)
         if DS_RD_SETTING.proposal_version == "v2":
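Any proposal_version other than "v1" or "v2" is now treated as a dotted class URI and loaded dynamically. A self-contained sketch of what an import_class helper like this typically does (assumed behavior, not this repo's exact implementation):

import importlib

def import_class(class_uri: str):
    # resolve "pkg.module.ClassName" by importing the module and fetching the attribute
    module_path, class_name = class_uri.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), class_name)

# usage mirroring the dispatch above (the URI is purely illustrative):
# import_class("my_pkg.my_exp_gen.MyExpGen")(scen=scen)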
