feat: add code change summary (#1000)

you-n-g · web-flow · commit 937ec263b215 · 2025-06-29T13:40:15.000+08:00
* feat: add code change summary and dict_get_with_warning util

* feat: support code_change_summary in feedback classes

* lint

* feat: validate response_format using BaseModel and warn unknown formats
diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py
@@ -55,6 +55,7 @@ def __init__(
         self,
         reason: str,
         *,
+        code_change_summary: str | None = None,
         decision: bool,
         exception: Exception | None = None,
     ) -> None:
@@ -65,12 +66,16 @@ def __init__(
         self.exception: Exception | None = (
             exception  # if the experiment raises exception, it will be integrated into part of the feedback.
         )
+        self.code_change_summary = code_change_summary
 
     def __bool__(self) -> bool:
         return self.decision
 
     def __str__(self) -> str:
-        return f"Decision: {self.decision}\nReason: {self.reason}"
+        res = f"Decision: {self.decision}\nReason: {self.reason}"
+        if self.code_change_summary is not None:
+            res += "\nCode Change Summary: " + self.code_change_summary
+        return res
 
     @classmethod
     def from_exception(cls, e: Exception) -> ExperimentFeedback:
@@ -88,9 +93,10 @@ def __init__(
         new_hypothesis: str,
         reason: str,
         *,
+        code_change_summary: str | None = None,
         decision: bool,
     ) -> None:
-        super().__init__(reason, decision=decision)
+        super().__init__(reason, decision=decision, code_change_summary=code_change_summary)
         self.observations = observations
         self.hypothesis_evaluation = hypothesis_evaluation
         self.new_hypothesis = new_hypothesis
diff --git a/rdagent/log/utils/__init__.py b/rdagent/log/utils/__init__.py
@@ -3,7 +3,7 @@
 import re
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Optional, TypedDict, cast
+from typing import Any, Optional, TypedDict, cast
 
 
 class LogColors:
@@ -112,3 +112,18 @@ def gen_datetime(dt: datetime | None = None) -> datetime:
     if dt is None:
         return datetime.now(timezone.utc)
     return dt.astimezone(timezone.utc)
+
+
+def dict_get_with_warning(d: dict, key: str, default: Any = None) -> Any:
+    """
+    Motivation:
+    - When handling the response from the LLM, we may use dict.get to fetch a value.
+    - This function prevents falling back to the default value **silently**.
+    - Instead, it logs a warning message.
+    """
+    from rdagent.log import rdagent_logger as logger
+
+    if key not in d:
+        logger.warning(f"Key {key} not found in {d}")
+        return default
+    return d[key]
diff --git a/rdagent/oai/backend/base.py b/rdagent/oai/backend/base.py
@@ -12,7 +12,7 @@
 from typing import Any, Optional, cast
 
 import pytz
-from pydantic import TypeAdapter
+from pydantic import BaseModel, TypeAdapter
 
 from rdagent.core.exception import PolicyError
 from rdagent.core.utils import LLM_CACHE_SEED_GEN, SingletonBaseClass
@@ -275,6 +275,11 @@ def build_messages_and_create_chat_completion(  # type: ignore[no-untyped-def]
         *args,
         **kwargs,
     ) -> str:
+        """
+        Responsible for building messages and logging messages
+
+        TODO: What is weird is that the function is called before we separate embeddings and chat completion.
+        """
         if former_messages is None:
             former_messages = []
         messages = self._build_messages(
@@ -463,6 +468,7 @@ def _create_chat_completion_auto_continue(
             match = re.search(r"<think>(.*?)</think>(.*)", all_response, re.DOTALL)
             _, all_response = match.groups() if match else ("", all_response)
 
+        # 3) format checking
         if json_mode:
             try:
                 json.loads(all_response)
@@ -472,6 +478,12 @@ def _create_chat_completion_auto_continue(
                 json.loads(all_response)
         if json_target_type is not None:
             TypeAdapter(json_target_type).validate_json(all_response)
+        if (response_format := kwargs.get("response_format")) is not None:
+            if issubclass(response_format, BaseModel):
+                # Instantiation may raise pydantic's ValidationError if fields are invalid, or TypeError if the JSON is not an object
+                response_format(**json.loads(all_response))
+            else:
+                logger.warning(f"Unknown response_format: {response_format}, skipping validation.")
         if self.dump_chat_cache:
             self.cache.chat_set(input_content_json, all_response)
         return all_response
diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py
@@ -9,6 +9,7 @@
     ExperimentFeedback,
     HypothesisFeedback,
 )
+from rdagent.log.utils import dict_get_with_warning
 from rdagent.oai.llm_utils import APIBackend
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
 from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace
@@ -114,11 +115,14 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
         # Currently, we do not use `observations`, `hypothesis_evaluation`, and `new_hypothesis` in the framework.
         # `new_hypothesis` should not exist in the feedback.
         hypothesis_feedback = HypothesisFeedback(
-            observations=resp_dict.get("Observations", "No observations provided"),
-            hypothesis_evaluation=resp_dict.get("Feedback for Hypothesis", "No feedback provided"),
-            new_hypothesis=resp_dict.get("New Hypothesis", "No new hypothesis provided"),
-            reason=resp_dict.get("Reasoning", "No reasoning provided"),
-            decision=convert2bool(resp_dict.get("Replace Best Result", "no")),
+            observations=dict_get_with_warning(resp_dict, "Observations", "No observations provided"),
+            hypothesis_evaluation=dict_get_with_warning(resp_dict, "Feedback for Hypothesis", "No feedback provided"),
+            new_hypothesis=dict_get_with_warning(resp_dict, "New Hypothesis", "No new hypothesis provided"),
+            reason=dict_get_with_warning(resp_dict, "Reasoning", "No reasoning provided"),
+            code_change_summary=dict_get_with_warning(
+                resp_dict, "Code Change Summary", "No code change summary provided"
+            ),
+            decision=convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no")),
         )
 
         if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:
diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml
@@ -5,7 +5,7 @@ exp_feedback:
     Below is a detailed description of the current Kaggle competition scenario:
     {{ scenario }}
 
-    Your task is to analyze the current experiment's hypothesis, implementation (code), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
+    Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
 
     Step-by-step Analysis Process:
 
@@ -63,6 +63,7 @@ exp_feedback:
     {
       "Submission Format Check": "yes or no",
       "First Valid Submission": "yes or no",
+      "Code Change Summary": "Clearly summarize the changes made to the code (please cover the most important changes while being concise); during development, extra modifications may be made beyond the intent of the hypothesis, so these changes should also be included to provide complete information",
       "Observations": "Clearly summarize current and SOTA ensemble results with exact scores and notable patterns. Limit to no more than three concise, data-focused sentences. Your observation must be grounded by explicit evidence from scenario description or code implementation, not just validation scores.",
       "Feedback for Hypothesis": Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
       "Evaluation Aligned With Task": "yes or no",
diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml
@@ -32,10 +32,11 @@ describe: # some template to describe some object
     {% if exp_and_feedback and exp_and_feedback|length > 1 %}
     ## {{heading | default('Previous trial and feedback')}}
     {% if exp_and_feedback[0].hypothesis %}
-    the experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }}
+    The experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }}
     {% endif %}
-    feedback decision: {{ exp_and_feedback[1].decision }}
-    reason: {{ exp_and_feedback[1].reason }}
+    Feedback decision: {{ exp_and_feedback[1].decision }}
+    {% if exp_and_feedback[1].code_change_summary  %}Code change summary: {{ exp_and_feedback[1].code_change_summary }}{% endif %}
+    Reason: {{ exp_and_feedback[1].reason }}
     {% endif %}
 
   trace: |-
@@ -47,6 +48,7 @@ describe: # some template to describe some object
     Target Problem: {{ exp_and_feedback[0].hypothesis.problem_desc }}
     {% if not pipeline %}Chosen Component: {{ exp_and_feedback[0].hypothesis.component }}{% endif %}
     Proposed Hypothesis: {{ exp_and_feedback[0].hypothesis.hypothesis }}
+    {% if exp_and_feedback[1].code_change_summary  %}Code Change Summary: {{ exp_and_feedback[1].code_change_summary }}{% endif %}
     Surpass Previous SOTA: {{ exp_and_feedback[1].decision }}
     {% if exp_and_feedback[0].result is none %}
     Experiment Score: Running buggy