Skip to content

Commit 937ec26

Browse files
authored
feat: add code change summary (#1000)
* feat: add code change summary and dict_get_with_warning util * feat: support code_change_summary in feedback classes * lint * feat: validate response_format using BaseModel and warn unknown formats
1 parent 1122c04 commit 937ec26

File tree

6 files changed

+53
-13
lines changed

6 files changed

+53
-13
lines changed

rdagent/core/proposal.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def __init__(
5555
self,
5656
reason: str,
5757
*,
58+
code_change_summary: str | None = None,
5859
decision: bool,
5960
exception: Exception | None = None,
6061
) -> None:
@@ -65,12 +66,16 @@ def __init__(
6566
self.exception: Exception | None = (
6667
exception # if the experiment raises exception, it will be integrated into part of the feedback.
6768
)
69+
self.code_change_summary = code_change_summary
6870

6971
def __bool__(self) -> bool:
7072
return self.decision
7173

7274
def __str__(self) -> str:
73-
return f"Decision: {self.decision}\nReason: {self.reason}"
75+
res = f"Decision: {self.decision}\nReason: {self.reason}"
76+
if self.code_change_summary is not None:
77+
res += "\nCode Change Summary: " + self.code_change_summary
78+
return res
7479

7580
@classmethod
7681
def from_exception(cls, e: Exception) -> ExperimentFeedback:
@@ -88,9 +93,10 @@ def __init__(
8893
new_hypothesis: str,
8994
reason: str,
9095
*,
96+
code_change_summary: str | None = None,
9197
decision: bool,
9298
) -> None:
93-
super().__init__(reason, decision=decision)
99+
super().__init__(reason, decision=decision, code_change_summary=code_change_summary)
94100
self.observations = observations
95101
self.hypothesis_evaluation = hypothesis_evaluation
96102
self.new_hypothesis = new_hypothesis

rdagent/log/utils/__init__.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import re
44
from datetime import datetime, timezone
55
from pathlib import Path
6-
from typing import Optional, TypedDict, cast
6+
from typing import Any, Optional, TypedDict, cast
77

88

99
class LogColors:
@@ -112,3 +112,18 @@ def gen_datetime(dt: datetime | None = None) -> datetime:
112112
if dt is None:
113113
return datetime.now(timezone.utc)
114114
return dt.astimezone(timezone.utc)
115+
116+
117+
def dict_get_with_warning(d: dict, key: str, default: Any = None) -> Any:
118+
"""
119+
Motivation:
120+
- When handling the response from the LLM, we may use `dict.get` to get the value.
121+
- This function prevents falling back to the default value **silently**.
122+
- Instead, it will log a warning message.
123+
"""
124+
from rdagent.log import rdagent_logger as logger
125+
126+
if key not in d:
127+
logger.warning(f"Key {key} not found in {d}")
128+
return default
129+
return d[key]

rdagent/oai/backend/base.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from typing import Any, Optional, cast
1313

1414
import pytz
15-
from pydantic import TypeAdapter
15+
from pydantic import BaseModel, TypeAdapter
1616

1717
from rdagent.core.exception import PolicyError
1818
from rdagent.core.utils import LLM_CACHE_SEED_GEN, SingletonBaseClass
@@ -275,6 +275,11 @@ def build_messages_and_create_chat_completion( # type: ignore[no-untyped-def]
275275
*args,
276276
**kwargs,
277277
) -> str:
278+
"""
279+
Responsible for building messages and logging messages.
280+
281+
TODO: It is odd that this function is called before we separate embeddings and chat completion.
282+
"""
278283
if former_messages is None:
279284
former_messages = []
280285
messages = self._build_messages(
@@ -463,6 +468,7 @@ def _create_chat_completion_auto_continue(
463468
match = re.search(r"<think>(.*?)</think>(.*)", all_response, re.DOTALL)
464469
_, all_response = match.groups() if match else ("", all_response)
465470

471+
# 3) format checking
466472
if json_mode:
467473
try:
468474
json.loads(all_response)
@@ -472,6 +478,12 @@ def _create_chat_completion_auto_continue(
472478
json.loads(all_response)
473479
if json_target_type is not None:
474480
TypeAdapter(json_target_type).validate_json(all_response)
481+
if (response_format := kwargs.get("response_format")) is not None:
482+
if issubclass(response_format, BaseModel):
483+
# Raises pydantic.ValidationError if the response does not match the schema (TypeError if the JSON is not an object).
484+
response_format(**json.loads(all_response))
485+
else:
486+
logger.warning(f"Unknown response_format: {response_format}, skipping validation.")
475487
if self.dump_chat_cache:
476488
self.cache.chat_set(input_content_json, all_response)
477489
return all_response

rdagent/scenarios/data_science/dev/feedback.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
ExperimentFeedback,
1010
HypothesisFeedback,
1111
)
12+
from rdagent.log.utils import dict_get_with_warning
1213
from rdagent.oai.llm_utils import APIBackend
1314
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
1415
from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace
@@ -114,11 +115,14 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
114115
# Currently, we do not use `observations`, `hypothesis_evaluation`, and `new_hypothesis` in the framework.
115116
# `new_hypothesis` should not exist in the feedback.
116117
hypothesis_feedback = HypothesisFeedback(
117-
observations=resp_dict.get("Observations", "No observations provided"),
118-
hypothesis_evaluation=resp_dict.get("Feedback for Hypothesis", "No feedback provided"),
119-
new_hypothesis=resp_dict.get("New Hypothesis", "No new hypothesis provided"),
120-
reason=resp_dict.get("Reasoning", "No reasoning provided"),
121-
decision=convert2bool(resp_dict.get("Replace Best Result", "no")),
118+
observations=dict_get_with_warning(resp_dict, "Observations", "No observations provided"),
119+
hypothesis_evaluation=dict_get_with_warning(resp_dict, "Feedback for Hypothesis", "No feedback provided"),
120+
new_hypothesis=dict_get_with_warning(resp_dict, "New Hypothesis", "No new hypothesis provided"),
121+
reason=dict_get_with_warning(resp_dict, "Reasoning", "No reasoning provided"),
122+
code_change_summary=dict_get_with_warning(
123+
resp_dict, "Code Change Summary", "No code change summary provided"
124+
),
125+
decision=convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no")),
122126
)
123127

124128
if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:

rdagent/scenarios/data_science/dev/prompts.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ exp_feedback:
55
Below is a detailed description of the current Kaggle competition scenario:
66
{{ scenario }}
77
8-
Your task is to analyze the current experiment's hypothesis, implementation (code), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
8+
Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
99
1010
Step-by-step Analysis Process:
1111
@@ -63,6 +63,7 @@ exp_feedback:
6363
{
6464
"Submission Format Check": "yes or no",
6565
"First Valid Submission": "yes or no",
66+
"Code Change Summary": "Clearly summarize the changes made to the code (please cover the most important changes while being concise); during development, extra modifications may be made beyond the intent of the hypothesis, so these changes should also be included to provide complete information",
6667
"Observations": "Clearly summarize current and SOTA ensemble results with exact scores and notable patterns. Limit to no more than three concise, data-focused sentences. Your observation must be grounded by explicit evidence from scenario description or code implementation, not just validation scores.",
6768
"Feedback for Hypothesis": "Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
6869
"Evaluation Aligned With Task": "yes or no",

rdagent/scenarios/data_science/share.yaml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,11 @@ describe: # some template to describe some object
3232
{% if exp_and_feedback and exp_and_feedback|length > 1 %}
3333
## {{heading | default('Previous trial and feedback')}}
3434
{% if exp_and_feedback[0].hypothesis %}
35-
the experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }}
35+
The experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }}
3636
{% endif %}
37-
feedback decision: {{ exp_and_feedback[1].decision }}
38-
reason: {{ exp_and_feedback[1].reason }}
37+
Feedback decision: {{ exp_and_feedback[1].decision }}
38+
{% if exp_and_feedback[1].code_change_summary %}Code change summary: {{ exp_and_feedback[1].code_change_summary }}{% endif %}
39+
Reason: {{ exp_and_feedback[1].reason }}
3940
{% endif %}
4041
4142
trace: |-
@@ -47,6 +48,7 @@ describe: # some template to describe some object
4748
Target Problem: {{ exp_and_feedback[0].hypothesis.problem_desc }}
4849
{% if not pipeline %}Chosen Component: {{ exp_and_feedback[0].hypothesis.component }}{% endif %}
4950
Proposed Hypothesis: {{ exp_and_feedback[0].hypothesis.hypothesis }}
51+
{% if exp_and_feedback[1].code_change_summary %}Code Change Summary: {{ exp_and_feedback[1].code_change_summary }}{% endif %}
5052
Surpass Previous SOTA: {{ exp_and_feedback[1].decision }}
5153
{% if exp_and_feedback[0].result is none %}
5254
Experiment Score: Running buggy

0 commit comments

Comments
 (0)