feat: add a rag mcp in proposal (#1267)

XianBW · you-n-g · web-flow · commit a0cd1025c141 · 2025-10-20T16:46:25.000+08:00
* add simple rag mcp

* add rag_agent in expGen v2

* add conf config for research rag

* fix CI

* refactor: move context7 and rag config files to new conf modules

* make rag agent general

* fix CI

---------

Co-authored-by: Young <afe.young@gmail.com>
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
@@ -175,6 +175,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     enable_generate_unique_hypothesis: bool = False
     """Enable generate unique hypothesis. If True, generate unique hypothesis for each component. If False, generate unique hypothesis for each component."""
 
+    enable_research_rag: bool = False
+    """Enable research RAG for hypothesis generation."""
+
     #### hypothesis critique and rewrite
     enable_hypo_critique_rewrite: bool = False
     """Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
diff --git a/rdagent/components/agent/context7/__init__.py b/rdagent/components/agent/context7/__init__.py
@@ -3,7 +3,7 @@
 from pydantic_ai.mcp import MCPServerStreamableHTTP
 
 from rdagent.components.agent.base import PAIAgent
-from rdagent.components.agent.mcp.context7 import SETTINGS
+from rdagent.components.agent.context7.conf import SETTINGS
 from rdagent.log import rdagent_logger as logger
 from rdagent.utils.agent.tpl import T
 
diff --git a/rdagent/components/agent/context7/conf.py b/rdagent/components/agent/context7/conf.py
diff --git a/rdagent/components/agent/rag/__init__.py b/rdagent/components/agent/rag/__init__.py
@@ -0,0 +1,17 @@
+from pydantic_ai.mcp import MCPServerStreamableHTTP
+
+from rdagent.components.agent.base import PAIAgent
+from rdagent.components.agent.rag.conf import SETTINGS
+from rdagent.utils.agent.tpl import T
+
+
+class Agent(PAIAgent):
+    """
+    RAG agent whose only toolset is a streamable-HTTP MCP server at SETTINGS.url (timeout SETTINGS.timeout); a default RAG system prompt is used when `system_prompt` is None.
+    """
+
+    def __init__(self, system_prompt: str | None = None):
+        toolsets = [MCPServerStreamableHTTP(SETTINGS.url, timeout=SETTINGS.timeout)]
+        if system_prompt is None:
+            system_prompt = "You are a Retrieval-Augmented Generation (RAG) agent. Use the retrieved documents to answer the user's queries accurately and concisely."
+        super().__init__(system_prompt=system_prompt, toolsets=toolsets)
diff --git a/rdagent/components/agent/rag/conf.py b/rdagent/components/agent/rag/conf.py
@@ -0,0 +1,22 @@
+"""
+Settings for RAG agent.
+
+TODO: document how to run the RAG MCP server
+"""
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    """Connection settings (URL and timeout) for the RAG MCP server; environment variables use the `RAG_` prefix."""
+
+    url: str = "http://localhost:8124/mcp"
+    timeout: int = 120
+
+    model_config = SettingsConfigDict(
+        env_prefix="RAG_",
+        # extra="allow", # NOTE(review): would this permit extra (undeclared) settings? Unconfirmed.
+    )
+
+
+SETTINGS = Settings()
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -294,6 +294,11 @@ hypothesis_gen:
     # Identified Challenges{% if enable_idea_pool %} with Sampled Ideas{% endif %}
     {{ problems }}
 
+    {% if knowledge %}
+    # Some reference knowledge from the community
+    {{ knowledge }}
+    {% endif %}
+
 hypothesis_critique:
   system: |-
     {% include "scenarios.data_science.share:scen.role" %}
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -9,6 +9,7 @@
 from pydantic import BaseModel, Field
 
 from rdagent.app.data_science.conf import DS_RD_SETTING
+from rdagent.components.agent.rag import Agent as RAGAgent
 from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
 from rdagent.components.coder.data_science.feature.exp import FeatureTask
 from rdagent.components.coder.data_science.model.exp import ModelTask
@@ -645,12 +646,24 @@ def hypothesis_gen(
             sibling_hypotheses=sibling_hypotheses,
             former_user_instructions_str=str(former_user_instructions) if former_user_instructions else None,
         )
+
+        # knowledge retrieval
+        if DS_RD_SETTING.enable_research_rag:
+            rag_agent = RAGAgent(
+                system_prompt="""You are a helpful assistant.
+You help users retrieve relevant knowledge from community discussions and public code."""
+            )
+            knowledge = rag_agent.query(problem_formatted_str)
+        else:
+            knowledge = None
+
         user_prompt = T(".prompts_v2:hypothesis_gen.user").r(
             scenario_desc=scenario_desc,
             exp_and_feedback_list_desc=exp_feedback_list_desc,
             sota_exp_desc=sota_exp_desc,
             problems=problem_formatted_str,
             enable_idea_pool=enable_idea_pool,
+            knowledge=knowledge,
         )
         response = APIBackend().build_messages_and_create_chat_completion(
             user_prompt=user_prompt,