Skip to content

Commit fd3c0fd

Browse files
authored
feat: Initial version of Graph RAG in KAGGLE scenario (#301)
* Initial version of Graph RAG in KAGGLE scenario * fix CI * fix a small bug * fix CI * fix CI * fix CI
1 parent 4ecf25f commit fd3c0fd

File tree

22 files changed

+382
-118
lines changed

22 files changed

+382
-118
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ mypy:
9797
# First deal with the core folder, and then gradually increase the scope of detection,
9898
# and eventually realize the detection of the complete project.
9999
ruff:
100-
$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002 # --exclude rdagent/scripts,git_ignore_folder
100+
$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002,I001 # --exclude rdagent/scripts,git_ignore_folder
101101

102102
# Check lint with toml-sort.
103103
toml-sort:

rdagent/app/kaggle/conf.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@ class Config:
1616
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
1717
"""Scenario class for data mining model"""
1818

19+
knowledge_base: str = "" # TODO enable this line to use the knowledge base
20+
# knowledge_base: str = "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph"
21+
"""Knowledge base class"""
22+
23+
knowledge_base_path: str = "kg_graph.pkl"
24+
"""Knowledge base path"""
25+
1926
hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
2027
"""Hypothesis generation class"""
2128

rdagent/app/kaggle/loop.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from rdagent.scenarios.kaggle.proposal.proposal import (
2323
KG_ACTION_FEATURE_ENGINEERING,
2424
KG_ACTION_FEATURE_PROCESSING,
25+
KGTrace,
2526
)
2627

2728

@@ -32,6 +33,13 @@ def __init__(self, PROP_SETTING: BasePropSetting):
3233
scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
3334
logger.log_object(scen, tag="scenario")
3435

36+
knowledge_base = (
37+
import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen)
38+
if PROP_SETTING.knowledge_base != ""
39+
else None
40+
)
41+
logger.log_object(knowledge_base, tag="knowledge_base")
42+
3543
self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
3644
logger.log_object(self.hypothesis_gen, tag="hypothesis generator")
3745

@@ -50,7 +58,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):
5058

5159
self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
5260
logger.log_object(self.summarizer, tag="summarizer")
53-
self.trace = Trace(scen=scen)
61+
self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base)
5462
super(RDLoop, self).__init__()
5563

5664
@measure_time

rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@
2222
)
2323
from rdagent.core.evolving_framework import (
2424
EvolvableSubjects,
25+
EvolvingKnowledgeBase,
2526
EvoStep,
2627
Knowledge,
27-
KnowledgeBase,
2828
QueriedKnowledge,
2929
RAGStrategy,
3030
)
@@ -71,12 +71,13 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
7171
self.failed_task_info_set = failed_task_info_set
7272

7373

74-
class FactorKnowledgeBaseV1(KnowledgeBase):
75-
def __init__(self) -> None:
74+
class FactorKnowledgeBaseV1(EvolvingKnowledgeBase):
75+
def __init__(self, path: str | Path = None) -> None:
7676
self.implementation_trace: dict[str, FactorKnowledge] = dict()
7777
self.success_task_info_set: set[str] = set()
7878

7979
self.task_to_embedding = dict()
80+
super().__init__(path)
8081

8182
def query(self) -> QueriedKnowledge | None:
8283
"""
@@ -746,12 +747,12 @@ def dataset_query(
746747
return factor_implementation_queried_graph_knowledge
747748

748749

749-
class FactorGraphKnowledgeBase(KnowledgeBase):
750-
def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> None:
750+
class FactorGraphKnowledgeBase(EvolvingKnowledgeBase):
751+
def __init__(self, init_component_list=None, path: str | Path = None, data_set_knowledge_path=None) -> None:
751752
"""
752753
Load knowledge, offer brief information of knowledge and common handle interfaces
753754
"""
754-
self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl")
755+
self.graph: UndirectedGraph = UndirectedGraph(Path.cwd() / "graph.pkl")
755756
logger.info(f"Knowledge Graph loaded, size={self.graph.size()}")
756757

757758
if init_component_list:
@@ -780,6 +781,7 @@ def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> No
780781
if data_set_knowledge_path:
781782
with open(data_set_knowledge_path, "r") as f:
782783
self.data_set_knowledge_dict = json.load(f)
784+
super().__init__(path)
783785

784786
def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]:
785787
return self.graph.get_all_nodes_by_label(label)

rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1+
from pathlib import Path
2+
13
from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS
24
from rdagent.components.coder.model_coder.CoSTEER.evaluators import ModelCoderFeedback
35
from rdagent.components.coder.model_coder.model import ModelTask
46
from rdagent.core.evolving_framework import (
57
EvolvableSubjects,
8+
EvolvingKnowledgeBase,
69
EvoStep,
710
Knowledge,
8-
KnowledgeBase,
911
QueriedKnowledge,
1012
RAGStrategy,
1113
)
@@ -49,13 +51,15 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
4951
self.working_task_to_similar_successful_knowledge_dict = dict()
5052

5153

52-
class ModelKnowledgeBase(KnowledgeBase):
53-
def __init__(self) -> None:
54+
class ModelKnowledgeBase(EvolvingKnowledgeBase):
55+
def __init__(self, path: str | Path = None) -> None:
5456
self.implementation_trace: dict[str, ModelKnowledge] = dict()
5557
self.success_task_info_set: set[str] = set()
5658

5759
self.task_to_embedding = dict()
5860

61+
super().__init__(path)
62+
5963
def query(self) -> QueriedKnowledge | None:
6064
"""
6165
Query the knowledge base to get the queried knowledge. So far is handled in RAG strategy.

rdagent/components/knowledge_management/graph.py

Lines changed: 4 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
VectorBase,
1313
cosine,
1414
)
15+
from rdagent.core.knowledge_base import KnowledgeBase
1516
from rdagent.oai.llm_utils import APIBackend
1617

1718
Node = KnowledgeMetaData
@@ -47,14 +48,14 @@ def __repr__(self) -> str:
4748
)
4849

4950

50-
class Graph:
51+
class Graph(KnowledgeBase):
5152
"""
5253
base Graph class for Knowledge Graph Search
5354
"""
5455

5556
def __init__(self, path: str | Path | None = None) -> None:
56-
self.path = path
5757
self.nodes = {}
58+
super().__init__(path=path)
5859

5960
def size(self) -> int:
6061
return len(self.nodes)
@@ -77,22 +78,6 @@ def find_node(self, content: str, label: str) -> Node | None:
7778
return node
7879
return None
7980

80-
@classmethod
81-
def load(cls: type[Graph], path: str | Path) -> Graph:
82-
"""use pickle as the default load method"""
83-
path = path if isinstance(path, Path) else Path(path)
84-
if not path.exists():
85-
return cls(path=path)
86-
87-
with path.open("rb") as f:
88-
return pickle.load(f)
89-
90-
def save(self, path: str | Path) -> None:
91-
"""use pickle as the default save method"""
92-
Path.mkdir(path.parent, exist_ok=True)
93-
with path.open("wb") as f:
94-
pickle.dump(self, f)
95-
9681
@staticmethod
9782
def batch_embedding(nodes: list[Node]) -> list[Node]:
9883
contents = [node.content for node in nodes]
@@ -119,8 +104,8 @@ class UndirectedGraph(Graph):
119104
"""
120105

121106
def __init__(self, path: str | Path | None = None) -> None:
122-
super().__init__(path=path)
123107
self.vector_base: VectorBase = PDVectorBase()
108+
super().__init__(path=path)
124109

125110
def __str__(self) -> str:
126111
return f"UndirectedGraph(nodes={self.nodes})"
@@ -174,16 +159,6 @@ def add_node(
174159

175160
node.add_neighbor(neighbor)
176161

177-
@classmethod
178-
def load(cls: type[UndirectedGraph], path: str | Path) -> UndirectedGraph:
179-
"""use pickle as the default load method"""
180-
path = path if isinstance(path, Path) else Path(path)
181-
if not path.exists():
182-
return cls(path=path)
183-
184-
with path.open("rb") as f:
185-
return pickle.load(f)
186-
187162
def add_nodes(self, node: UndirectedNode, neighbors: list[UndirectedNode]) -> None:
188163
if not neighbors:
189164
self.add_node(node)

rdagent/components/knowledge_management/vector_base.py

Lines changed: 5 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pandas as pd
66
from scipy.spatial.distance import cosine
77

8+
from rdagent.core.knowledge_base import KnowledgeBase
89
from rdagent.log import rdagent_logger as logger
910
from rdagent.oai.llm_utils import APIBackend
1011

@@ -68,14 +69,11 @@ def contents_to_documents(contents: List[str], label: str = None) -> List[Docume
6869
return docs
6970

7071

71-
class VectorBase:
72+
class VectorBase(KnowledgeBase):
7273
"""
7374
This class is used for handling vector storage and query
7475
"""
7576

76-
def __init__(self, vector_df_path: Union[str, Path] = None, **kwargs):
77-
pass
78-
7977
def add(self, document: Union[Document, List[Document]]):
8078
"""
8179
add new node to vector_df
@@ -104,28 +102,15 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
104102
"""
105103
pass
106104

107-
def load(self, **kwargs):
108-
"""load vector_df"""
109-
110-
def save(self, **kwargs):
111-
"""save vector_df"""
112-
113105

114106
class PDVectorBase(VectorBase):
115107
"""
116108
Implement of VectorBase using Pandas
117109
"""
118110

119-
def __init__(self, vector_df_path: Union[str, Path] = None):
120-
super().__init__(vector_df_path)
121-
122-
if vector_df_path:
123-
try:
124-
self.vector_df = self.load(vector_df_path)
125-
except FileNotFoundError:
126-
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
127-
else:
128-
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
111+
def __init__(self, path: Union[str, Path] = None):
112+
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
113+
super().__init__(path)
129114

130115
def shape(self):
131116
return self.vector_df.shape
@@ -196,10 +181,3 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
196181
for _, similar_docs in most_similar_docs.iterrows():
197182
docs.append(Document().from_dict(similar_docs.to_dict()))
198183
return docs, searched_similarities.to_list()
199-
200-
def load(self, vector_df_path, **kwargs):
201-
vector_df = pd.read_pickle(vector_df_path)
202-
return vector_df
203-
204-
def save(self, vector_df_path, **kwargs):
205-
self.vector_df.to_pickle(vector_df_path)

rdagent/components/workflow/conf.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ class Config:
1414
"""
1515

1616
scen: str = ""
17+
knowledge_base: str = ""
18+
knowledge_base_path: str = ""
1719
hypothesis_gen: str = ""
1820
hypothesis2experiment: str = ""
1921
coder: str = ""

rdagent/core/evolving_framework.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from dataclasses import dataclass
66
from typing import TYPE_CHECKING, Any
77

8+
from rdagent.core.knowledge_base import KnowledgeBase
9+
810
if TYPE_CHECKING:
911
from rdagent.core.evaluation import Feedback
1012
from rdagent.core.scenario import Scenario
@@ -18,7 +20,7 @@ class QueriedKnowledge:
1820
pass
1921

2022

21-
class KnowledgeBase(ABC):
23+
class EvolvingKnowledgeBase(KnowledgeBase):
2224
@abstractmethod
2325
def query(
2426
self,
@@ -78,7 +80,7 @@ def evolve(
7880
class RAGStrategy(ABC):
7981
"""Retrieval Augmentation Generation Strategy"""
8082

81-
def __init__(self, knowledgebase: KnowledgeBase) -> None:
83+
def __init__(self, knowledgebase: EvolvingKnowledgeBase) -> None:
8284
self.knowledgebase = knowledgebase
8385

8486
@abstractmethod

rdagent/core/knowledge_base.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from pathlib import Path
2+
3+
import dill as pickle # type: ignore[import-untyped]
4+
5+
from rdagent.log import rdagent_logger as logger
6+
7+
8+
class KnowledgeBase:
9+
def __init__(self, path: str | Path | None = None) -> None:
10+
self.path = Path(path) if path else None
11+
self.load()
12+
13+
def load(self) -> None:
14+
if self.path is not None and self.path.exists():
15+
with self.path.open("rb") as f:
16+
self.__dict__.update(
17+
pickle.load(f).__dict__,
18+
) # TODO: because we need to align with init function, we need a less hacky way to do this
19+
20+
def dump(self) -> None:
21+
if self.path is not None:
22+
self.path.parent.mkdir(parents=True, exist_ok=True)
23+
pickle.dump(self, self.path.open("wb"))
24+
else:
25+
logger.warning("KnowledgeBase path is not set, dump failed.")

0 commit comments

Comments
 (0)