Skip to content

Commit be0e48a

Browse files
authored
feat: Integrate RAG into the Kaggle scenarios. (#262)
* init a scenario for kaggle feature engineering * add 1st version rag for Kaggle hypo * refine the code * fix a bug * add the process of extracting exp from docs * Remove the unnecessary file. * refine the code for ci test * Delete rdagent/app/kaggle_feature/conf.py * refine the comments * Update extract_experience_from_docs.py * Update extract_experience_from_docs.py
1 parent 4523b93 commit be0e48a

File tree

3 files changed

+324
-0
lines changed

3 files changed

+324
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import json
2+
import os
3+
from pathlib import Path
4+
5+
from jinja2 import Environment, StrictUndefined
6+
7+
from rdagent.core.prompts import Prompts
8+
from rdagent.oai.llm_utils import APIBackend
9+
10+
# Prompt templates for Kaggle knowledge extraction, loaded from the sibling prompts.yaml.
prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
11+
12+
13+
def process_with_gpt(content: str) -> dict:
    """Summarize one Kaggle experience document with the LLM.

    Renders the system/user templates from prompts.yaml, sends them to the
    chat backend in JSON mode, and parses the reply.

    Parameters
    ----------
    content : str
        Raw text of a single Kaggle case file.

    Returns
    -------
    dict
        The parsed JSON analysis, or ``{"error": ...}`` when the model's
        reply is not valid JSON.
    """
    # A single Environment serves both templates; StrictUndefined makes a
    # missing template variable raise instead of silently rendering empty.
    env = Environment(undefined=StrictUndefined)
    sys_prompt = env.from_string(prompt_dict["extract_kaggle_knowledge_prompts"]["system"]).render()
    user_prompt = env.from_string(prompt_dict["extract_kaggle_knowledge_prompts"]["user"]).render(
        file_content=content
    )

    response_analysis = APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=True,
    )

    try:
        response_json_analysis = json.loads(response_analysis)
    except json.JSONDecodeError:
        # Keep the batch pipeline going even when one reply is malformed.
        response_json_analysis = {"error": "Failed to parse LLM's response as JSON"}

    return response_json_analysis
38+
39+
40+
def process_all_case_files(directory_path: str) -> None:
    """Run process_with_gpt over every ``.case`` file in *directory_path*.

    The collected analyses are written to ``kaggle_experience_results.json``
    inside the same directory (overwriting any previous results).

    Parameters
    ----------
    directory_path : str
        Directory containing the ``.case`` files to process.
    """
    directory = Path(directory_path)
    output_file = directory / "kaggle_experience_results.json"
    json_output = []
    # pathlib.glob replaces the former os.listdir/os.path.join mix (Path is
    # already imported); sorted() makes the output order deterministic
    # across filesystems.
    for case_file in sorted(directory.glob("*.case")):
        content = case_file.read_text(encoding="utf-8")
        json_output.append(process_with_gpt(content))

    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(json_output, json_file, ensure_ascii=False)
54+
55+
56+
if __name__ == "__main__":
    # Script entry point: extract experience records from the local case dump.
    case_dir = "git_ignore_folder/experience/tabular_cases_all"
    process_all_case_files(directory_path=case_dir)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Prompt templates used by extract_experience_from_docs.py to summarize
# high-ranking Kaggle notebooks/strategies into structured JSON records.
extract_kaggle_knowledge_prompts:
  system: |-
    You are a Kaggle competition expert with extensive experience in analyzing high-ranking Kaggle notebooks and competition strategies.
    Your task is to summarize or infer key information such as the competition name, task type, and specific techniques employed in the notebook or strategy.
    For each provided content, you are expected to extract valuable insights and organize the analysis in the structured format outlined below.

    Please provide the analysis in the following JSON format:
    {
        "content": "all provided content",
        "title": "extracted title, if available",
        "competition_name": "extracted competition name",
        "task_category": "extracted task type, e.g., Classification, Regression",
        "field": "field of focus, e.g., Feature Engineering, Modeling",
        "ranking": "extracted ranking, if available",
        "score": "extracted score or metric, if available"
    }

  # file_content is filled with the raw text of one notebook/strategy case.
  user: |-
    High-ranking Kaggle notebooks or competition strategies: {{ file_content }}
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
import json  # BUGFIX: was `from _pytest.cacheprovider import json`, an accidental IDE auto-import that breaks when pytest is not installed
import uuid
from pathlib import Path
from typing import List, Tuple, Union

import pandas as pd
from scipy.spatial.distance import cosine

from rdagent.components.knowledge_management.vector_base import (
    KnowledgeMetaData,
    PDVectorBase,
)
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
15+
16+
17+
class KGKnowledgeMetaData(KnowledgeMetaData):
    """
    Class for handling Kaggle competition specific metadata
    """

    def __init__(
        self,
        content: str = "",
        label: str = None,
        embedding=None,
        identity=None,
        competition_name=None,
        task_category=None,
        field=None,
        ranking=None,
        score=None,
        entities=None,
        relations=None,
    ):
        """
        Initialize KGKnowledgeMetaData for Kaggle competition posts

        Parameters:
        ----------
        competition_name: str, optional
            The name of the Kaggle competition.
        task_category: str, required
            The type of task (e.g., classification, regression).
        field: str, optional
            The specific field of knowledge (e.g., feature engineering, modeling).
        ranking: str or int, optional
            The ranking achieved in the competition.
        score: float, optional
            The score or metric achieved in the competition.
        entities: list, optional
            Entities related to the content (for knowledge graph integration).
        relations: list, optional
            Relations between entities (for knowledge graph integration).
        """
        super().__init__(content, label, embedding, identity)
        self.competition_name = competition_name
        self.task_category = task_category  # Task type is required
        self.field = field  # Knowledge field, optional (model/data/others/overall)
        self.ranking = ranking  # Ranking
        # TODO ranking and score might be unified
        self.score = score  # Competition score
        # TODO Perhaps this shouldn't be here?
        self.entities = entities or []  # Entities in the knowledge graph
        self.relations = relations or []  # Relations in the knowledge graph

    def split_into_trunk(self, size: int = 1000, overlap: int = 0):
        """
        Split content into trunks and create embeddings by trunk.

        BUGFIX: ``overlap`` used to be accepted but silently ignored;
        consecutive trunks now share ``overlap`` characters. The default
        of 0 reproduces the previous non-overlapping behaviour.

        Parameters:
        ----------
        size: int, optional
            Maximum number of characters per trunk (default 1000).
        overlap: int, optional
            Number of characters shared between consecutive trunks (default 0).

        #TODO let GPT do the split based on the field of knowledge(data/model/others)
        """
        # max(..., 1) guards against overlap >= size, which would otherwise
        # make the window step non-positive and loop forever.
        step = max(size - overlap, 1)
        self.trunks = [self.content[i : i + size] for i in range(0, len(self.content), step)]
        self.trunks_embedding = APIBackend().create_embedding(input_content=self.trunks)

    def from_dict(self, data: dict):
        """
        Load Kaggle post data from a dictionary; returns self for chaining.
        """
        super().from_dict(data)
        self.competition_name = data.get("competition_name", None)
        self.task_category = data.get("task_category", None)
        self.field = data.get("field", None)
        self.ranking = data.get("ranking", None)
        self.score = data.get("score", None)
        self.entities = data.get("entities", [])
        self.relations = data.get("relations", [])
        return self

    def __repr__(self):
        return (
            f"KGKnowledgeMetaData(id={self.id}, label={self.label}, competition={self.competition_name}, "
            f"task_category={self.task_category}, field={self.field}, ranking={self.ranking}, score={self.score})"
        )
102+
103+
104+
# Backward-compatible alias: call sites refer to one experience record as a "document".
KGDocument = KGKnowledgeMetaData
105+
106+
107+
class KaggleExperienceBase(PDVectorBase):
    """
    Class for handling Kaggle competition experience posts and organizing them for reference
    """

    def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
        """
        Initialize the KaggleExperienceBase class

        Parameters:
        ----------
        vector_df_path: str or Path, optional
            Path to the vector DataFrame for embedding management.
        kaggle_experience_path: str or Path, optional
            Path to the Kaggle experience post data; when given, it is loaded eagerly.
        """
        super().__init__(vector_df_path)
        self.kaggle_experience_path = kaggle_experience_path
        self.kaggle_experience_data = []

        if kaggle_experience_path:
            self.load_kaggle_experience(kaggle_experience_path)

    @staticmethod
    def _to_record(document: KGDocument, content, embedding) -> dict:
        """Build one vector-base row for *document* carrying the given text and embedding."""
        return {
            "id": document.id,
            "label": document.label,
            "content": content,
            "competition_name": document.competition_name,
            "task_category": document.task_category,
            "field": document.field,
            "ranking": document.ranking,
            "score": document.score,
            "embedding": embedding,
        }

    def add(self, document: Union[KGDocument, List[KGDocument]]):
        """
        Add one document (or a list of documents) to the vector base.

        BUGFIX: the signature always advertised List[KGDocument], but passing a
        list crashed on document.split_into_trunk(); lists are now handled.
        BUGFIX: trunk rows used to store the full document content next to the
        trunk embedding; each row now stores its own trunk's text.
        """
        if isinstance(document, list):
            for doc in document:
                self.add(doc)
            return

        document.split_into_trunk()
        # First row: the whole document with its document-level embedding.
        docs = [self._to_record(document, document.content, document.embedding)]
        if len(document.trunks) > 1:
            docs.extend(
                self._to_record(document, trunk, trunk_embedding)
                for trunk, trunk_embedding in zip(document.trunks, document.trunks_embedding)
            )
        self.vector_df = pd.concat([self.vector_df, pd.DataFrame(docs)], ignore_index=True)

    def load_kaggle_experience(self, kaggle_experience_path: Union[str, Path]):
        """
        Load Kaggle experience posts from a JSON or text file

        Parameters:
        ----------
        kaggle_experience_path: str or Path
            Path to the Kaggle experience post data.
        """
        try:
            with open(kaggle_experience_path, "r", encoding="utf-8") as file:
                self.kaggle_experience_data = json.load(file)
            logger.info(f"Kaggle experience data loaded from {kaggle_experience_path}")
        except FileNotFoundError:
            # Best effort: missing data leaves the base empty rather than crashing.
            logger.error(f"Kaggle experience data not found at {kaggle_experience_path}")
            self.kaggle_experience_data = []

    def add_experience_to_vector_base(self):
        """
        Process the Kaggle experience data and add relevant information to the vector base
        """
        for experience in self.kaggle_experience_data:
            document = KGKnowledgeMetaData(
                content=experience.get("content", ""),
                label=experience.get("title", "Kaggle Experience"),
                competition_name=experience.get("competition_name", "Unknown Competition"),
                task_category=experience.get("task_category", "General Task"),
                field=experience.get("field", None),
                ranking=experience.get("ranking", None),
                score=experience.get("score", None),
            )
            document.create_embedding()
            self.add(document)

    def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: float = 0.1):
        """
        Search for Kaggle experience posts related to the query

        Parameters:
        ----------
        query: str
            The search query to find relevant experience posts.
        topk_k: int, optional
            Number of top similar results to return (default is 5).
        similarity_threshold: float, optional
            The similarity threshold for filtering results (default is 0.1).

        Returns:
        -------
        List[KGKnowledgeMetaData], List[float]:
            A list of the most relevant documents and their similarities.
        """
        search_results, similarities = super().search(query, topk_k=topk_k, similarity_threshold=similarity_threshold)

        # Re-wrap generic results as Kaggle-specific documents for the callers.
        kaggle_docs = [KGKnowledgeMetaData().from_dict(result.__dict__) for result in search_results]

        return kaggle_docs, similarities
232+
233+
234+
if __name__ == "__main__":
    # Smoke-test entry point: load a local experience dump, index it, and run
    # one example similarity query.
    kaggle_base = KaggleExperienceBase(
        kaggle_experience_path="git_ignore_folder/experience/tabular_cases/kaggle_experience_results.json"
    )
    kaggle_base.add_experience_to_vector_base()
    print(f"There are {kaggle_base.shape()[0]} records in the vector base.")

    search_results, similarities = kaggle_base.search_experience(query="image classification", topk_k=3)
    for result, similarity in zip(search_results, similarities):
        print(
            f"Competition name: {result.competition_name}, task_category: {result.task_category}, "
            f"score: {result.score}, similarity: {similarity}"
        )

0 commit comments

Comments
 (0)