Skip to content

Commit be0e48a

Browse files
authored
feat: Integrate RAG into the Kaggle scenarios. (#262)
* init a scenario for kaggle feature engineering * add 1st version rag for Kaggle hypo * refine the code * fix a bug * add the process of extracting exp from docs * Remove the unnecessary file. * refine the code for ci test * Delete rdagent/app/kaggle_feature/conf.py * refine the comments * Update extract_experience_from_docs.py * Update extract_experience_from_docs.py
1 parent 4523b93 commit be0e48a

File tree

3 files changed

+324
-0
lines changed

3 files changed

+324
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import json
2+
import os
3+
from pathlib import Path
4+
5+
from jinja2 import Environment, StrictUndefined
6+
7+
from rdagent.core.prompts import Prompts
8+
from rdagent.oai.llm_utils import APIBackend
9+
10+
# Prompt templates for Kaggle knowledge extraction, loaded from the sibling prompts.yaml.
prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
11+
12+
13+
def process_with_gpt(content: str) -> dict:
    """Summarize one Kaggle experience document with the LLM.

    Renders the system/user templates from prompts.yaml, sends them to the
    chat backend in JSON mode, and parses the reply.

    Parameters
    ----------
    content : str
        Raw text of a single Kaggle case file.

    Returns
    -------
    dict
        The parsed JSON analysis, or ``{"error": ...}`` when the model's
        reply is not valid JSON.
    """
    # A single Environment serves both templates; StrictUndefined makes a
    # missing template variable raise instead of silently rendering empty.
    env = Environment(undefined=StrictUndefined)
    sys_prompt = env.from_string(prompt_dict["extract_kaggle_knowledge_prompts"]["system"]).render()
    user_prompt = env.from_string(prompt_dict["extract_kaggle_knowledge_prompts"]["user"]).render(
        file_content=content
    )

    response_analysis = APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=True,
    )

    try:
        response_json_analysis = json.loads(response_analysis)
    except json.JSONDecodeError:
        # Keep the batch pipeline going even when one reply is malformed.
        response_json_analysis = {"error": "Failed to parse LLM's response as JSON"}

    return response_json_analysis
38+
39+
40+
def process_all_case_files(directory_path: str) -> None:
    """Run process_with_gpt over every ``.case`` file in *directory_path*.

    The collected analyses are written to ``kaggle_experience_results.json``
    inside the same directory (overwriting any previous results).

    Parameters
    ----------
    directory_path : str
        Directory containing the ``.case`` files to process.
    """
    directory = Path(directory_path)
    output_file = directory / "kaggle_experience_results.json"
    json_output = []
    # pathlib.glob replaces the former os.listdir/os.path.join mix (Path is
    # already imported); sorted() makes the output order deterministic
    # across filesystems.
    for case_file in sorted(directory.glob("*.case")):
        content = case_file.read_text(encoding="utf-8")
        json_output.append(process_with_gpt(content))

    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(json_output, json_file, ensure_ascii=False)
54+
55+
56+
if __name__ == "__main__":
    # Script entry point: extract experience records from the local case dump.
    case_dir = "git_ignore_folder/experience/tabular_cases_all"
    process_all_case_files(directory_path=case_dir)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Prompt templates used by extract_experience_from_docs.py to summarize
# high-ranking Kaggle notebooks/strategies into structured JSON records.
extract_kaggle_knowledge_prompts:
  system: |-
    You are a Kaggle competition expert with extensive experience in analyzing high-ranking Kaggle notebooks and competition strategies.
    Your task is to summarize or infer key information such as the competition name, task type, and specific techniques employed in the notebook or strategy.
    For each provided content, you are expected to extract valuable insights and organize the analysis in the structured format outlined below.

    Please provide the analysis in the following JSON format:
    {
        "content": "all provided content",
        "title": "extracted title, if available",
        "competition_name": "extracted competition name",
        "task_category": "extracted task type, e.g., Classification, Regression",
        "field": "field of focus, e.g., Feature Engineering, Modeling",
        "ranking": "extracted ranking, if available",
        "score": "extracted score or metric, if available"
    }

  # file_content is filled with the raw text of one notebook/strategy case.
  user: |-
    High-ranking Kaggle notebooks or competition strategies: {{ file_content }}
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
import json  # BUGFIX: was `from _pytest.cacheprovider import json`, an accidental IDE auto-import that breaks when pytest is not installed
import uuid
from pathlib import Path
from typing import List, Tuple, Union

import pandas as pd
from scipy.spatial.distance import cosine

from rdagent.components.knowledge_management.vector_base import (
    KnowledgeMetaData,
    PDVectorBase,
)
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
15+
16+
17+
class KGKnowledgeMetaData(KnowledgeMetaData):
    """
    Class for handling Kaggle competition specific metadata
    """

    def __init__(
        self,
        content: str = "",
        label: str = None,
        embedding=None,
        identity=None,
        competition_name=None,
        task_category=None,
        field=None,
        ranking=None,
        score=None,
        entities=None,
        relations=None,
    ):
        """
        Initialize KGKnowledgeMetaData for Kaggle competition posts

        Parameters:
        ----------
        competition_name: str, optional
            The name of the Kaggle competition.
        task_category: str, required
            The type of task (e.g., classification, regression).
        field: str, optional
            The specific field of knowledge (e.g., feature engineering, modeling).
        ranking: str or int, optional
            The ranking achieved in the competition.
        score: float, optional
            The score or metric achieved in the competition.
        entities: list, optional
            Entities related to the content (for knowledge graph integration).
        relations: list, optional
            Relations between entities (for knowledge graph integration).
        """
        super().__init__(content, label, embedding, identity)
        self.competition_name = competition_name
        self.task_category = task_category  # Task type is required
        self.field = field  # Knowledge field, optional (model/data/others/overall)
        self.ranking = ranking  # Ranking
        # TODO ranking and score might be unified
        self.score = score  # Competition score
        # TODO Perhaps this shouldn't be here?
        self.entities = entities or []  # Entities in the knowledge graph
        self.relations = relations or []  # Relations in the knowledge graph

    def split_into_trunk(self, size: int = 1000, overlap: int = 0):
        """
        Split content into trunks and create embeddings by trunk.

        BUGFIX: ``overlap`` used to be accepted but silently ignored;
        consecutive trunks now share ``overlap`` characters. The default
        of 0 reproduces the previous non-overlapping behaviour.

        Parameters:
        ----------
        size: int, optional
            Maximum number of characters per trunk (default 1000).
        overlap: int, optional
            Number of characters shared between consecutive trunks (default 0).

        #TODO let GPT do the split based on the field of knowledge(data/model/others)
        """
        # max(..., 1) guards against overlap >= size, which would otherwise
        # make the window step non-positive and loop forever.
        step = max(size - overlap, 1)
        self.trunks = [self.content[i : i + size] for i in range(0, len(self.content), step)]
        self.trunks_embedding = APIBackend().create_embedding(input_content=self.trunks)

    def from_dict(self, data: dict):
        """
        Load Kaggle post data from a dictionary; returns self for chaining.
        """
        super().from_dict(data)
        self.competition_name = data.get("competition_name", None)
        self.task_category = data.get("task_category", None)
        self.field = data.get("field", None)
        self.ranking = data.get("ranking", None)
        self.score = data.get("score", None)
        self.entities = data.get("entities", [])
        self.relations = data.get("relations", [])
        return self

    def __repr__(self):
        return (
            f"KGKnowledgeMetaData(id={self.id}, label={self.label}, competition={self.competition_name}, "
            f"task_category={self.task_category}, field={self.field}, ranking={self.ranking}, score={self.score})"
        )
102+
103+
104+
# Backward-compatible alias: call sites refer to one experience record as a "document".
KGDocument = KGKnowledgeMetaData
105+
106+
107+
class KaggleExperienceBase(PDVectorBase):
    """
    Class for handling Kaggle competition experience posts and organizing them for reference
    """

    def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
        """
        Initialize the KaggleExperienceBase class

        Parameters:
        ----------
        vector_df_path: str or Path, optional
            Path to the vector DataFrame for embedding management.
        kaggle_experience_path: str or Path, optional
            Path to the Kaggle experience post data; when given, it is loaded eagerly.
        """
        super().__init__(vector_df_path)
        self.kaggle_experience_path = kaggle_experience_path
        self.kaggle_experience_data = []

        if kaggle_experience_path:
            self.load_kaggle_experience(kaggle_experience_path)

    @staticmethod
    def _to_record(document: KGDocument, content, embedding) -> dict:
        """Build one vector-base row for *document* carrying the given text and embedding."""
        return {
            "id": document.id,
            "label": document.label,
            "content": content,
            "competition_name": document.competition_name,
            "task_category": document.task_category,
            "field": document.field,
            "ranking": document.ranking,
            "score": document.score,
            "embedding": embedding,
        }

    def add(self, document: Union[KGDocument, List[KGDocument]]):
        """
        Add one document (or a list of documents) to the vector base.

        BUGFIX: the signature always advertised List[KGDocument], but passing a
        list crashed on document.split_into_trunk(); lists are now handled.
        BUGFIX: trunk rows used to store the full document content next to the
        trunk embedding; each row now stores its own trunk's text.
        """
        if isinstance(document, list):
            for doc in document:
                self.add(doc)
            return

        document.split_into_trunk()
        # First row: the whole document with its document-level embedding.
        docs = [self._to_record(document, document.content, document.embedding)]
        if len(document.trunks) > 1:
            docs.extend(
                self._to_record(document, trunk, trunk_embedding)
                for trunk, trunk_embedding in zip(document.trunks, document.trunks_embedding)
            )
        self.vector_df = pd.concat([self.vector_df, pd.DataFrame(docs)], ignore_index=True)

    def load_kaggle_experience(self, kaggle_experience_path: Union[str, Path]):
        """
        Load Kaggle experience posts from a JSON or text file

        Parameters:
        ----------
        kaggle_experience_path: str or Path
            Path to the Kaggle experience post data.
        """
        try:
            with open(kaggle_experience_path, "r", encoding="utf-8") as file:
                self.kaggle_experience_data = json.load(file)
            logger.info(f"Kaggle experience data loaded from {kaggle_experience_path}")
        except FileNotFoundError:
            # Best effort: missing data leaves the base empty rather than crashing.
            logger.error(f"Kaggle experience data not found at {kaggle_experience_path}")
            self.kaggle_experience_data = []

    def add_experience_to_vector_base(self):
        """
        Process the Kaggle experience data and add relevant information to the vector base
        """
        for experience in self.kaggle_experience_data:
            document = KGKnowledgeMetaData(
                content=experience.get("content", ""),
                label=experience.get("title", "Kaggle Experience"),
                competition_name=experience.get("competition_name", "Unknown Competition"),
                task_category=experience.get("task_category", "General Task"),
                field=experience.get("field", None),
                ranking=experience.get("ranking", None),
                score=experience.get("score", None),
            )
            document.create_embedding()
            self.add(document)

    def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: float = 0.1):
        """
        Search for Kaggle experience posts related to the query

        Parameters:
        ----------
        query: str
            The search query to find relevant experience posts.
        topk_k: int, optional
            Number of top similar results to return (default is 5).
        similarity_threshold: float, optional
            The similarity threshold for filtering results (default is 0.1).

        Returns:
        -------
        List[KGKnowledgeMetaData], List[float]:
            A list of the most relevant documents and their similarities.
        """
        search_results, similarities = super().search(query, topk_k=topk_k, similarity_threshold=similarity_threshold)

        # Re-wrap generic results as Kaggle-specific documents for the callers.
        kaggle_docs = [KGKnowledgeMetaData().from_dict(result.__dict__) for result in search_results]

        return kaggle_docs, similarities
232+
233+
234+
if __name__ == "__main__":
    # Smoke-test entry point: load a local experience dump, index it, and run
    # one example similarity query.
    kaggle_base = KaggleExperienceBase(
        kaggle_experience_path="git_ignore_folder/experience/tabular_cases/kaggle_experience_results.json"
    )
    kaggle_base.add_experience_to_vector_base()
    print(f"There are {kaggle_base.shape()[0]} records in the vector base.")

    search_results, similarities = kaggle_base.search_experience(query="image classification", topk_k=3)
    for result, similarity in zip(search_results, similarities):
        print(
            f"Competition name: {result.competition_name}, task_category: {result.task_category}, "
            f"score: {result.score}, similarity: {similarity}"
        )

0 commit comments

Comments
 (0)