
Commit b16b4be

feat: Add ranking in kaggle scenario (#401)
* fix some bugs in rag
* add ranking in kaggle scenario
* fix two mistouches
* fix two bugs
* fix a bug
1 parent 194215c commit b16b4be

7 files changed: +45 −15 lines

rdagent/scenarios/kaggle/developer/feedback.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -148,6 +148,16 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace
         new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided")
         reason = response_json.get("Reasoning", "No reasoning provided")
         decision = convert2bool(response_json.get("Replace Best Result", "no"))
+        leaderboard = self.scen.leaderboard
+        current_score = current_result.iloc[0]
+        sorted_scores = sorted(leaderboard, reverse=True)
+        import bisect
+
+        if self.scen.evaluation_metric_direction:
+            insert_position = bisect.bisect_right([-score for score in sorted_scores], -current_score)
+        else:
+            insert_position = bisect.bisect_left(sorted_scores, current_score, lo=0, hi=len(sorted_scores))
+        percentile_ranking = (insert_position) / (len(sorted_scores)) * 100
 
         experiment_feedback = {
             "current_competition": self.scen.get_competition_full_desc(),
@@ -158,6 +168,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace
             "observations": observations,
             "hypothesis_evaluation": hypothesis_evaluation,
             "reason": reason,
+            "percentile_ranking": percentile_ranking,
         }
 
         if self.scen.if_using_vector_rag:
```
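The new block ranks the current experiment's score against the competition leaderboard. Note that `bisect` requires an ascending sequence, which is why the higher-is-better branch searches a negated copy of the descending scores. A minimal standalone sketch of the same computation, using hypothetical scores (the `percentile_ranking` helper is illustrative, not part of the commit):

```python
import bisect


def percentile_ranking(leaderboard: list[float], current_score: float, higher_is_better: bool) -> float:
    """Percentile position on the leaderboard: 0 = best entry, 100 = worst."""
    if higher_is_better:
        # bisect expects ascending order, so negate the scores;
        # bisect_right then counts entries that beat or tie current_score.
        ascending = sorted(-s for s in leaderboard)
        pos = bisect.bisect_right(ascending, -current_score)
    else:
        # Lower is better: an ascending sort already puts the best scores first.
        ascending = sorted(leaderboard)
        pos = bisect.bisect_left(ascending, current_score)
    return pos / len(ascending) * 100


scores = [0.98, 0.95, 0.90, 0.85, 0.80]  # hypothetical higher-is-better scores
print(percentile_ranking(scores, 0.92, higher_is_better=True))  # 40.0 -> roughly top 40%
```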

rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_randomforest.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -15,7 +15,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series
     Define and train the Random Forest model. Merge feature selection into the pipeline.
     """
     # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
+    model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
 
     # Fit the model
     model.fit(X_train, y_train)
```

rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame
         "tree_method": "gpu_hist",
         "device": "cuda",
     }
-    num_round = 180
+    num_round = 200
 
     evallist = [(dtrain, "train"), (dvalid, "eval")]
     bst = xgb.train(params, dtrain, num_round, evallist)
```

rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -88,7 +88,7 @@ def import_module_from_path(module_name, module_path):
     metrics_all.append(metrics)
 
 # 5) Save the validation accuracy
-min_index = np.argmin(metrics_all)
+min_index = np.argmax(metrics_all)
 pd.Series(data=[metrics_all[min_index]], index=["MCC"]).to_csv("submission_score.csv")
 
 # 6) Make predictions on the test set and save them
```
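The template scores models by MCC, which is higher-is-better, so the best run is the one with the largest metric; the old `np.argmin` picked the worst. A tiny sketch with hypothetical values:

```python
import numpy as np

metrics_all = [0.41, 0.55, 0.48]   # hypothetical per-model validation MCC values
best = int(np.argmax(metrics_all))  # np.argmin would have selected index 0, the worst run
print(best, metrics_all[best])      # 1 0.55
```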

rdagent/scenarios/kaggle/experiment/scenario.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -13,7 +13,10 @@
 from rdagent.core.scenario import Scenario
 from rdagent.oai.llm_utils import APIBackend
 from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KGFactorExperiment
-from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions
+from rdagent.scenarios.kaggle.kaggle_crawler import (
+    crawl_descriptions,
+    leaderboard_scores,
+)
 from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
     KaggleExperienceBase,
 )
@@ -72,6 +75,8 @@ def __init__(self, competition: str) -> None:
         self.confidence_parameter = 1.0
         self.initial_performance = 0.0
 
+        self.leaderboard = leaderboard_scores(competition)
+
     def _analysis_competition_description(self):
         sys_prompt = (
             Environment(undefined=StrictUndefined)
```
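With this change the scenario fetches the leaderboard once at construction, so downstream components (such as the feedback generator above) can rank scores without extra API calls. A hedged usage sketch, assuming the scenario class defined in this file is `KGScenario` and that Kaggle API credentials are configured:

```python
from rdagent.scenarios.kaggle.experiment.scenario import KGScenario  # assumed class name

scen = KGScenario(competition="playground-series-s4e8")
print(len(scen.leaderboard))  # leaderboard scores cached on the scenario instance
```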

rdagent/scenarios/kaggle/kaggle_crawler.py

Lines changed: 23 additions & 11 deletions
```diff
@@ -87,6 +87,15 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kaggle"
             zip_ref.extractall(data_path)
 
 
+def leaderboard_scores(competition: str) -> list[float]:
+    from kaggle.api.kaggle_api_extended import KaggleApi
+
+    api = KaggleApi()
+    api.authenticate()
+    ll = api.competition_leaderboard_view(competition)
+    return [float(x.score) for x in ll]
+
+
 def download_notebooks(
     competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks", num: int = 15
 ) -> None:
@@ -254,15 +263,18 @@ def collect_knowledge_texts(local_path: str = "/data/userdata/share/kaggle") ->
         "facebook-v-predicting-check-ins",
     ]
 
-    all_cs = mini_case_cs + other_cs
-    for c in all_cs:
-        convert_notebooks_to_text(c)
-    exit()
-    from kaggle.api.kaggle_api_extended import KaggleApi
+    # all_cs = mini_case_cs + other_cs
+    # for c in all_cs:
+    #     convert_notebooks_to_text(c)
+    # exit()
+    # from kaggle.api.kaggle_api_extended import KaggleApi
 
-    api = KaggleApi()
-    api.authenticate()
-    cs = api.competitions_list()
-    for c in cs:
-        name = c.ref.split("/")[-1]
-        crawl_descriptions(name)
+    # api = KaggleApi()
+    # api.authenticate()
+    # cs = api.competitions_list()
+    # for c in cs:
+    #     name = c.ref.split("/")[-1]
+    #     crawl_descriptions(name)
+    res = leaderboard_scores(competition="playground-series-s4e8")
+
+# %%
```
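A usage sketch for the new helper, assuming the `kaggle` package is installed and credentials are present in `~/.kaggle/kaggle.json`:

```python
from rdagent.scenarios.kaggle.kaggle_crawler import leaderboard_scores

scores = leaderboard_scores("playground-series-s4e8")
print(len(scores), scores[:5])  # entry count and the top of the leaderboard
```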

rdagent/scenarios/kaggle/proposal/proposal.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -217,6 +217,7 @@ def execute_next_action(self, trace: Trace) -> str:
         # Select action with highest UCB
         selected_action = max(ucb_values, key=ucb_values.get)
         self.scen.action_counts[selected_action] += 1
+
         return selected_action
 
     def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
@@ -336,6 +337,7 @@ def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperiment
             raise ModelEmptyError(
                 f"Invalid model type '{model_type}'. Allowed model types are: {', '.join(KG_SELECT_MAPPING)}."
             )
+
         tasks.append(
             ModelTask(
                 name=response_dict.get("model_name", "Model name not provided"),
```
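For context, the first hunk sits inside UCB-based action selection (`selected_action = max(ucb_values, key=ucb_values.get)`). A minimal sketch of that selection pattern, with hypothetical action names, counts, and rewards; the exact bonus term the repo uses is not shown in this diff, so classic UCB1 is assumed here:

```python
import math

# Hypothetical bookkeeping mirroring scen.action_counts in the diff.
action_counts = {"feature engineering": 3, "model tuning": 1, "model selection": 2}
action_rewards = {"feature engineering": 1.2, "model tuning": 0.9, "model selection": 1.0}
total = sum(action_counts.values())

# Classic UCB1: average reward plus an exploration bonus for rarely tried actions.
ucb_values = {
    a: action_rewards[a] / action_counts[a] + math.sqrt(2 * math.log(total) / action_counts[a])
    for a in action_counts
}
selected_action = max(ucb_values, key=ucb_values.get)
action_counts[selected_action] += 1  # same bookkeeping as in the diff
print(selected_action)
```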
