
Commit b16b4be

feat: Add ranking in kaggle scenario (#401)
* fix some bugs in rag
* add ranking in kaggle scenario
* fix two mistouches
* fix two bugs
* fix a bug
1 parent 194215c commit b16b4be

7 files changed: +45 −15 lines

rdagent/scenarios/kaggle/developer/feedback.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -148,6 +148,16 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace
         new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided")
         reason = response_json.get("Reasoning", "No reasoning provided")
         decision = convert2bool(response_json.get("Replace Best Result", "no"))
+        leaderboard = self.scen.leaderboard
+        current_score = current_result.iloc[0]
+        sorted_scores = sorted(leaderboard, reverse=True)
+        import bisect
+
+        if self.scen.evaluation_metric_direction:
+            insert_position = bisect.bisect_right([-score for score in sorted_scores], -current_score)
+        else:
+            insert_position = bisect.bisect_left(sorted_scores, current_score, lo=0, hi=len(sorted_scores))
+        percentile_ranking = (insert_position) / (len(sorted_scores)) * 100
 
         experiment_feedback = {
             "current_competition": self.scen.get_competition_full_desc(),
@@ -158,6 +168,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace
             "observations": observations,
             "hypothesis_evaluation": hypothesis_evaluation,
             "reason": reason,
+            "percentile_ranking": percentile_ranking,
         }
 
         if self.scen.if_using_vector_rag:
```
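The new block ranks the current experiment's score against the competition leaderboard. Note that `bisect` requires an ascending sequence, which is why the higher-is-better branch searches a negated copy of the descending scores. A minimal standalone sketch of the same computation, using hypothetical scores (the `percentile_ranking` helper is illustrative, not part of the commit):

```python
import bisect


def percentile_ranking(leaderboard: list[float], current_score: float, higher_is_better: bool) -> float:
    """Percentile position on the leaderboard: 0 = best entry, 100 = worst."""
    if higher_is_better:
        # bisect expects ascending order, so negate the scores;
        # bisect_right then counts entries that beat or tie current_score.
        ascending = sorted(-s for s in leaderboard)
        pos = bisect.bisect_right(ascending, -current_score)
    else:
        # Lower is better: an ascending sort already puts the best scores first.
        ascending = sorted(leaderboard)
        pos = bisect.bisect_left(ascending, current_score)
    return pos / len(ascending) * 100


scores = [0.98, 0.95, 0.90, 0.85, 0.80]  # hypothetical higher-is-better scores
print(percentile_ranking(scores, 0.92, higher_is_better=True))  # 40.0 -> roughly top 40%
```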

rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_randomforest.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -15,7 +15,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series
     Define and train the Random Forest model. Merge feature selection into the pipeline.
     """
     # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
+    model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
 
     # Fit the model
     model.fit(X_train, y_train)
```

rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame
         "tree_method": "gpu_hist",
         "device": "cuda",
     }
-    num_round = 180
+    num_round = 200
 
     evallist = [(dtrain, "train"), (dvalid, "eval")]
     bst = xgb.train(params, dtrain, num_round, evallist)
```

rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -88,7 +88,7 @@ def import_module_from_path(module_name, module_path):
     metrics_all.append(metrics)
 
 # 5) Save the validation accuracy
-min_index = np.argmin(metrics_all)
+min_index = np.argmax(metrics_all)
 pd.Series(data=[metrics_all[min_index]], index=["MCC"]).to_csv("submission_score.csv")
 
 # 6) Make predictions on the test set and save them
```
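The template scores models by MCC, which is higher-is-better, so the best run is the one with the largest metric; the old `np.argmin` picked the worst. A tiny sketch with hypothetical values:

```python
import numpy as np

metrics_all = [0.41, 0.55, 0.48]   # hypothetical per-model validation MCC values
best = int(np.argmax(metrics_all))  # np.argmin would have selected index 0, the worst run
print(best, metrics_all[best])      # 1 0.55
```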

rdagent/scenarios/kaggle/experiment/scenario.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -13,7 +13,10 @@
 from rdagent.core.scenario import Scenario
 from rdagent.oai.llm_utils import APIBackend
 from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KGFactorExperiment
-from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions
+from rdagent.scenarios.kaggle.kaggle_crawler import (
+    crawl_descriptions,
+    leaderboard_scores,
+)
 from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
     KaggleExperienceBase,
 )
@@ -72,6 +75,8 @@ def __init__(self, competition: str) -> None:
         self.confidence_parameter = 1.0
         self.initial_performance = 0.0
 
+        self.leaderboard = leaderboard_scores(competition)
+
     def _analysis_competition_description(self):
         sys_prompt = (
             Environment(undefined=StrictUndefined)
```
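With this change the scenario fetches the leaderboard once at construction, so downstream components (such as the feedback generator above) can rank scores without extra API calls. A hedged usage sketch, assuming the scenario class defined in this file is `KGScenario` and that Kaggle API credentials are configured:

```python
from rdagent.scenarios.kaggle.experiment.scenario import KGScenario  # assumed class name

scen = KGScenario(competition="playground-series-s4e8")
print(len(scen.leaderboard))  # leaderboard scores cached on the scenario instance
```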

rdagent/scenarios/kaggle/kaggle_crawler.py

Lines changed: 23 additions & 11 deletions
```diff
@@ -87,6 +87,15 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kaggle"
             zip_ref.extractall(data_path)
 
 
+def leaderboard_scores(competition: str) -> list[float]:
+    from kaggle.api.kaggle_api_extended import KaggleApi
+
+    api = KaggleApi()
+    api.authenticate()
+    ll = api.competition_leaderboard_view(competition)
+    return [float(x.score) for x in ll]
+
+
 def download_notebooks(
     competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks", num: int = 15
 ) -> None:
@@ -254,15 +263,18 @@ def collect_knowledge_texts(local_path: str = "/data/userdata/share/kaggle") ->
         "facebook-v-predicting-check-ins",
     ]
 
-    all_cs = mini_case_cs + other_cs
-    for c in all_cs:
-        convert_notebooks_to_text(c)
-    exit()
-    from kaggle.api.kaggle_api_extended import KaggleApi
+    # all_cs = mini_case_cs + other_cs
+    # for c in all_cs:
+    #     convert_notebooks_to_text(c)
+    # exit()
+    # from kaggle.api.kaggle_api_extended import KaggleApi
 
-    api = KaggleApi()
-    api.authenticate()
-    cs = api.competitions_list()
-    for c in cs:
-        name = c.ref.split("/")[-1]
-        crawl_descriptions(name)
+    # api = KaggleApi()
+    # api.authenticate()
+    # cs = api.competitions_list()
+    # for c in cs:
+    #     name = c.ref.split("/")[-1]
+    #     crawl_descriptions(name)
+    res = leaderboard_scores(competition="playground-series-s4e8")
+
+# %%
```
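A usage sketch for the new helper, assuming the `kaggle` package is installed and credentials are present in `~/.kaggle/kaggle.json`:

```python
from rdagent.scenarios.kaggle.kaggle_crawler import leaderboard_scores

scores = leaderboard_scores("playground-series-s4e8")
print(len(scores), scores[:5])  # entry count and the top of the leaderboard
```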

rdagent/scenarios/kaggle/proposal/proposal.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -217,6 +217,7 @@ def execute_next_action(self, trace: Trace) -> str:
         # Select action with highest UCB
         selected_action = max(ucb_values, key=ucb_values.get)
         self.scen.action_counts[selected_action] += 1
+
         return selected_action
 
     def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
@@ -336,6 +337,7 @@ def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperiment
             raise ModelEmptyError(
                 f"Invalid model type '{model_type}'. Allowed model types are: {', '.join(KG_SELECT_MAPPING)}."
             )
+
         tasks.append(
             ModelTask(
                 name=response_dict.get("model_name", "Model name not provided"),
```
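For context, the first hunk sits inside UCB-based action selection (`selected_action = max(ucb_values, key=ucb_values.get)`). A minimal sketch of that selection pattern, with hypothetical action names, counts, and rewards; the exact bonus term the repo uses is not shown in this diff, so classic UCB1 is assumed here:

```python
import math

# Hypothetical bookkeeping mirroring scen.action_counts in the diff.
action_counts = {"feature engineering": 3, "model tuning": 1, "model selection": 2}
action_rewards = {"feature engineering": 1.2, "model tuning": 0.9, "model selection": 1.0}
total = sum(action_counts.values())

# Classic UCB1: average reward plus an exploration bonus for rarely tried actions.
ucb_values = {
    a: action_rewards[a] / action_counts[a] + math.sqrt(2 * math.log(total) / action_counts[a])
    for a in action_counts
}
selected_action = max(ucb_values, key=ucb_values.get)
action_counts[selected_action] += 1  # same bookkeeping as in the diff
print(selected_action)
```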
