Skip to content

Commit f3405ca

Browse files
authored
feat: (Kaggle) add base template for competition: tabular-playground-series-may-2022 (#481)
* add tpl kaggle * CI
1 parent 5c93fe7 commit f3405ca

File tree

8 files changed

+250
-0
lines changed

8 files changed

+250
-0
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import os
2+
3+
import pandas as pd
4+
from sklearn.model_selection import train_test_split
5+
from sklearn.preprocessing import MinMaxScaler
6+
7+
8+
def preprocess_script():
    """
    Load the competition data and return model-ready train/valid/test splits.

    If cached pickles exist under ``/kaggle/input`` they are loaded and
    returned unchanged. Otherwise ``train.csv`` and ``test.csv`` are read, the
    ``id`` and ``f_27`` columns are dropped, the training rows are split 80/20
    into train and validation parts, and every feature is min-max scaled.

    The scaler is fitted on the training part only and then applied to the
    validation and test parts, so no validation/test statistics leak into
    preprocessing.

    Returns:
        ``(X_train, X_valid, y_train, y_valid, X_test, ids)``. In the cached
        case the trailing element(s) are whatever ``others.pkl`` contains.
    """
    if os.path.exists("/kaggle/input/X_train.pkl"):
        # NOTE(review): unpickling executes arbitrary code; these files must
        # come from a trusted location.
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")

        return X_train, X_valid, y_train, y_valid, X_test, *others

    train_df = pd.read_csv("/kaggle/input/train.csv")
    test_df = pd.read_csv("/kaggle/input/test.csv")

    x = train_df.drop(columns=["target", "id", "f_27"])
    y = train_df["target"]

    # Split BEFORE scaling so the scaler never sees validation rows.
    x_train_raw, x_valid_raw, y_train, y_valid = train_test_split(x, y, test_size=0.20, random_state=101)

    scaler = MinMaxScaler()
    # Keep the original row index so features stay aligned with y_train / y_valid.
    X_train = pd.DataFrame(scaler.fit_transform(x_train_raw), index=x_train_raw.index)
    X_valid = pd.DataFrame(scaler.transform(x_valid_raw), index=x_valid_raw.index)

    # Load and preprocess the test data with the scaler fitted on the training part
    ids = test_df["id"]
    X_test = test_df.drop(["id", "f_27"], axis=1)
    X_test = pd.DataFrame(scaler.transform(X_test))

    return X_train, X_valid, y_train, y_valid, X_test, ids
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import pandas as pd
2+
3+
"""
4+
Here is the feature engineering code for each task, with a class that has a fit and transform method.
5+
Remember to expose the class through the module-level name `feature_engineering_cls`.
6+
"""
7+
8+
9+
class IdentityFeature:
    """Feature step that hands its input back untouched."""

    def fit(self, train_df: pd.DataFrame):
        """
        Nothing is learned from the training data; always returns None.
        """
        return None

    def transform(self, X: pd.DataFrame):
        """
        Return the very same object that was passed in (no copy is made).
        """
        unchanged = X
        return unchanged


# Name under which this feature class is exported from the module.
feature_engineering_cls = IdentityFeature
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""
2+
Motivation of the model:
3+
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
4+
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
5+
baseline model for many classification tasks.
6+
"""
7+
8+
import pandas as pd
9+
from sklearn.ensemble import RandomForestClassifier
10+
from sklearn.metrics import accuracy_score
11+
12+
13+
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Train a Random Forest classifier and report its validation accuracy.

    A 200-tree forest (random_state=32, all cores) is fitted on the training
    split; its accuracy on the validation split is printed, and the fitted
    estimator is returned.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
    forest.fit(X_train, y_train)

    # Score the fitted forest on the held-out split
    valid_accuracy = accuracy_score(y_valid, forest.predict(X_valid))
    print(f"Validation Accuracy: {valid_accuracy:.4f}")

    return forest
29+
30+
31+
def predict(model, X):
    """
    Run ``model.predict`` on ``X`` and return the result as a single column.

    The prediction array is reshaped to ``(-1, 1)``.
    """
    return model.predict(X).reshape(-1, 1)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""
2+
motivation of the model
3+
"""
4+
5+
import pandas as pd
6+
import xgboost as xgb
7+
8+
9+
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> xgb.Booster:
    """
    Train an XGBoost booster on the training split and return it.

    The validation split is passed to ``xgb.train`` as the evaluation set
    named "validation".
    """
    # Convert the data to DMatrix; the device is specified in the parameters below
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    valid_matrix = xgb.DMatrix(X_valid, label=y_valid)

    booster_params = dict(
        learning_rate=0.5,
        max_depth=10,
        device="cuda",
        tree_method="hist",
        objective="binary:logistic",
    )

    return xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=10,
        evals=[(valid_matrix, "validation")],
        verbose_eval=100,
    )
26+
27+
28+
def predict(model: xgb.Booster, X):
    """
    Predict with a trained booster and return the rounded outputs.

    ``X`` is wrapped in a DMatrix, each raw prediction is passed through the
    built-in ``round``, and the results are returned as a pandas Series.
    """
    raw_scores = model.predict(xgb.DMatrix(X))
    return pd.Series(list(map(round, raw_scores)))
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise each
    column tuple is rendered as its parts joined by "_", the labels are
    replaced in place on ``X``, and ``X`` itself is returned.
    """
    # No real selection happens yet; this is the hook for such logic.
    if X.columns.nlevels != 1:
        flat_labels = []
        for label_parts in X.columns.values:
            joined = "_".join(map(str, label_parts))
            flat_labels.append(joined.strip())
        X.columns = flat_labels
    return X
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Keep all features; collapse multi-level column labels to plain strings.

    When the columns have more than one level, each label tuple becomes its
    elements joined with "_" (assigned in place on ``X``). The same frame
    object is returned in every case.
    """
    # Every feature is treated as relevant for now; selection logic can go here.
    columns_are_flat = X.columns.nlevels == 1
    if not columns_are_flat:
        X.columns = ["_".join(map(str, parts)).strip() for parts in X.columns.values]
    return X
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``X`` with all features kept and its column labels made single-level.

    Single-level columns are left as they are. Multi-level labels are
    rewritten in place as "_"-joined strings before ``X`` is returned.
    """
    # Placeholder for real feature selection: currently nothing is dropped.
    level_count = X.columns.nlevels
    if level_count == 1:
        return X

    renamed = []
    for column_key in X.columns.values:
        pieces = [str(piece) for piece in column_key]
        renamed.append("_".join(pieces).strip())
    X.columns = renamed
    return X
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import importlib.util
2+
import random
3+
from pathlib import Path
4+
5+
import numpy as np
6+
import pandas as pd
7+
from fea_share_preprocess import preprocess_script
8+
from sklearn.metrics import roc_auc_score
9+
10+
# Seed both NumPy's and the standard library's generators with one fixed value
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Absolute, resolved directory that contains this script
DIRNAME = Path(__file__).absolute().resolve().parents[0]
15+
16+
17+
def import_module_from_path(module_name, module_path):
    """
    Load and execute a Python source file as a module.

    Args:
        module_name: Name given to the created module object.
        module_path: Location of the source file to load.

    Returns:
        The executed module object. It is not registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec or loader can be determined for
            ``module_path`` (for example, an unrecognised file suffix).
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None when it cannot pick a loader;
    # fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
22+
23+
24+
# End-to-end pipeline: preprocess -> feature engineering -> train every model
# -> pick the best model by validation AUROC -> write score and submission.

# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

# Path.glob yields files in arbitrary order; sort so the column order (and
# therefore the trained models) is the same on every run.
for f in sorted(DIRNAME.glob("feature/feat*.py")):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Only keep a feature block that has the same width on every split.
    if X_train_f.shape[-1] == X_valid_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)
        print(f"Feature [{f.stem}] has been added to the feature list")

if not X_train_l:
    # pd.concat on an empty list fails with an unhelpful message.
    raise RuntimeError("No usable feature module found under feature/feat*.py")

# Each feature block gets its own top-level column key (feature_0, feature_1, ...).
X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

# 3) Train the models
model_l = []  # list[tuple[model, predict_func, select_module]]
for f in sorted(DIRNAME.glob("model/model*.py")):
    # model/model_xxx.py is paired with model/select_xxx.py
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    # select() renames columns in place, so hand it a copy.
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
    print(f"Model [{f.stem}] has been trained")

if not model_l:
    # np.argmax on an empty list fails with an unhelpful message.
    raise RuntimeError("No model module found under model/model*.py")

# 4) Evaluate the model on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    # predict functions return either a column array or a Series; flatten to 1-D.
    y_valid_pred = np.asarray(predict_func(model, X_valid_selected)).ravel()
    auroc = roc_auc_score(y_valid, y_valid_pred)
    print(f"[{type(model).__name__}] AUROC on valid set: {auroc}")
    metrics_all.append(auroc)

# 5) Save the best validation AUROC
max_index = int(np.argmax(metrics_all))
pd.Series(data=[metrics_all[max_index]], index=["AUROC"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set with the best model
best_model, best_predict, best_select = model_l[max_index]
X_test_selected = best_select.select(X_test.copy())
# np.asarray(...).ravel() works for both ndarray and Series results
# (Series has no .flatten()). Predictions are written as-is: the target is
# binary 0/1, so no label offset is applied.
y_test_pred = np.asarray(best_predict(best_model, X_test_selected)).ravel()


# 7) Submit predictions for the test set
submission_result = pd.DataFrame(y_test_pred, columns=["target"])
submission_result.insert(0, "id", ids)

submission_result.to_csv("submission.csv", index=False)

0 commit comments

Comments
 (0)