Skip to content

Commit f12ce72

Browse files
TPLin22 and WinstonLiyt authored
fix: a bug of developer& edit s4e8 template (#338)
* s4e8 preprocess remove onehot & fix a bug
* Update runner.py
* Update fea_share_preprocess.py
* Update runner.py

---------

Co-authored-by: WinstonLiyt <[email protected]>
1 parent c86afad commit f12ce72

File tree

3 files changed

+14
-25
lines changed

3 files changed

+14
-25
lines changed

rdagent/scenarios/kaggle/developer/runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
9797
self.build_from_SOTA(exp)
9898

9999
sub_ws = exp.sub_workspace_list[0]
100-
model_type = sub_ws.target_task.model_type
100+
# TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list. Hardcoded now.
101+
model_type = sub_ws.target_task.model_type[0]
101102

102103
if sub_ws.code_dict == {}:
103104
raise ModelEmptyError("No model is implemented.")

rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from sklearn.impute import SimpleImputer
66
from sklearn.model_selection import train_test_split
77
from sklearn.pipeline import Pipeline
8-
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
8+
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
99

1010

1111
def prepreprocess():
@@ -40,42 +40,30 @@ def preprocess_fit(X_train: pd.DataFrame):
4040
categorical_transformer = Pipeline(
4141
steps=[
4242
("imputer", SimpleImputer(strategy="most_frequent")),
43-
("onehot", OneHotEncoder(handle_unknown="ignore")),
43+
("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
4444
]
4545
)
4646

4747
numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
4848

49-
# Combine preprocessing steps
5049
preprocessor = ColumnTransformer(
5150
transformers=[
52-
("cat", categorical_transformer, categorical_cols),
5351
("num", numerical_transformer, numerical_cols),
52+
("cat", categorical_transformer, categorical_cols),
5453
]
5554
)
5655

5756
# Fit the preprocessor on the training data
5857
preprocessor.fit(X_train)
5958

60-
return preprocessor
61-
59+
return preprocessor, numerical_cols, categorical_cols
6260

63-
def preprocess_transform(X: pd.DataFrame, preprocessor):
64-
"""
65-
Transforms the given DataFrame using the fitted preprocessor.
66-
Ensures the processed data has consistent features across train, validation, and test sets.
67-
"""
68-
# Transform the data using the fitted preprocessor
69-
X_array = preprocessor.transform(X).toarray()
7061

71-
# Get feature names for the columns in the transformed data
72-
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
73-
feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(
74-
categorical_cols
75-
).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]]
62+
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
63+
X_transformed = preprocessor.transform(X)
7664

7765
# Convert arrays back to DataFrames
78-
X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index)
66+
X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)
7967

8068
return X_transformed
8169

@@ -96,16 +84,16 @@ def preprocess_script():
9684
X_train, X_valid, y_train, y_valid = prepreprocess()
9785

9886
# Fit the preprocessor on the training data
99-
preprocessor = preprocess_fit(X_train)
87+
preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)
10088

10189
# Preprocess the train, validation, and test data
102-
X_train = preprocess_transform(X_train, preprocessor)
103-
X_valid = preprocess_transform(X_valid, preprocessor)
90+
X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
91+
X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)
10492

10593
# Load and preprocess the test data
10694
submission_df = pd.read_csv("/kaggle/input/test.csv")
10795
passenger_ids = submission_df["id"]
10896
submission_df = submission_df.drop(["id"], axis=1)
109-
X_test = preprocess_transform(submission_df, preprocessor)
97+
X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)
11098

11199
return X_train, X_valid, y_train, y_valid, X_test, passenger_ids

rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import numpy as np
66
import pandas as pd
77
from fea_share_preprocess import preprocess_script
8-
from sklearn.metrics import accuracy_score, matthews_corrcoef
8+
from sklearn.metrics import matthews_corrcoef
99
from sklearn.preprocessing import LabelEncoder
1010

1111
# Set random seed for reproducibility

0 commit comments

Comments
 (0)