fix: a bug in developer & edit s4e8 template (#338)

TPLin22 · WinstonLiyt · web-flow · commit f12ce726e7de · 2024-09-25T20:09:52.000+08:00
* s4e8 preprocess: remove one-hot encoding & fix a bug

* Update runner.py

* Update fea_share_preprocess.py

* Update runner.py

---------

Co-authored-by: WinstonLiyt <104308117+WinstonLiyt@users.noreply.github.com>
diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py
@@ -97,7 +97,8 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
         self.build_from_SOTA(exp)
 
         sub_ws = exp.sub_workspace_list[0]
-        model_type = sub_ws.target_task.model_type
+        # TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list. Hardcoded now.
+        model_type = sub_ws.target_task.model_type[0]
 
         if sub_ws.code_dict == {}:
             raise ModelEmptyError("No model is implemented.")
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py
@@ -5,7 +5,7 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
 
 
 def prepreprocess():
@@ -40,42 +40,30 @@ def preprocess_fit(X_train: pd.DataFrame):
     categorical_transformer = Pipeline(
         steps=[
             ("imputer", SimpleImputer(strategy="most_frequent")),
-            ("onehot", OneHotEncoder(handle_unknown="ignore")),
+            ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
         ]
     )
 
     numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
 
-    # Combine preprocessing steps
     preprocessor = ColumnTransformer(
         transformers=[
-            ("cat", categorical_transformer, categorical_cols),
             ("num", numerical_transformer, numerical_cols),
+            ("cat", categorical_transformer, categorical_cols),
         ]
     )
 
     # Fit the preprocessor on the training data
     preprocessor.fit(X_train)
 
-    return preprocessor
-
+    return preprocessor, numerical_cols, categorical_cols
 
-def preprocess_transform(X: pd.DataFrame, preprocessor):
-    """
-    Transforms the given DataFrame using the fitted preprocessor.
-    Ensures the processed data has consistent features across train, validation, and test sets.
-    """
-    # Transform the data using the fitted preprocessor
-    X_array = preprocessor.transform(X).toarray()
 
-    # Get feature names for the columns in the transformed data
-    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
-    feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(
-        categorical_cols
-    ).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]]
+def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
+    X_transformed = preprocessor.transform(X)
 
     # Convert arrays back to DataFrames
-    X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index)
+    X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)
 
     return X_transformed
 
@@ -96,16 +84,16 @@ def preprocess_script():
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
     # Fit the preprocessor on the training data
-    preprocessor = preprocess_fit(X_train)
+    preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)
 
     # Preprocess the train, validation, and test data
-    X_train = preprocess_transform(X_train, preprocessor)
-    X_valid = preprocess_transform(X_valid, preprocessor)
+    X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
+    X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)
 
     # Load and preprocess the test data
     submission_df = pd.read_csv("/kaggle/input/test.csv")
     passenger_ids = submission_df["id"]
     submission_df = submission_df.drop(["id"], axis=1)
-    X_test = preprocess_transform(submission_df, preprocessor)
+    X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)
 
     return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 from fea_share_preprocess import preprocess_script
-from sklearn.metrics import accuracy_score, matthews_corrcoef
+from sklearn.metrics import matthews_corrcoef
 from sklearn.preprocessing import LabelEncoder
 
 # Set random seed for reproducibility