55from sklearn .impute import SimpleImputer
66from sklearn .model_selection import train_test_split
77from sklearn .pipeline import Pipeline
8- from sklearn .preprocessing import LabelEncoder , OneHotEncoder
8+ from sklearn .preprocessing import LabelEncoder , OrdinalEncoder
99
1010
1111def prepreprocess ():
@@ -40,42 +40,30 @@ def preprocess_fit(X_train: pd.DataFrame):
4040 categorical_transformer = Pipeline (
4141 steps = [
4242 ("imputer" , SimpleImputer (strategy = "most_frequent" )),
43- ("onehot " , OneHotEncoder (handle_unknown = "ignore" )),
43+ ("ordinal " , OrdinalEncoder (handle_unknown = "use_encoded_value" , unknown_value = - 1 )),
4444 ]
4545 )
4646
4747 numerical_transformer = Pipeline (steps = [("imputer" , SimpleImputer (strategy = "mean" ))])
4848
49- # Combine preprocessing steps
5049 preprocessor = ColumnTransformer (
5150 transformers = [
52- ("cat" , categorical_transformer , categorical_cols ),
5351 ("num" , numerical_transformer , numerical_cols ),
52+ ("cat" , categorical_transformer , categorical_cols ),
5453 ]
5554 )
5655
5756 # Fit the preprocessor on the training data
5857 preprocessor .fit (X_train )
5958
60- return preprocessor
61-
59+ return preprocessor , numerical_cols , categorical_cols
6260
63- def preprocess_transform (X : pd .DataFrame , preprocessor ):
64- """
65- Transforms the given DataFrame using the fitted preprocessor.
66- Ensures the processed data has consistent features across train, validation, and test sets.
67- """
68- # Transform the data using the fitted preprocessor
69- X_array = preprocessor .transform (X ).toarray ()
7061
71- # Get feature names for the columns in the transformed data
72- categorical_cols = [cname for cname in X .columns if X [cname ].dtype == "object" ]
73- feature_names = preprocessor .named_transformers_ ["cat" ]["onehot" ].get_feature_names_out (
74- categorical_cols
75- ).tolist () + [cname for cname in X .columns if X [cname ].dtype in ["int64" , "float64" ]]
62+ def preprocess_transform (X : pd .DataFrame , preprocessor , numerical_cols , categorical_cols ):
63+ X_transformed = preprocessor .transform (X )
7664
7765 # Convert arrays back to DataFrames
78- X_transformed = pd .DataFrame (X_array , columns = feature_names , index = X .index )
66+ X_transformed = pd .DataFrame (X_transformed , columns = numerical_cols + categorical_cols , index = X .index )
7967
8068 return X_transformed
8169
@@ -96,16 +84,16 @@ def preprocess_script():
9684 X_train , X_valid , y_train , y_valid = prepreprocess ()
9785
9886 # Fit the preprocessor on the training data
99- preprocessor = preprocess_fit (X_train )
87+ preprocessor , numerical_cols , categorical_cols = preprocess_fit (X_train )
10088
10189 # Preprocess the train, validation, and test data
102- X_train = preprocess_transform (X_train , preprocessor )
103- X_valid = preprocess_transform (X_valid , preprocessor )
90+ X_train = preprocess_transform (X_train , preprocessor , numerical_cols , categorical_cols )
91+ X_valid = preprocess_transform (X_valid , preprocessor , numerical_cols , categorical_cols )
10492
10593 # Load and preprocess the test data
10694 submission_df = pd .read_csv ("/kaggle/input/test.csv" )
10795 passenger_ids = submission_df ["id" ]
10896 submission_df = submission_df .drop (["id" ], axis = 1 )
109- X_test = preprocess_transform (submission_df , preprocessor )
97+ X_test = preprocess_transform (submission_df , preprocessor , numerical_cols , categorical_cols )
11098
11199 return X_train , X_valid , y_train , y_valid , X_test , passenger_ids
0 commit comments