import pandas as pd
import numpy as np

# Load the gemstone (diamond price) dataset.
# NOTE: forward slashes are portable (pandas accepts them on Windows too);
# the original r"..\\notebooks\\..." raw string embedded doubled backslashes.
df = pd.read_csv("../notebooks/data/gemstone.csv")

# 'id' is a pure row identifier with no predictive value — drop it in place.
df.drop(labels=['id'], axis=1, inplace=True)

# Notebook-style preview of the first five rows.
df.head()
carat cut color clarity depth table x y z
price
0 1.52 Premium F VS2 62.2 58.0 7.27 7.33 4.55
13619
1 2.03 Very Good J SI2 62.0 58.0 8.06 8.12 5.05
13387
2 0.70 Ideal G VS1 61.2 57.0 5.69 5.73 3.50
2772
3 0.32 Ideal G VS1 61.6 56.0 4.38 4.41 2.71
666
4 1.70 Premium G VS2 62.6 59.0 7.65 7.61 4.77
14453
# Separate predictors from the target.
# y keeps 'price' as a one-column DataFrame (double brackets);
# X holds every remaining feature column.
y = df[['price']]
X = df.drop(columns=['price'])

# Preview the feature matrix.
X.head()
carat cut color clarity depth table x y z
0 1.52 Premium F VS2 62.2 58.0 7.27 7.33 4.55
1 2.03 Very Good J SI2 62.0 58.0 8.06 8.12 5.05
2 0.70 Ideal G VS1 61.2 57.0 5.69 5.73 3.50
3 0.32 Ideal G VS1 61.6 56.0 4.38 4.41 2.71
4 1.70 Premium G VS2 62.6 59.0 7.65 7.61 4.77
price
0 13619
1 13387
2 2772
3 666
4 14453
... ...
193568 1130
193569 2874
193570 3036
193571 681
193572 2258
[193573 rows x 1 columns]
# Columns holding string labels — these need encoding before modelling.
cat_features = X.select_dtypes(include=["object"]).columns
print(cat_features)
Index(['cut', 'color', 'clarity'], dtype='object')
# Everything that is not an object dtype — the numeric measurement columns.
num_features = X.select_dtypes(exclude=["object"]).columns
print(num_features)
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
# Explicit worst-to-best orderings for the three ordinal gem-quality variables.
# OrdinalEncoder maps each label to its position in these lists, so the encoded
# integers preserve the quality ranking.
cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']          # cut quality, worst -> best
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']                      # color grade, best (D) -> worst (J)
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']  # clarity, worst -> best
# Preprocessing building blocks.
# (The pasted original had the comment "handling feature scaling" hard-wrapped,
# leaving a bare token `scaling` on its own line — a NameError at runtime.)
from sklearn.impute import SimpleImputer          # handling missing values
from sklearn.preprocessing import StandardScaler  # handling feature scaling
from sklearn.preprocessing import OrdinalEncoder  # ordinal encoding

## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Numerical pipeline: mean imputation (SimpleImputer default) followed by
# zero-mean / unit-variance standardisation.
# (The pasted original was missing the closing ')' of this Pipeline call and
# had several comments hard-wrapped into bare tokens — both syntax errors.)
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),    # handling missing values
        ('scaler', StandardScaler()),    # handling scaling of values
    ]
)

# Categorical pipeline: impute with the most frequent label, then map each
# ordinal category to its rank using the explicit orderings defined above.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # handling missing values
        ('ordinalencoder', OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        )),  # categorical -> numerical conversion
    ]
)

# Route each column subset through its pipeline. Columns not listed are
# dropped (ColumnTransformer's default remainder='drop'), which matches the
# 9-column output shown below.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)
## Train/test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
# (The pasted original had `random_state` split across two lines — a syntax error.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=30
)

# Fit the preprocessor on the training data only (avoids test-set leakage)
# and transform it; the resulting array is displayed below.
preprocessor.fit_transform(X_train)
array([[-0.97543926, -0.84960654, -0.12153081, ..., 4. ,
5. , 5. ],
[ 0.2351953 , 1.83363716, -0.12153081, ..., 1. ,
1. , 2. ],
[ 0.49461699, 0.81585507, 0.39980029, ..., 3. ,
3. , 4. ],
...,
[ 0.45138004, 1.55606023, -0.6428619 , ..., 1. ,
3. , 2. ],
[ 0.66756478, -1.77486298, 1.44246248, ..., 4. ,
3. , 4. ],
[ 0.25681377, 0.81585507, -0.12153081, ..., 4. ,
3. , 2. ]])
# Re-apply the already-fitted preprocessor to the same data; the output below
# matches the fit_transform result above (sanity check, result discarded).
preprocessor.transform(X_train)
array([[-0.97543926, -0.84960654, -0.12153081, ..., 4. ,
5. , 5. ],
[ 0.2351953 , 1.83363716, -0.12153081, ..., 1. ,
1. , 2. ],
[ 0.49461699, 0.81585507, 0.39980029, ..., 3. ,
3. , 4. ],
...,
[ 0.45138004, 1.55606023, -0.6428619 , ..., 1. ,
3. , 2. ],
[ 0.66756478, -1.77486298, 1.44246248, ..., 4. ,
3. , 4. ],
[ 0.25681377, 0.81585507, -0.12153081, ..., 4. ,
3. , 2. ]])
# Inspect the generated output column names ('<transformer>__<column>' format).
preprocessor.get_feature_names_out()
array(['num_pipeline__carat', 'num_pipeline__depth',
'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
'cat_pipeline__clarity'], dtype=object)
# Wrap the transformed arrays back into DataFrames, labelling columns with the
# ColumnTransformer's generated names (e.g. 'num_pipeline__carat').
# Fit on the training split only; the test split is transform-only, so no
# test-set statistics leak into the scaler/imputer.
# (The pasted original had the `preprocessor` identifier split across lines —
# a syntax error.)
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)
# Notebook-style display of the preprocessed training frame (135501 rows x 9 columns).
X_train
num_pipeline__carat num_pipeline__depth num_pipeline__table
\
0 -0.975439 -0.849607 -0.121531
1 0.235195 1.833637 -0.121531
2 0.494617 0.815855 0.399800
3 -1.018676 0.260701 0.921131
4 -0.953821 -0.664555 -0.642862
... ... ... ...
135496 -1.040295 -0.016876 -0.642862
135497 0.991842 0.168176 -0.642862
135498 0.451380 1.556060 -0.642862
135499 0.667565 -1.774863 1.442462
135500 0.256814 0.815855 -0.121531
num_pipeline__x num_pipeline__y num_pipeline__z
cat_pipeline__cut \
0 -1.042757 -1.080970 -1.123150
4.0
1 0.318447 0.279859 0.485354
1.0
2 0.570855 0.606458 0.673737
3.0
3 -1.214034 -1.244270 -1.195605
3.0
4 -1.069801 -1.044681 -1.094168
4.0
... ... ... ...
...
135496 -1.268122 -1.244270 -1.239078
4.0
135497 1.048629 1.114501 1.079486
4.0
135498 0.516768 0.588314 0.702719
1.0
135499 0.868337 0.951202 0.688228
4.0
135500 0.381549 0.415942 0.470863
4.0
cat_pipeline__color cat_pipeline__clarity
0 5.0 5.0
1 1.0 2.0
2 3.0 4.0
3 3.0 3.0
4 6.0 5.0
... ... ...
135496 1.0 2.0
135497 3.0 1.0
135498 3.0 2.0
135499 3.0 4.0
135500 3.0 2.0
[135501 rows x 9 columns]
# Notebook-style display of the preprocessed test frame (58072 rows x 9 columns).
X_test
num_pipeline__carat num_pipeline__depth
num_pipeline__table \
0 -0.564688 -0.942132 -0.642862
1 -0.175556 1.000906 -0.121531
2 -1.061913 0.260701 -0.121531
3 0.970223 -0.201927 1.963794
4 -0.932202 -1.312235 0.399800
... ... ... ...
58067 1.013460 1.185958 -0.642862
58068 -0.997058 0.260701 -1.164193
58069 -0.197174 -3.347799 1.442462
58070 -0.824110 -0.201927 -0.121531
58071 2.613227 -0.757081 1.442462
num_pipeline__x num_pipeline__y num_pipeline__z
cat_pipeline__cut \
0 -0.429765 -0.464061 -0.500036
3.0
1 -0.042137 -0.028595 0.036132
2.0
2 -1.304180 -1.298703 -1.268060
4.0
3 1.048629 0.996563 0.978049
3.0
4 -1.006699 -0.990248 -1.065186
3.0
... ... ... ...
...
58067 1.003556 1.041924 1.151941
2.0
58068 -1.141917 -1.126331 -1.108659
4.0
58069 0.102096 0.071199 -0.224706
3.0
58070 -0.853450 -0.881382 -0.876803
4.0
58071 2.139394 2.039865 2.006912
3.0
cat_pipeline__color cat_pipeline__clarity
0 1.0 3.0
1 4.0 2.0
2 4.0 7.0
3 3.0 3.0
4 1.0 4.0
... ... ...
58067 4.0 3.0
58068 2.0 6.0
58069 6.0 3.0
58070 3.0 2.0
58071 6.0 3.0
[58072 rows x 9 columns]
So far we have only learned Linear Regression and Logistic Regression, and we are using them to
build an end-to-end project.
But after that, will you be covering the remaining ML algorithms and building end-to-end projects with them?