5.
Implementation of multiple linear regression for house price prediction using sklearn
# Multiple linear regression for house-price prediction using statsmodels OLS.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Load the dataset (you can replace this with your own data).
# Replace 'USA_Housing.csv' with your dataset file path.
data = pd.read_csv("/content/sample_data/USA_Housing.csv")

# Data inspection.
print(data.head(5))       # display first 5 records
print(data.info())        # show dtype definitions for columns
print(data.describe())    # descriptive statistics
print(f"Total rows and columns: {data.shape}")

# Check for null values.
print(f"Null values:\n{data.isnull().sum()}")


def detect_outliers():
    """Draw box plots of selected features to eyeball outliers."""
    fig, axs = plt.subplots(2, 3, figsize=(10, 5))
    sns.boxplot(x=data['Feature1'], ax=axs[0, 0])
    sns.boxplot(x=data['Feature2'], ax=axs[0, 1])
    # Repeat for the remaining features...
    plt.show()


# Multiple linear regression.
X = data[['Feature1', 'Feature2']]  # independent variables
y = data['HousePrice']              # dependent variable

# Add a constant term for the intercept (sm.OLS has no implicit intercept).
X = sm.add_constant(X)

# Fit the model via ordinary least squares.
model = sm.OLS(y, X).fit()

# Model summary: coefficients, R-squared, p-values, etc.
model_summary = model.summary()
print(model_summary)
Output:
   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms \
0 79545.45857 5.682861 7.009188
1 79248.64245 6.002900 6.730821
2 61287.06718 5.865890 8.512727
3 63345.24005 7.188236 5.586729
4 59982.19723 5.040555 7.839388
   Avg. Area Number of Bedrooms  Area Population         Price \
0 4.09 23086.80050 1.059034e+06
1 3.09 40173.07217 1.505891e+06
2 5.13 36882.15940 1.058988e+06
3 3.26 34310.24283 1.260617e+06
4 4.23 26354.10947 6.309435e+05
                                             Address
0  208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
1  188 Johnson Views Suite 079\nLake Kathleen, CA...
2  9127 Elizabeth Stravenue\nDanieltown, WI 06482...
3                          USS Barnett\nFPO AP 44820
4                         USNS Raymond\nFPO AE 09386
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object
dtypes: float64(6), object(1)
memory usage: 273.6+ KB
None
       Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms \
count 5000.000000 5000.000000 5000.000000
mean 68583.108984 5.977222 6.987792
std 10657.991214 0.991456 1.005833
min 17796.631190 2.644304 3.236194
25% 61480.562390 5.322283 6.299250
50% 68804.286405 5.970429 7.002902
75% 75783.338665 6.650808 7.665871
max 107701.748400 9.519088 10.759588
       Avg. Area Number of Bedrooms  Area Population         Price
count 5000.000000 5000.000000 5.000000e+03
mean 3.981330 36163.516039 1.232073e+06
std 1.234137 9925.650114 3.531176e+05
min 2.000000 172.610686 1.593866e+04
25% 3.140000 29403.928700 9.975771e+05
50% 4.050000 36199.406690 1.232669e+06
75% 4.490000 42861.290770 1.471210e+06
max 6.500000 69621.713380 2.469066e+06
Total rows and columns: (5000, 7)
Null values:
Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
Address                         0
dtype: int64
6.
Implementation of decision tree using sklearn and its parameter tuning
# Decision tree classifier on the iris dataset, with hyperparameter tuning
# via GridSearchCV and visualization of the best tree.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load data and hold out 30% for testing.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99)

# Baseline tree with default hyperparameters.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy:{accuracy}')

# Hyperparameters to fine-tune.
param_grid = {
    'max_depth': range(1, 10, 1),
    'min_samples_leaf': range(1, 20, 2),
    'min_samples_split': range(2, 20, 2),
    'criterion': ["entropy", "gini"],
}

# Decision tree classifier (fresh estimator for the search).
tree = DecisionTreeClassifier(random_state=1)

# 5-fold cross-validated grid search over every parameter combination.
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid,
                           cv=5, verbose=True)
grid_search.fit(X_train, y_train)

# Best cross-validation score and the corresponding estimator.
print("best accuracy", grid_search.best_score_)
print(grid_search.best_estimator_)

# Plot the best tree, colored by majority class.
tree_clf = grid_search.best_estimator_
plt.figure(figsize=(18, 15))
plot_tree(tree_clf, filled=True, feature_names=iris.feature_names,
          class_names=iris.target_names)
plt.show()
Output:
Accuracy:0.9555555555555556
Fitting 5 folds for each of 1620 candidates, totalling 8100 fits
best accuracy 0.9714285714285715
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=3,
random_state=1)