Diamond Price Analysis and Factors
Regression on Python

Uploaded by prakhar.22169

In [2]: import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import scipy.stats as sp
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [3]: df = pd.read_csv(r"C:\Users\prakh\OneDrive\Desktop\Financial Modeling\diamonds.csv", index_col = 0)


df

Out[3]:        carat        cut color clarity  depth  table  price     x     y     z
       1        0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
       2        0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
       3        0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
       4        0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
       5        0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
     ...         ...        ...   ...     ...    ...    ...    ...   ...   ...   ...
   53936        0.72      Ideal     D     SI1   60.8   57.0   2757  5.75  5.76  3.50
   53937        0.72       Good     D     SI1   63.1   55.0   2757  5.69  5.75  3.61
   53938        0.70  Very Good     D     SI1   62.8   60.0   2757  5.66  5.68  3.56
   53939        0.86    Premium     H     SI2   61.0   58.0   2757  6.15  6.12  3.74
   53940        0.75      Ideal     D     SI2   62.2   55.0   2757  5.83  5.87  3.64

53940 rows × 10 columns
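
Before encoding and modelling, a quick structural check of the freshly loaded frame can be useful; a minimal sketch (an addition, not part of the original notebook) using only pandas calls already imported above:

df.info()               # column dtypes and non-null counts, printed directly
print(df.isna().sum())  # missing values per column
print(df.describe())    # summary statistics for the numeric columns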

In [4]: le = LabelEncoder()

In [5]: df['Cut Code'] = le.fit_transform(df.cut)


df['Color Code'] = le.fit_transform(df.color)
df['Clarity Code'] = le.fit_transform(df.clarity)
df

Out[5]:  carat        cut color clarity  depth  table  price     x     y     z  Cut Code  Color Code  Clarity Code
     1    0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43         2           1             3
     2    0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31         3           1             2
     3    0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31         1           1             4
     4    0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63         3           5             5
     5    0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75         1           6             3
   ...     ...        ...   ...     ...    ...    ...    ...   ...   ...   ...       ...         ...           ...
 53936    0.72      Ideal     D     SI1   60.8   57.0   2757  5.75  5.76  3.50         2           0             2
 53937    0.72       Good     D     SI1   63.1   55.0   2757  5.69  5.75  3.61         1           0             2
 53938    0.70  Very Good     D     SI1   62.8   60.0   2757  5.66  5.68  3.56         4           0             2
 53939    0.86    Premium     H     SI2   61.0   58.0   2757  6.15  6.12  3.74         3           4             3
 53940    0.75      Ideal     D     SI2   62.2   55.0   2757  5.83  5.87  3.64         2           0             3

53940 rows × 13 columns
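
Because the same LabelEncoder instance is re-fit for each column, le.classes_ ends up holding only the clarity categories. A minimal sketch (an added illustration, not the notebook's own step) to inspect the category-to-code mapping per column; note the codes are assigned alphabetically, not by diamond quality:

for col in ['cut', 'color', 'clarity']:
    enc = LabelEncoder().fit(df[col])
    # map each category label to the integer code it receives
    print(col, dict(zip(enc.classes_, enc.transform(enc.classes_))))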

In [6]: X = df.iloc[:, [0,4,5,7,8,9,10,11,12]].values


y = df.iloc[:, 6].values

In [7]: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 9)

In [8]: model = LinearRegression()


model.fit(X_train, y_train)

Out[8]: LinearRegression()

In [9]: y_predicted = model.predict(X_test)


print(y_predicted)

[3078.74736951  701.93349033  675.37805081 ... 1002.19084154 9403.96138641
  779.67556312]

In [10]: df1 = df.drop(['cut','color','clarity'], axis = 1)


df1

Out[10]:  carat  depth  table  price     x     y     z  Cut Code  Color Code  Clarity Code
      1    0.23   61.5   55.0    326  3.95  3.98  2.43         2           1             3
      2    0.21   59.8   61.0    326  3.89  3.84  2.31         3           1             2
      3    0.23   56.9   65.0    327  4.05  4.07  2.31         1           1             4
      4    0.29   62.4   58.0    334  4.20  4.23  2.63         3           5             5
      5    0.31   63.3   58.0    335  4.34  4.35  2.75         1           6             3
    ...     ...    ...    ...    ...   ...   ...   ...       ...         ...           ...
  53936    0.72   60.8   57.0   2757  5.75  5.76  3.50         2           0             2
  53937    0.72   63.1   55.0   2757  5.69  5.75  3.61         1           0             2
  53938    0.70   62.8   60.0   2757  5.66  5.68  3.56         4           0             2
  53939    0.86   61.0   58.0   2757  6.15  6.12  3.74         3           4             3
  53940    0.75   62.2   55.0   2757  5.83  5.87  3.64         2           0             3

53940 rows × 10 columns

In [11]: sns.heatmap(df1.corr(), annot = True)

Out[11]: <Axes: >

In [12]: print(model.coef_)
print(model.intercept_)

[11011.31208688  -157.31520899   -92.67029686 -1162.11876188    29.54875697
   -19.67239509    75.71937425  -266.08666228   287.29962724]
16147.588909677248
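
The bare coef_ array is hard to match to the nine columns selected in In [6]; a small sketch (an addition, assuming the same column order, positions 0,4,5,7,8,9,10,11,12) pairing each coefficient with its feature name:

feature_names = ['carat', 'depth', 'table', 'x', 'y', 'z',
                 'Cut Code', 'Color Code', 'Clarity Code']
print(pd.Series(model.coef_, index = feature_names))  # coefficient per feature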

In [13]: from sklearn.metrics import r2_score, mean_squared_error

In [14]: r_square = r2_score(y_test, y_predicted)


print(r_square)
MSE = mean_squared_error(y_test, y_predicted)
print(MSE)

0.8828127415197767
1847551.1693083993
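
The MSE above is in squared price units; its square root puts the typical error back in the same units as price. A short follow-up sketch reusing the values already computed:

RMSE = np.sqrt(MSE)   # root mean squared error, roughly 1359 for the MSE printed above
print(RMSE)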

In [15]: #testing multicollinearity through variance_inflation_factor


from statsmodels.stats.outliers_influence import variance_inflation_factor

In [16]: def calculate_vif(df1):
    vif = pd.DataFrame()
    vif["features"] = df1.columns
    vif["VIF_Value"] = [variance_inflation_factor(df1.values, i) for i in range(df1.shape[1])]
    return vif

In [17]: features = df1.iloc[:,[0,1,2,4,5,6,7,8,9]]


calculate_vif(features)

Out[17]:       features    VIF_Value
        0         carat    71.280619
        1         depth   532.475672
        2         table   521.222987
        3             x  1285.950850
        4             y   531.783819
        5             z   542.223353
        6      Cut Code     7.452458
        7    Color Code     3.656752
        8  Clarity Code     6.216738
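
The very large VIFs for depth, table, x, y and z reflect that the physical dimensions are almost determined by carat (and by each other). One possible remedy, sketched here as an illustration rather than as the notebook's own step, is to drop x, y and z and recompute the VIFs:

reduced = features.drop(columns = ['x', 'y', 'z'])   # drop the near-duplicate size columns
calculate_vif(reduced)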

In [18]: #checking the normality assumption with a Q-Q plot


residual = y_test - y_predicted
fig, ax = plt.subplots(figsize=(4,6))
sp.probplot(residual, plot = ax)
plt.show()

In [19]: #checking normality with a density (KDE) plot of the residuals


sns.displot(residual,kind='kde')
plt.show()

C:\Users\prakh\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
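
The Q-Q and density plots are visual checks; a formal test can back them up. A minimal sketch (an added illustration; the 0.05 cut-off mirrors the Breusch-Pagan cell further below) using the Jarque-Bera test, the same statistic that also appears in the statsmodels summary later on:

jb_stat, jb_p = sp.jarque_bera(residual)   # test statistic and p-value
print(jb_stat, jb_p)
if jb_p < 0.05:
    print("Residuals deviate from normality")
else:
    print("Normality is plausible")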

In [20]: #checking homoscedasticity


plt.scatter(y_predicted, residual, color = "purple")
plt.axhline(y=0, color = "red", linestyle = '--')
plt.xlabel("Predicted Price")
plt.ylabel("Residual")

Out[20]: Text(0, 0.5, 'Residual')

In [21]: plt.scatter(y_test, y_predicted)


plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")

Out[21]: Text(0, 0.5, 'Predicted Prices')

In [22]: # statsmodels
import statsmodels.api as sm

In [23]: X1 = sm.add_constant(X)
X1

Out[23]: array([[ 1.  ,  0.23, 61.5 , ...,  2.  ,  1.  ,  3.  ],
                [ 1.  ,  0.21, 59.8 , ...,  3.  ,  1.  ,  2.  ],
                [ 1.  ,  0.23, 56.9 , ...,  1.  ,  1.  ,  4.  ],
                ...,
                [ 1.  ,  0.7 , 62.8 , ...,  4.  ,  0.  ,  2.  ],
                [ 1.  ,  0.86, 61.  , ...,  3.  ,  4.  ,  3.  ],
                [ 1.  ,  0.75, 62.2 , ...,  2.  ,  0.  ,  3.  ]])

In [24]: my_model = sm.OLS(y_test, X_test).fit()


print(my_model.summary())

OLS Regression Results


=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.940
Model: OLS Adj. R-squared (uncentered): 0.940
Method: Least Squares F-statistic: 2.347e+04
Date: Mon, 06 Nov 2023 Prob (F-statistic): 0.00
Time: 20:52:34 Log-Likelihood: -1.1651e+05
No. Observations: 13485 AIC: 2.330e+05
Df Residuals: 13476 BIC: 2.331e+05
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x1 1.012e+04 113.621 89.090 0.000 9899.755 1.03e+04
x2 32.9777 4.778 6.902 0.000 23.612 42.343
x3 -40.0606 4.858 -8.246 0.000 -49.584 -30.537
x4 -2749.9261 229.970 -11.958 0.000 -3200.699 -2299.153
x5 2571.6502 223.625 11.500 0.000 2133.314 3009.987
x6 -993.2114 109.778 -9.047 0.000 -1208.391 -778.032
x7 60.5960 11.753 5.156 0.000 37.559 83.633
x8 -265.0724 7.275 -36.437 0.000 -279.332 -250.813
x9 302.6413 7.069 42.811 0.000 288.785 316.498
==============================================================================
Omnibus: 3507.028 Durbin-Watson: 1.984
Prob(Omnibus): 0.000 Jarque-Bera (JB): 95656.342
Skew: 0.653 Prob(JB): 0.00
Kurtosis: 15.982 Cond. No. 2.29e+03
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 2.29e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
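
The summary above is fit on the test split without an intercept, so its coefficients are not directly comparable to the sklearn model. A minimal sketch (an addition, not the notebook's step) fitting OLS on the training data with a constant, which should reproduce model.intercept_ and model.coef_ up to rounding:

ols_train = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(ols_train.params)   # first entry is the intercept; the rest match model.coef_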

In [25]: #checking autocorrelation of errors


plt.plot(residual, color="pink")
plt.grid()
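
The residual line plot is only a visual check for autocorrelation; statsmodels also exposes the Durbin-Watson statistic directly. A small sketch (an added illustration, assuming the ordering of the test rows is meaningful for this check):

from statsmodels.stats.stattools import durbin_watson
print(durbin_watson(residual))   # values near 2 suggest little autocorrelation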

In [26]: #checking multicollinearity through heatmap


sns.heatmap(df1.iloc[:, [0,1,2,4,5,6,7,8,9]].corr(), annot = True)

Out[26]: <Axes: >

In [53]: #heteroscedasticity using the Breusch-Pagan test


import statsmodels.tools.tools as smt
import statsmodels.stats.diagnostic
import statsmodels.api as sm

In [41]: ivar = df.iloc[:, [0,4,5,7,8,9,10,11,12]]


ivarc = smt.add_constant(data = ivar, prepend = True)
print(ivarc.head())

   const  carat  depth  table     x     y     z  Cut Code  Color Code  \
1    1.0   0.23   61.5   55.0  3.95  3.98  2.43         2           1
2    1.0   0.21   59.8   61.0  3.89  3.84  2.31         3           1
3    1.0   0.23   56.9   65.0  4.05  4.07  2.31         1           1
4    1.0   0.29   62.4   58.0  4.20  4.23  2.63         3           5
5    1.0   0.31   63.3   58.0  4.34  4.35  2.75         1           6

   Clarity Code
1             3
2             2
3             4
4             5
5             3

Breusch-Pagan Test

residuals² = γ̂₀ + γ̂₁·X₁ + γ̂₂·X₂ + ... + γ̂ₙ·Xₙ

The squared residuals of the original regression are regressed on the explanatory variables; a significant fit indicates heteroscedasticity.

In [42]: residual = y_test - y_predicted

In [43]: my_model1 = sm.OLS(y_train, X_train).fit()


print(my_model1)

<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001C534A8E750>

In [44]: y_test.shape

Out[44]: (13485,)

In [45]: X_test.reshape(-1)
X_test.shape

Out[45]: (13485, 9)

In [46]: X1 = sm.add_constant(X_test, prepend = True)


X1

Out[46]: array([[ 1.  ,  0.71, 61.  , ...,  2.  ,  3.  ,  4.  ],
                [ 1.  ,  0.38, 62.  , ...,  2.  ,  6.  ,  7.  ],
                [ 1.  ,  0.31, 60.6 , ...,  3.  ,  4.  ,  7.  ],
                ...,
                [ 1.  ,  0.43, 61.6 , ...,  2.  ,  2.  ,  3.  ],
                [ 1.  ,  1.53, 62.5 , ...,  2.  ,  4.  ,  3.  ],
                [ 1.  ,  0.31, 62.4 , ...,  4.  ,  2.  ,  6.  ]])

In [47]: #It has 10 columns because it has an additional column of constants added to it
X1.shape

Out[47]: (13485, 10)

In [48]: print(y_test.shape)
print(y_predicted.shape)

(13485,)
(13485,)

In [54]: from statsmodels.stats.diagnostic import het_breuschpagan

In [56]: bptest = het_breuschpagan(residual, exog_het = X1)

p_value = bptest[1]
p_value
if p_value < 0.05:
    print("Heteroscedasticity is present")
else:
    print("Homoscedasticity is assumed")

Heteroscedasticity is present
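
het_breuschpagan returns four values, of which only the Lagrange-multiplier p-value is used above; a short sketch labelling all of them:

bp_labels = ['LM statistic', 'LM p-value', 'F statistic', 'F p-value']
print(dict(zip(bp_labels, bptest)))   # full Breusch-Pagan output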

In [57]: print("Describe Prakhar in one adjective")

secret_word = "chad"
guess = " "
guess_count = 0
guess_limit = 5
out_of_guesses = False

while guess != secret_word and not(out_of_guesses):
    if guess_count < guess_limit:
        guess = input("Enter your guess: ")
        guess_count += 1
    else:
        out_of_guesses = True

if out_of_guesses:
    print("Hehehe, Loser")
else:
    print("He is indeed")

#Game with a guess count:

#To do that, we need to create a few more variables. First, we need a variable to store the number of guesses.
#Down in the while loop, every time we go through the loop, we increment that guess count.
#The last variable, out_of_guesses, is a boolean that simply tells us whether the user is out of guesses or not.
#We set it to False because at the start of the game the user is not yet out of guesses.
#Now, before we let the user guess the word, we need to make sure that the user is not out of guesses yet.

Describe Prakhar in one adjective


Enter your guess: lauda
Enter your guess: chad
He is indeed

In [58]: #for loops: you just loop over a collection of items.
#For example, you can loop over the letters of a particular word.
#To run a for loop, you write "for" and then specify a variable; that variable takes a different value
#every time we go through the loop.

#The first line of the code says: for every letter (the loop variable) in the word I have specified, do something.
#The second line tells you what that something is: simply printing the variable I specified.
for letter in "Giraffe Academy":
    print(letter)

G
i
r
a
f
f
e

A
c
a
d
e
m
y

In [ ]:
