Diamond Price Analysis and Factors
Regression on Python

Uploaded by prakhar.22169

In [2]: import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import scipy.stats as sp
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [3]: df = pd.read_csv(r"C:\Users\prakh\OneDrive\Desktop\Financial Modeling\diamonds.csv", index_col = 0)


df

Out[3]:        carat        cut color clarity  depth  table  price     x     y     z
       1        0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
       2        0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
       3        0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
       4        0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
       5        0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
     ...         ...        ...   ...     ...    ...    ...    ...   ...   ...   ...
   53936        0.72      Ideal     D     SI1   60.8   57.0   2757  5.75  5.76  3.50
   53937        0.72       Good     D     SI1   63.1   55.0   2757  5.69  5.75  3.61
   53938        0.70  Very Good     D     SI1   62.8   60.0   2757  5.66  5.68  3.56
   53939        0.86    Premium     H     SI2   61.0   58.0   2757  6.15  6.12  3.74
   53940        0.75      Ideal     D     SI2   62.2   55.0   2757  5.83  5.87  3.64

53940 rows × 10 columns
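
Before encoding and modelling, a quick structural check of the freshly loaded frame can be useful; a minimal sketch (an addition, not part of the original notebook) using only pandas calls already imported above:

df.info()               # column dtypes and non-null counts, printed directly
print(df.isna().sum())  # missing values per column
print(df.describe())    # summary statistics for the numeric columns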

In [4]: le = LabelEncoder()

In [5]: df['Cut Code'] = le.fit_transform(df.cut)


df['Color Code'] = le.fit_transform(df.color)
df['Clarity Code'] = le.fit_transform(df.clarity)
df

Out[5]:  carat        cut color clarity  depth  table  price     x     y     z  Cut Code  Color Code  Clarity Code
     1    0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43         2           1             3
     2    0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31         3           1             2
     3    0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31         1           1             4
     4    0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63         3           5             5
     5    0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75         1           6             3
   ...     ...        ...   ...     ...    ...    ...    ...   ...   ...   ...       ...         ...           ...
 53936    0.72      Ideal     D     SI1   60.8   57.0   2757  5.75  5.76  3.50         2           0             2
 53937    0.72       Good     D     SI1   63.1   55.0   2757  5.69  5.75  3.61         1           0             2
 53938    0.70  Very Good     D     SI1   62.8   60.0   2757  5.66  5.68  3.56         4           0             2
 53939    0.86    Premium     H     SI2   61.0   58.0   2757  6.15  6.12  3.74         3           4             3
 53940    0.75      Ideal     D     SI2   62.2   55.0   2757  5.83  5.87  3.64         2           0             3

53940 rows × 13 columns
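
Because the same LabelEncoder instance is re-fit for each column, le.classes_ ends up holding only the clarity categories. A minimal sketch (an added illustration, not the notebook's own step) to inspect the category-to-code mapping per column; note the codes are assigned alphabetically, not by diamond quality:

for col in ['cut', 'color', 'clarity']:
    enc = LabelEncoder().fit(df[col])
    # map each category label to the integer code it receives
    print(col, dict(zip(enc.classes_, enc.transform(enc.classes_))))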

In [6]: X = df.iloc[:, [0,4,5,7,8,9,10,11,12]].values


y = df.iloc[:, 6].values

In [7]: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 9)

In [8]: model = LinearRegression()


model.fit(X_train, y_train)

Out[8]: LinearRegression()

In [9]: y_predicted = model.predict(X_test)


print(y_predicted)

[3078.74736951  701.93349033  675.37805081 ... 1002.19084154 9403.96138641
  779.67556312]

In [10]: df1 = df.drop(['cut','color','clarity'], axis = 1)


df1

Out[10]:  carat  depth  table  price     x     y     z  Cut Code  Color Code  Clarity Code
      1    0.23   61.5   55.0    326  3.95  3.98  2.43         2           1             3
      2    0.21   59.8   61.0    326  3.89  3.84  2.31         3           1             2
      3    0.23   56.9   65.0    327  4.05  4.07  2.31         1           1             4
      4    0.29   62.4   58.0    334  4.20  4.23  2.63         3           5             5
      5    0.31   63.3   58.0    335  4.34  4.35  2.75         1           6             3
    ...     ...    ...    ...    ...   ...   ...   ...       ...         ...           ...
  53936    0.72   60.8   57.0   2757  5.75  5.76  3.50         2           0             2
  53937    0.72   63.1   55.0   2757  5.69  5.75  3.61         1           0             2
  53938    0.70   62.8   60.0   2757  5.66  5.68  3.56         4           0             2
  53939    0.86   61.0   58.0   2757  6.15  6.12  3.74         3           4             3
  53940    0.75   62.2   55.0   2757  5.83  5.87  3.64         2           0             3

53940 rows × 10 columns

In [11]: sns.heatmap(df1.corr(), annot = True)

Out[11]: <Axes: >

In [12]: print(model.coef_)
print(model.intercept_)

[11011.31208688  -157.31520899   -92.67029686 -1162.11876188    29.54875697
   -19.67239509    75.71937425  -266.08666228   287.29962724]
16147.588909677248
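
The bare coef_ array is hard to match to the nine columns selected in In [6]; a small sketch (an addition, assuming the same column order, positions 0,4,5,7,8,9,10,11,12) pairing each coefficient with its feature name:

feature_names = ['carat', 'depth', 'table', 'x', 'y', 'z',
                 'Cut Code', 'Color Code', 'Clarity Code']
print(pd.Series(model.coef_, index = feature_names))  # coefficient per feature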

In [13]: from sklearn.metrics import r2_score, mean_squared_error

In [14]: r_square = r2_score(y_test, y_predicted)


print(r_square)
MSE = mean_squared_error(y_test, y_predicted)
print(MSE)

0.8828127415197767
1847551.1693083993
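
The MSE above is in squared price units; its square root puts the typical error back in the same units as price. A short follow-up sketch reusing the values already computed:

RMSE = np.sqrt(MSE)   # root mean squared error, roughly 1359 for the MSE printed above
print(RMSE)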

In [15]: #testing multicollinearity through variance_inflation_factor


from statsmodels.stats.outliers_influence import variance_inflation_factor

In [16]: def calculate_vif(df1):
    vif = pd.DataFrame()
    vif["features"] = df1.columns
    vif["VIF_Value"] = [variance_inflation_factor(df1.values, i) for i in range(df1.shape[1])]
    return vif

In [17]: features = df1.iloc[:,[0,1,2,4,5,6,7,8,9]]


calculate_vif(features)

Out[17]:       features    VIF_Value
        0         carat    71.280619
        1         depth   532.475672
        2         table   521.222987
        3             x  1285.950850
        4             y   531.783819
        5             z   542.223353
        6      Cut Code     7.452458
        7    Color Code     3.656752
        8  Clarity Code     6.216738
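
The very large VIFs for depth, table, x, y and z reflect that the physical dimensions are almost determined by carat (and by each other). One possible remedy, sketched here as an illustration rather than as the notebook's own step, is to drop x, y and z and recompute the VIFs:

reduced = features.drop(columns = ['x', 'y', 'z'])   # drop the near-duplicate size columns
calculate_vif(reduced)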

In [18]: #checking the normality assumption with a Q-Q plot


residual = y_test - y_predicted
fig, ax = plt.subplots(figsize=(4,6))
sp.probplot(residual, plot = ax)
plt.show()

In [19]: #checking normality with a density (KDE) plot of the residuals


sns.displot(residual,kind='kde')
plt.show()

C:\Users\prakh\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
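
The Q-Q and density plots are visual checks; a formal test can back them up. A minimal sketch (an added illustration; the 0.05 cut-off mirrors the Breusch-Pagan cell further below) using the Jarque-Bera test, the same statistic that also appears in the statsmodels summary later on:

jb_stat, jb_p = sp.jarque_bera(residual)   # test statistic and p-value
print(jb_stat, jb_p)
if jb_p < 0.05:
    print("Residuals deviate from normality")
else:
    print("Normality is plausible")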

In [20]: #checking homoscedasticity


plt.scatter(y_predicted, residual, color = "purple")
plt.axhline(y=0, color = "red", linestyle = '--')
plt.xlabel("Predicted Price")
plt.ylabel("Residual")

Out[20]: Text(0, 0.5, 'Residual')

In [21]: plt.scatter(y_test, y_predicted)


plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")

Out[21]: Text(0, 0.5, 'Predicted Prices')

In [22]: # statsmodels
import statsmodels.api as sm

In [23]: X1 = sm.add_constant(X)
X1

Out[23]: array([[ 1.  ,  0.23, 61.5 , ...,  2.  ,  1.  ,  3.  ],
                [ 1.  ,  0.21, 59.8 , ...,  3.  ,  1.  ,  2.  ],
                [ 1.  ,  0.23, 56.9 , ...,  1.  ,  1.  ,  4.  ],
                ...,
                [ 1.  ,  0.7 , 62.8 , ...,  4.  ,  0.  ,  2.  ],
                [ 1.  ,  0.86, 61.  , ...,  3.  ,  4.  ,  3.  ],
                [ 1.  ,  0.75, 62.2 , ...,  2.  ,  0.  ,  3.  ]])

In [24]: my_model = sm.OLS(y_test, X_test).fit()


print(my_model.summary())

OLS Regression Results


=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.940
Model: OLS Adj. R-squared (uncentered): 0.940
Method: Least Squares F-statistic: 2.347e+04
Date: Mon, 06 Nov 2023 Prob (F-statistic): 0.00
Time: 20:52:34 Log-Likelihood: -1.1651e+05
No. Observations: 13485 AIC: 2.330e+05
Df Residuals: 13476 BIC: 2.331e+05
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x1 1.012e+04 113.621 89.090 0.000 9899.755 1.03e+04
x2 32.9777 4.778 6.902 0.000 23.612 42.343
x3 -40.0606 4.858 -8.246 0.000 -49.584 -30.537
x4 -2749.9261 229.970 -11.958 0.000 -3200.699 -2299.153
x5 2571.6502 223.625 11.500 0.000 2133.314 3009.987
x6 -993.2114 109.778 -9.047 0.000 -1208.391 -778.032
x7 60.5960 11.753 5.156 0.000 37.559 83.633
x8 -265.0724 7.275 -36.437 0.000 -279.332 -250.813
x9 302.6413 7.069 42.811 0.000 288.785 316.498
==============================================================================
Omnibus: 3507.028 Durbin-Watson: 1.984
Prob(Omnibus): 0.000 Jarque-Bera (JB): 95656.342
Skew: 0.653 Prob(JB): 0.00
Kurtosis: 15.982 Cond. No. 2.29e+03
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 2.29e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
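
The summary above is fit on the test split without an intercept, so its coefficients are not directly comparable to the sklearn model. A minimal sketch (an addition, not the notebook's step) fitting OLS on the training data with a constant, which should reproduce model.intercept_ and model.coef_ up to rounding:

ols_train = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(ols_train.params)   # first entry is the intercept; the rest match model.coef_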

In [25]: #checking autocorrelation of errors


plt.plot(residual, color="pink")
plt.grid()
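
The residual line plot is only a visual check for autocorrelation; statsmodels also exposes the Durbin-Watson statistic directly. A small sketch (an added illustration, assuming the ordering of the test rows is meaningful for this check):

from statsmodels.stats.stattools import durbin_watson
print(durbin_watson(residual))   # values near 2 suggest little autocorrelation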

In [26]: #checking multicollinearity through heatmap


sns.heatmap(df1.iloc[:, [0,1,2,4,5,6,7,8,9]].corr(), annot = True)

Out[26]: <Axes: >

In [53]: #heteroscedasticity using the Breusch-Pagan test


import statsmodels.tools.tools as smt
import statsmodels.stats.diagnostic
import statsmodels.api as sm

In [41]: ivar = df.iloc[:, [0,4,5,7,8,9,10,11,12]]


ivarc = smt.add_constant(data = ivar, prepend = True)
print(ivarc.head())

   const  carat  depth  table     x     y     z  Cut Code  Color Code  \
1    1.0   0.23   61.5   55.0  3.95  3.98  2.43         2           1
2    1.0   0.21   59.8   61.0  3.89  3.84  2.31         3           1
3    1.0   0.23   56.9   65.0  4.05  4.07  2.31         1           1
4    1.0   0.29   62.4   58.0  4.20  4.23  2.63         3           5
5    1.0   0.31   63.3   58.0  4.34  4.35  2.75         1           6

   Clarity Code
1             3
2             2
3             4
4             5
5             3

Breusch-Pagan Test

residuals² = γ̂₀ + γ̂₁·X₁ + γ̂₂·X₂ + ... + γ̂ₙ·Xₙ

The squared residuals of the original regression are regressed on the explanatory variables; a significant fit indicates heteroscedasticity.

In [42]: residual = y_test - y_predicted

In [43]: my_model1 = sm.OLS(y_train, X_train).fit()


print(my_model1)

<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001C534A8E750>

In [44]: y_test.shape

Out[44]: (13485,)

In [45]: X_test.reshape(-1)
X_test.shape

Out[45]: (13485, 9)

In [46]: X1 = sm.add_constant(X_test, prepend = True)


X1

Out[46]: array([[ 1.  ,  0.71, 61.  , ...,  2.  ,  3.  ,  4.  ],
                [ 1.  ,  0.38, 62.  , ...,  2.  ,  6.  ,  7.  ],
                [ 1.  ,  0.31, 60.6 , ...,  3.  ,  4.  ,  7.  ],
                ...,
                [ 1.  ,  0.43, 61.6 , ...,  2.  ,  2.  ,  3.  ],
                [ 1.  ,  1.53, 62.5 , ...,  2.  ,  4.  ,  3.  ],
                [ 1.  ,  0.31, 62.4 , ...,  4.  ,  2.  ,  6.  ]])

In [47]: #It has 10 columns because it has an additional column of constants added to it
X1.shape

Out[47]: (13485, 10)

In [48]: print(y_test.shape)
print(y_predicted.shape)

(13485,)
(13485,)

In [54]: from statsmodels.stats.diagnostic import het_breuschpagan

In [56]: bptest = het_breuschpagan(residual, exog_het = X1)

p_value = bptest[1]
p_value
if p_value < 0.05:
    print("Heteroscedasticity is present")
else:
    print("Homoscedasticity is assumed")

Heteroscedasticity is present
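
het_breuschpagan returns four values, of which only the Lagrange-multiplier p-value is used above; a short sketch labelling all of them:

bp_labels = ['LM statistic', 'LM p-value', 'F statistic', 'F p-value']
print(dict(zip(bp_labels, bptest)))   # full Breusch-Pagan output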

In [57]: print("Describe Prakhar in one adjective")

secret_word = "chad"
guess = " "
guess_count = 0
guess_limit = 5
out_of_guesses = False

while guess != secret_word and not(out_of_guesses):
    if guess_count < guess_limit:
        guess = input("Enter your guess: ")
        guess_count += 1
    else:
        out_of_guesses = True

if out_of_guesses:
    print("Hehehe, Loser")
else:
    print("He is indeed")

#Game with a guess count:

#To do that, we need to create a few more variables. First, we need a variable to store the number of guesses.
#Down in the while loop, every time we go through the loop, we increment that guess count.
#The last variable, out_of_guesses, is a boolean that simply tells us whether the user is out of guesses or not.
#We set it to False because at the start of the game the user is not yet out of guesses.
#Now, before we let the user guess the word, we need to make sure that the user is not out of guesses yet.

Describe Prakhar in one adjective


Enter your guess: lauda
Enter your guess: chad
He is indeed

In [58]: #for loops: you just loop over a collection of items.
#For example, you can loop over the letters of a particular word.
#To run a for loop, you write "for" and then specify a variable; that variable takes a different value
#every time we go through the loop.

#The first line of the code says: for every letter (the loop variable) in the word I have specified, do something.
#The second line tells you what that something is: simply printing the variable I specified.
for letter in "Giraffe Academy":
    print(letter)

G
i
r
a
f
f
e

A
c
a
d
e
m
y

In [ ]:
