In [2]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import scipy.stats as sp
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
In [3]: df = pd.read_csv(r"C:\Users\prakh\OneDrive\Desktop\Financial Modeling\diamonds.csv", index_col = 0)
df
Out[3]: carat cut color clarity depth table price x y z
1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
... ... ... ... ... ... ... ... ... ... ...
53936 0.72 Ideal D SI1 60.8 57.0 2757 5.75 5.76 3.50
53937 0.72 Good D SI1 63.1 55.0 2757 5.69 5.75 3.61
53938 0.70 Very Good D SI1 62.8 60.0 2757 5.66 5.68 3.56
53939 0.86 Premium H SI2 61.0 58.0 2757 6.15 6.12 3.74
53940 0.75 Ideal D SI2 62.2 55.0 2757 5.83 5.87 3.64
53940 rows × 10 columns
In [4]: le = LabelEncoder()
In [5]: df['Cut Code'] = le.fit_transform(df.cut)
df['Color Code'] = le.fit_transform(df.color)
df['Clarity Code'] = le.fit_transform(df.clarity)
df
Out[5]: carat cut color clarity depth table price x y z Cut Code Color Code Clarity Code
1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 2 1 3
2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 3 1 2
3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 1 1 4
4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 3 5 5
5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 1 6 3
... ... ... ... ... ... ... ... ... ... ... ... ... ...
53936 0.72 Ideal D SI1 60.8 57.0 2757 5.75 5.76 3.50 2 0 2
53937 0.72 Good D SI1 63.1 55.0 2757 5.69 5.75 3.61 1 0 2
53938 0.70 Very Good D SI1 62.8 60.0 2757 5.66 5.68 3.56 4 0 2
53939 0.86 Premium H SI2 61.0 58.0 2757 6.15 6.12 3.74 3 4 3
53940 0.75 Ideal D SI2 62.2 55.0 2757 5.83 5.87 3.64 2 0 3
53940 rows × 13 columns
In [6]: X = df.iloc[:, [0,4,5,7,8,9,10,11,12]].values
y = df.iloc[:, 6].values
In [7]: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 9)
In [8]: model = LinearRegression()
model.fit(X_train, y_train)
Out[8]: ▾ LinearRegression
LinearRegression()
In [9]: y_predicted = model.predict(X_test)
print(y_predicted)
[3078.74736951 701.93349033 675.37805081 ... 1002.19084154 9403.96138641
779.67556312]
In [10]: df1 = df.drop(['cut','color','clarity'], axis = 1)
df1
Out[10]: carat depth table price x y z Cut Code Color Code Clarity Code
1 0.23 61.5 55.0 326 3.95 3.98 2.43 2 1 3
2 0.21 59.8 61.0 326 3.89 3.84 2.31 3 1 2
3 0.23 56.9 65.0 327 4.05 4.07 2.31 1 1 4
4 0.29 62.4 58.0 334 4.20 4.23 2.63 3 5 5
5 0.31 63.3 58.0 335 4.34 4.35 2.75 1 6 3
... ... ... ... ... ... ... ... ... ... ...
53936 0.72 60.8 57.0 2757 5.75 5.76 3.50 2 0 2
53937 0.72 63.1 55.0 2757 5.69 5.75 3.61 1 0 2
53938 0.70 62.8 60.0 2757 5.66 5.68 3.56 4 0 2
53939 0.86 61.0 58.0 2757 6.15 6.12 3.74 3 4 3
53940 0.75 62.2 55.0 2757 5.83 5.87 3.64 2 0 3
53940 rows × 10 columns
In [11]: sns.heatmap(df1.corr(), annot = True)
<Axes: >
Out[11]:
In [12]: print(model.coef_)
print(model.intercept_)
[11011.31208688 -157.31520899 -92.67029686 -1162.11876188
29.54875697 -19.67239509 75.71937425 -266.08666228
287.29962724]
16147.588909677248
In [13]: from sklearn.metrics import r2_score, mean_squared_error
In [14]: r_square = r2_score(y_test, y_predicted)
print(r_square)
MSE = mean_squared_error(y_test, y_predicted)
print(MSE)
0.8828127415197767
1847551.1693083993
In [15]: #testing multicollinearity through variable_inflation_factor
from statsmodels.stats.outliers_influence import variance_inflation_factor
In [16]: def calculate_vif(df1):
vif = pd.DataFrame()
vif["features"] = df1.columns
vif["VIF_Value"] = [variance_inflation_factor(df1.values,i) for i in range(df1.shape[1])]
return(vif)
In [17]: features = df1.iloc[:,[0,1,2,4,5,6,7,8,9]]
calculate_vif(features)
Out[17]: features VIF_Value
0 carat 71.280619
1 depth 532.475672
2 table 521.222987
3 x 1285.950850
4 y 531.783819
5 z 542.223353
6 Cut Code 7.452458
7 Color Code 3.656752
8 Clarity Code 6.216738
In [18]: #normality assumption through qq graph
residual = y_test - y_predicted
fig, ax = plt.subplots(figsize=(4,6))
sp.probplot(residual, plot = ax)
plt.show()
In [19]: #normality through a normal distribution
sns.displot(residual,kind='kde')
plt.show()
C:\Users\prakh\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
self._figure.tight_layout(*args, **kwargs)
In [20]: #checking homoscedasticity
plt.scatter(y_predicted, residual, color = "purple")
plt.axhline(y=0, color = "red", linestyle = '--')
plt.xlabel("Predicted Price")
plt.ylabel("Residual")
Text(0, 0.5, 'Residual')
Out[20]:
In [21]: plt.scatter(y_test, y_predicted)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
Text(0, 0.5, 'Predicted Prices')
Out[21]:
In [22]: # statsmodels
import statsmodels.api as sm
In [23]: X1 = sm.add_constant(X)
X1
array([[ 1. , 0.23, 61.5 , ..., 2. , 1. , 3. ],
Out[23]:
[ 1. , 0.21, 59.8 , ..., 3. , 1. , 2. ],
[ 1. , 0.23, 56.9 , ..., 1. , 1. , 4. ],
...,
[ 1. , 0.7 , 62.8 , ..., 4. , 0. , 2. ],
[ 1. , 0.86, 61. , ..., 3. , 4. , 3. ],
[ 1. , 0.75, 62.2 , ..., 2. , 0. , 3. ]])
In [24]: my_model = sm.OLS(y_test, X_test).fit()
print(my_model.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.940
Model: OLS Adj. R-squared (uncentered): 0.940
Method: Least Squares F-statistic: 2.347e+04
Date: Mon, 06 Nov 2023 Prob (F-statistic): 0.00
Time: 20:52:34 Log-Likelihood: -1.1651e+05
No. Observations: 13485 AIC: 2.330e+05
Df Residuals: 13476 BIC: 2.331e+05
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x1 1.012e+04 113.621 89.090 0.000 9899.755 1.03e+04
x2 32.9777 4.778 6.902 0.000 23.612 42.343
x3 -40.0606 4.858 -8.246 0.000 -49.584 -30.537
x4 -2749.9261 229.970 -11.958 0.000 -3200.699 -2299.153
x5 2571.6502 223.625 11.500 0.000 2133.314 3009.987
x6 -993.2114 109.778 -9.047 0.000 -1208.391 -778.032
x7 60.5960 11.753 5.156 0.000 37.559 83.633
x8 -265.0724 7.275 -36.437 0.000 -279.332 -250.813
x9 302.6413 7.069 42.811 0.000 288.785 316.498
==============================================================================
Omnibus: 3507.028 Durbin-Watson: 1.984
Prob(Omnibus): 0.000 Jarque-Bera (JB): 95656.342
Skew: 0.653 Prob(JB): 0.00
Kurtosis: 15.982 Cond. No. 2.29e+03
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 2.29e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [25]: #checking autocorrelation of errors
plt.plot(residual, color="pink")
plt.grid()
In [26]: #checking multicollinearity through heatmap
sns.heatmap(df1.iloc[:, [0,1,2,4,5,6,7,8,9]].corr(), annot = True)
<Axes: >
Out[26]:
In [53]: #heteroscadasticity using the Breusch-Pagan test
import statsmodels.tools.tools as smt
import statsmodels.stats.diagnostic
import statsmodels.api as sm
In [41]: ivar = df.iloc[:, [0,4,5,7,8,9,10,11,12]]
ivarc = smt.add_constant(data = ivar, prepend = True)
print(ivarc.head())
const carat depth table x y z Cut Code Color Code \
1 1.0 0.23 61.5 55.0 3.95 3.98 2.43 2 1
2 1.0 0.21 59.8 61.0 3.89 3.84 2.31 3 1
3 1.0 0.23 56.9 65.0 4.05 4.07 2.31 1 1
4 1.0 0.29 62.4 58.0 4.20 4.23 2.63 3 5
5 1.0 0.31 63.3 58.0 4.34 4.35 2.75 1 6
Clarity Code
1 3
2 2
3 4
4 5
5 3
Breusch-Pagan Test
2
^ ^ + γ
^ X1 + γ
^ X2+. . . . . γ
^ Xn
residuals = γ 0 1 2 n
In [42]: residual = y_test - y_predicted
In [43]: my_model1 = sm.OLS(y_train, X_train).fit()
print(my_model1)
<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001C534A8E750>
In [44]: y_test.shape
(13485,)
Out[44]:
In [45]: X_test.reshape(-1)
X_test.shape
(13485, 9)
Out[45]:
In [46]: X1 = sm.add_constant(X_test, prepend = True)
X1
array([[ 1. , 0.71, 61. , ..., 2. , 3. , 4. ],
Out[46]:
[ 1. , 0.38, 62. , ..., 2. , 6. , 7. ],
[ 1. , 0.31, 60.6 , ..., 3. , 4. , 7. ],
...,
[ 1. , 0.43, 61.6 , ..., 2. , 2. , 3. ],
[ 1. , 1.53, 62.5 , ..., 2. , 4. , 3. ],
[ 1. , 0.31, 62.4 , ..., 4. , 2. , 6. ]])
In [47]: #It has 10 columns because it has an additional column of constants added to it
X1.shape
(13485, 10)
Out[47]:
In [48]: print(y_test.shape)
print(y_predicted.shape)
(13485,)
(13485,)
In [54]: from statsmodels.stats.diagnostic import het_breuschpagan
In [56]: bptest = het_breuschpagan(residual, exog_het = X1)
p_value = bptest[1]
p_value
if p_value < 0.05:
print("Heteroscedasticity is present")
else:
print("Homoscedasticity is assumed")
Heteroscedasticity is present
In [57]: print("Describe Prakhar in one adjective")
secret_word = "chad"
guess = " "
guess_count = 0
guess_limit = 5
out_of_guesses = False
while guess != secret_word and not(out_of_guesses):
if guess_count < guess_limit:
guess = input("Enter your guess: ")
guess_count += 1
else:
out_of_guesses = True
if out_of_guesses:
print("Hehehe, Loser")
else:
print("He is indeed")
#Game with a count:
#To do that, we need to create a few more variables. Firstly, we need to create the variable where we are gonna store the
#the number of guesses
#Down in the while loop, everytime we go through the loop, we wanna increment that guess
#The last variable out of guesses is a bullion simply telling us if the user is out of guesses or not
#We set it equal to False kyunki game ki start me toh the user is not out of guesses and shit
#Now, before we let the user guess the word, we need to make sure that the user is not out of guesses as of yet.
Describe Prakhar in one adjective
Enter your guess: lauda
Enter your guess: chad
He is indeed
In [58]: #for loops: You just loop over a different collection of items.
#For example: You can loop over the letters for a particular word etc.
#How you run a for loop: You write for and then you specify a variable, that variable is gonna represent a different value
#every time we go through the for loop
#The first line of the code says that for every letter (variable) in the word I have specified, I want to do something
#The second line simply tells you what that something is: that something is simply me printing the variable I specified
for letter in "Giraffe Academy":
print(letter)
G
i
r
a
f
f
e
A
c
a
d
e
m
y
In [ ]: