In [6]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import scipy.stats as stats
import seaborn as sns
import os
os.getcwd()
os.chdir('C:\\Users\\USER\\Downloads')
In [8]:
df = pd.read_csv('Wholesale+Customers+Data.csv')
In [9]:
df.head()
Out[9]:
   Buyer/Spender Channel Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicatessen
0              1  Retail  Other  12669  9656     7561     214              2674          1338
1              2  Retail  Other   7057  9810     9568    1762              3293          1776
2              3  Retail  Other   6353  8808     7684    2405              3516          7844
3              4   Hotel  Other  13265  1196     4221    6404               507          1788
4              5  Retail  Other  22615  5410     7198    3915              1777          5185
In [10]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Buyer/Spender 440 non-null int64
1 Channel 440 non-null object
2 Region 440 non-null object
3 Fresh 440 non-null int64
4 Milk 440 non-null int64
5 Grocery 440 non-null int64
6 Frozen 440 non-null int64
7 Detergents_Paper 440 non-null int64
8 Delicatessen 440 non-null int64
dtypes: int64(7), object(2)
memory usage: 31.1+ KB
In [11]:
# There are 440 entries across 9 columns, with no null values.
# 7 columns are integers and 2 are objects.
In [12]:
df.describe(include='all')
Out[12]:
        Buyer/Spender Channel Region          Fresh          Milk       Grocery        Frozen  ...
count      440.000000     440    440     440.000000    440.000000    440.000000    440.000000  ...
unique            NaN       2      3            NaN           NaN           NaN           NaN  ...
top               NaN   Hotel  Other            NaN           NaN           NaN           NaN  ...
freq              NaN     298    316            NaN           NaN           NaN           NaN  ...
mean       220.500000     NaN    NaN   12000.297727   5796.265909   7951.277273   3071.931818  ...
std        127.161315     NaN    NaN   12647.328865   7380.377175   9503.162829   4854.673333  ...
min          1.000000     NaN    NaN       3.000000     55.000000      3.000000     25.000000  ...
25%        110.750000     NaN    NaN    3127.750000   1533.000000   2153.000000    742.250000  ...
50%        220.500000     NaN    NaN    8504.000000   3627.000000   4755.500000   1526.000000  ...
75%        330.250000     NaN    NaN   16933.750000   7190.250000  10655.750000   3554.250000  ...
max        440.000000     NaN    NaN  112151.000000  73498.000000  92780.000000  60869.000000  ...
In [13]:
# The Hotel channel and the Other region account for the largest share of this data, while Retail spends the least; spend by channel and region is compared below.
In [14]:
df2 = df.copy()
df2['Total'] = df2['Fresh'] + df2['Milk'] + df2['Grocery'] + df2['Frozen'] + df2['Detergents_Paper'] + df2['Delicatessen']
In [15]:
#Dropping the "Buyer/Spender" column as it is just an identifier and not useful for analysis
df2 = df2.drop(['Buyer/Spender'], axis = 1)
In [16]:
df2.head()
Out[16]:
Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicatessen Total
0 Retail Other 12669 9656 7561 214 2674 1338 34112
1 Retail Other 7057 9810 9568 1762 3293 1776 33266
2 Retail Other 6353 8808 7684 2405 3516 7844 36610
3 Hotel Other 13265 1196 4221 6404 507 1788 27381
4 Retail Other 22615 5410 7198 3915 1777 5185 46100
In [17]:
df2.describe(include='all')
Out[17]:
       Channel Region          Fresh          Milk       Grocery        Frozen  Detergents_Paper  ...
count      440    440     440.000000    440.000000    440.000000    440.000000             440.0  ...
unique       2      3            NaN           NaN           NaN           NaN               NaN  ...
top      Hotel  Other            NaN           NaN           NaN           NaN               NaN  ...
freq       298    316            NaN           NaN           NaN           NaN               NaN  ...
mean       NaN    NaN   12000.297727   5796.265909   7951.277273   3071.931818            2881.4  ...
std        NaN    NaN   12647.328865   7380.377175   9503.162829   4854.673333            4767.8  ...
min        NaN    NaN       3.000000     55.000000      3.000000     25.000000               3.0  ...
25%        NaN    NaN    3127.750000   1533.000000   2153.000000    742.250000             256.7  ...
50%        NaN    NaN    8504.000000   3627.000000   4755.500000   1526.000000             816.5  ...
75%        NaN    NaN   16933.750000   7190.250000  10655.750000   3554.250000            3922.0  ...
max        NaN    NaN  112151.000000  73498.000000  92780.000000  60869.000000           40827.0  ...
In [20]:
df2.groupby('Region')['Total'].sum()
Out[20]:
Region
Lisbon 2386813
Oporto 1555088
Other 10677599
Name: Total, dtype: int64
In [24]:
plt.figure(figsize = (6,4))
df2.groupby('Region')['Total'].sum().plot.bar();
In [26]:
df2.groupby('Channel')['Total'].sum()
Out[26]:
Channel
Hotel 7999569
Retail 6619931
Name: Total, dtype: int64
In [27]:
df2.groupby('Channel')['Total'].sum().plot.bar();
In [28]:
pd.pivot_table(df2, values=['Total'], index=['Region'], aggfunc=np.sum)
Out[28]:
Total
Region
Lisbon 2386813
Oporto 1555088
Other 10677599
In [29]:
pd.pivot_table(df2, values=['Total'], index=['Channel'], aggfunc=np.sum)
Out[29]:
Total
Channel
Hotel 7999569
Retail 6619931
In [30]:
# The Hotel channel has the highest total spend and Retail the lowest.
In [31]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicatessen','Total'], index=['Region'], aggfunc=np.sum)
Out[31]:
Delicatessen Detergents_Paper Fresh Frozen Grocery Milk Total
Region
Lisbon 104327 204136 854833 231026 570037 422454 2386813
Oporto 54506 173311 464721 190132 433274 239144 1555088
Other 512110 890410 3960577 930492 2495251 1888759 10677599
In [32]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicatessen','Total'], index=['Region'], aggfunc=np.sum).plot.bar();
In [33]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicatessen','Total'], index=['Channel'], aggfunc=np.sum)
Out[33]:
Delicatessen Detergents_Paper Fresh Frozen Grocery Milk Total
Channel
Hotel 421955 235587 4015717 1116979 1180717 1028614 7999569
Retail 248988 1032270 1264414 234671 2317845 1521743 6619931
In [34]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicatessen','Total'], index=['Channel'], aggfunc=np.sum).plot.bar();
In [35]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicatessen','Total'], index=['Region','Channel'], aggfunc=np.sum)
Out[35]:
                Delicatessen  Detergents_Paper    Fresh  Frozen  Grocery     Milk    Total
Region Channel
Lisbon Hotel           70632             56081   761233  184512   237542   228342  1538342
       Retail          33695            148055    93600   46514   332495   194112   848471
Oporto Hotel           30965             13516   326215  160861   123074    64519   719150
       Retail          23541            159795   138506   29271   310200   174625   835938
Other  Hotel          320358            165990  2928269  771606   820101   735753  5742077
       Retail         191752            724420  1032308  158886  1675150  1153006  4935522
In [36]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicatessen','Total'], index=['Region','Channel'], aggfunc=np.sum).plot.bar();
In [37]:
pd.pivot_table(df2, values=['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicatessen','Total'], index=['Region'], columns=['Channel'], aggfunc=np.sum)
Out[37]:
        Delicatessen          Detergents_Paper            Fresh            Frozen            Grocery           ...
Channel        Hotel  Retail             Hotel  Retail    Hotel   Retail    Hotel  Retail      Hotel   Retail  ...
Region
Lisbon         70632   33695             56081  148055   761233    93600   184512   46514     237542   332495  ...
Oporto         30965   23541             13516  159795   326215   138506   160861   29271     123074   310200  ...
Other         320358  191752            165990  724420  2928269  1032308   771606  158886     820101  1675150  ...
In [38]:
pd.pivot_table(df2, values='Fresh', index=['Region'], columns=['Channel'], aggfunc=np.sum)
Out[38]:
Channel Hotel Retail
Region
Lisbon 761233 93600
Oporto 326215 138506
Other 2928269 1032308
In [39]:
# Of all the product categories, Fresh has the highest total spend and Delicatessen the lowest.
# The Other region accounts for the largest share in both channels, while Oporto accounts for the smallest.
In [40]:
# Coefficient of variation (std/mean) for the numeric columns
df.std(numeric_only=True) / df.mean(numeric_only=True)
Out[40]:
Buyer/Spender 0.576695
Fresh 1.053918
Milk 1.273299
Grocery 1.195174
Frozen 1.580332
Detergents_Paper 1.654647
Delicatessen 1.849407
dtype: float64
In [41]:
df.describe().T['std'] / df.describe().T['mean']
Out[41]:
Buyer/Spender 0.576695
Fresh 1.053918
Milk 1.273299
Grocery 1.195174
Frozen 1.580332
Detergents_Paper 1.654647
Delicatessen 1.849407
dtype: float64
In [42]:
sns.boxplot(data=df);
In [43]:
#Outliers are present in every spend column and are important; because of them we need to examine the data more carefully before drawing conclusions.
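A minimal sketch (not in the original notebook, assuming the df2 frame built earlier) of how the boxplot outliers could be counted with the 1.5*IQR rule:
# Count outliers per spend column using the 1.5*IQR rule (sketch; df2 assumed from earlier cells)
num_cols = ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicatessen']
q1 = df2[num_cols].quantile(0.25)
q3 = df2[num_cols].quantile(0.75)
iqr = q3 - q1
outliers = (df2[num_cols] < q1 - 1.5 * iqr) | (df2[num_cols] > q3 + 1.5 * iqr)
outliers.sum()  # number of outlying observations in each column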
In [49]:
#Analysis Report
#Based on this analysis, we should concentrate on the Hotel channel and the Other region, since they account for the largest share of total spend.
#Spending is highest on Fresh items and lowest on Delicatessen.
#Retail spends less overall, and most of its spend goes to Grocery.
In [50]:
#Problem 2 Clear Mountain State University
In [51]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
In [52]:
df = pd.read_csv('Survey-1.csv')
In [53]:
df.head()
Out[53]:
   ID  Gender  Age   Class       Major Grad Intention  GPA  Employment  Salary  Social Networking  ...
0   1  Female   20  Junior       Other            Yes  2.9   Full-Time    50.0                  1  ...
1   2    Male   23  Senior  Management            Yes  3.6   Part-Time    25.0                  1  ...
2   3    Male   21  Junior       Other            Yes  2.5   Part-Time    45.0                  2  ...
3   4    Male   21  Junior         CIS            Yes  2.5   Full-Time    40.0                  4  ...
4   5    Male   23  Senior       Other      Undecided  2.8  Unemployed    40.0                  2  ...
In [54]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 62 non-null int64
1 Gender 62 non-null object
2 Age 62 non-null int64
3 Class 62 non-null object
4 Major 62 non-null object
5 Grad Intention 62 non-null object
6 GPA 62 non-null float64
7 Employment 62 non-null object
8 Salary 62 non-null float64
9 Social Networking 62 non-null int64
10 Satisfaction 62 non-null int64
11 Spending 62 non-null int64
12 Computer 62 non-null object
13 Text Messages 62 non-null int64
dtypes: float64(2), int64(6), object(6)
memory usage: 6.9+ KB
In [55]:
# There are 62 entries and no null values.
# The data has 2 float, 6 integer and 6 object columns.
In [56]:
df.describe()
Out[56]:
              ID        Age        GPA     Salary  Social Networking  Satisfaction    Spending  Text Messages
count  62.000000  62.000000  62.000000  62.000000          62.000000     62.000000   62.000000           62.0
mean   31.500000  21.129032   3.129032  48.548387           1.516129      3.741935  482.016129          246.2
std    18.041619   1.431311   0.377388  12.080912           0.844305      1.213793  221.953805          214.4
min     1.000000  18.000000   2.300000  25.000000           0.000000      1.000000  100.000000            0.0
25%    16.250000  20.000000   2.900000  40.000000           1.000000      3.000000  312.500000          100.0
50%    31.500000  21.000000   3.150000  50.000000           1.000000      4.000000  500.000000          200.0
75%    46.750000  22.000000   3.400000  55.000000           2.000000      4.000000  600.000000          300.0
max    62.000000  26.000000   3.900000  80.000000           4.000000      6.000000 1400.000000          900.0
In [57]:
pd.crosstab(df['Gender'],df['Major'])
Out[57]:
Major   Accounting  CIS  Economics/Finance  International Business  Management  Other  ...
Gender
Female           3    3                  7                       4           4      3  ...
Male             4    1                  4                       2           6      4  ...
In [58]:
pd.crosstab(df['Gender'],df['Grad Intention'])
Out[58]:
Grad Intention No Undecided Yes
Gender
Female 9 13 11
Male 3 9 17
In [59]:
pd.crosstab(df['Gender'],df['Employment'])
Out[59]:
Employment Full-Time Part-Time Unemployed
Gender
Female 3 24 6
Male 7 19 3
In [60]:
pd.crosstab(df['Gender'],df['Computer'])
Out[60]:
Computer Desktop Laptop Tablet
Gender
Female 2 29 2
Male 3 26 0
In [61]:
#Probability that a randomly selected university student is male or female
df['Gender'].value_counts()
Out[61]:
Female 33
Male 29
Name: Gender, dtype: int64
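To turn these counts into probabilities, one option (a sketch on the same df) is to normalise the counts:
# P(Male) and P(Female) as relative frequencies
df['Gender'].value_counts(normalize=True)   # Female 33/62 ≈ 0.53, Male 29/62 ≈ 0.47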
In [62]:
#Conditional probability of each major among the male students in the university
pd.crosstab(df['Gender'],df['Major'])
Out[62]:
Major   Accounting  CIS  Economics/Finance  International Business  Management  Other  ...
Gender
Female           3    3                  7                       4           4      3  ...
Male             4    1                  4                       2           6      4  ...
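Row-normalising the crosstab gives the conditional distribution of majors directly; a sketch using the same frame:
# P(Major | Male): normalize='index' divides each row by its row total
pd.crosstab(df['Gender'], df['Major'], normalize='index').loc['Male']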
In [63]:
#probability that a randomly chosen student is a male and intends to graduate
pd.crosstab(df['Gender'],df['Grad Intention'])
Out[63]:
Grad Intention No Undecided Yes
Gender
Female 9 13 11
Male 3 9 17
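The joint probability can be read off the table as 17/62; a sketch computing it on the same df:
# P(Male and Grad Intention = Yes)
p_male_and_grad = ((df['Gender'] == 'Male') & (df['Grad Intention'] == 'Yes')).mean()
p_male_and_grad   # 17/62 ≈ 0.274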
In [64]:
#probability that a randomly selected student is a female and does NOT have a laptop.
pd.crosstab(df['Gender'],df['Computer'])
Out[64]:
Computer Desktop Laptop Tablet
Gender
Female 2 29 2
Male 3 26 0
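From the table, 2 + 2 = 4 of the 62 students are female without a laptop; a sketch of the same calculation:
# P(Female and Computer != Laptop)
p_female_no_laptop = ((df['Gender'] == 'Female') & (df['Computer'] != 'Laptop')).mean()
p_female_no_laptop   # 4/62 ≈ 0.065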
In [65]:
#probability that a randomly chosen student is a male or has full-time employment?
pd.crosstab(df['Gender'],df['Employment'])
Out[65]:
Employment Full-Time Part-Time Unemployed
Gender
Female 3 24 6
Male 7 19 3
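Using inclusion-exclusion on the counts above (29 males, 10 full-time, 7 both), a sketch of the union probability:
# P(Male or Full-Time) = P(Male) + P(Full-Time) - P(Male and Full-Time)
p_male = (df['Gender'] == 'Male').mean()                                        # 29/62
p_full_time = (df['Employment'] == 'Full-Time').mean()                          # 10/62
p_both = ((df['Gender'] == 'Male') & (df['Employment'] == 'Full-Time')).mean()  # 7/62
p_male + p_full_time - p_both                                                   # 32/62 ≈ 0.516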
In [66]:
#Conditional probability of each major, given that the randomly chosen student is female
pd.crosstab(df['Gender'],df['Major'])
Out[66]:
Major   Accounting  CIS  Economics/Finance  International Business  Management  Other  ...
Gender
Female           3    3                  7                       4           4      3  ...
Male             4    1                  4                       2           6      4  ...
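A sketch (assuming the same crosstab) of how a conditional probability for a particular major can be read off; 'Management' is used purely as an illustrative choice:
# P(Major | Female) for every major, then pick one
p_major_given_female = pd.crosstab(df['Gender'], df['Major'], normalize='index').loc['Female']
p_major_given_female['Management']   # 4/33 ≈ 0.121 for this illustrative major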
In [67]:
#Contingency table of Gender and Intent to Graduate at 2 levels (Yes/No); the Undecided students are dropped.
pd.crosstab(df['Gender'],df[df['Grad Intention']!='Undecided']['Grad Intention'])
Out[67]:
Grad Intention No Yes
Gender
Female 9 11
Male 3 17
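The probabilities hard-coded in the next cell can be derived from this reduced table; a minimal sketch:
# Drop the Undecided students and compute the probabilities used below
decided = df[df['Grad Intention'] != 'Undecided']
len(decided)                                                   # 40 students
p_female = (decided['Gender'] == 'Female').mean()              # 20/40
p_grad = (decided['Grad Intention'] == 'Yes').mean()           # 28/40
p_female_grad = ((decided['Gender'] == 'Female') &
                 (decided['Grad Intention'] == 'Yes')).mean()  # 11/40
p_female, p_grad, p_female_grad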
In [68]:
#Do you think the graduate intention and being female are independent events?
# Probabilities taken from the reduced table above (Undecided removed, 40 students in total)
p_female = 20/40          # P(Female)
p_grad = 28/40            # P(Grad Intention = Yes)
p_female_grad = 11/40     # P(Female and Yes)
p_female_int_grad = p_female * p_grad
if p_female_int_grad == p_female_grad:
    print('Graduate intention and female are independent events')
else:
    print('Graduate intention and female are dependent events')
Graduate intention and female are dependent events
In [69]:
#If a student is chosen randomly, what is the probability that his/her GPA is less than 3?
df[df['GPA']<3]['GPA'].value_counts().sum()
Out[69]:
17
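Dividing the 17 students by the 62 in the sample gives the probability directly; a one-line sketch:
# P(GPA < 3): the mean of the boolean mask equals count / total
(df['GPA'] < 3).mean()   # 17/62 ≈ 0.274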
In [70]:
#2.7.2.
#Find the conditional probability that a randomly selected male earns 50 or more.
#Find the conditional probability that a randomly selected female earns 50 or more.
df[df['Salary']>=50][['Gender','Salary']].value_counts()
Out[70]:
Gender Salary
Female 50.0 5
55.0 5
60.0 5
Male 50.0 4
55.0 3
60.0 3
Female 70.0 1
78.0 1
80.0 1
Male 52.0 1
54.0 1
65.0 1
80.0 1
dtype: int64
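From the counts above, 14 of the 29 males and 18 of the 33 females earn 50 or more; a sketch of the two conditional probabilities:
# P(Salary >= 50 | Gender)
earners = df[df['Salary'] >= 50]
p_male_50 = (earners['Gender'] == 'Male').sum() / (df['Gender'] == 'Male').sum()        # 14/29 ≈ 0.483
p_female_50 = (earners['Gender'] == 'Female').sum() / (df['Gender'] == 'Female').sum()  # 18/33 ≈ 0.545
p_male_50, p_female_50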
In [71]:
#Shapiro test
p_val = stats.shapiro(df['GPA']).pvalue
print('GPA p-value: ' + str(p_val))
p_val = stats.shapiro(df['Salary']).pvalue
print('Salary p-value: ' + str(p_val))
p_val = stats.shapiro(df['Spending']).pvalue
print('Spending p-value: ' + str(p_val))
p_val = stats.shapiro(df['Text Messages']).pvalue
print('Text msgs p-value: ' + str(p_val))
GPA p-value: 0.11204058676958084
Salary p-value: 0.028000956401228905
Spending p-value: 1.6854661225806922e-05
Text msgs p-value: 4.324040673964191e-06
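A small sketch (same df, assuming a 5% significance level) that states the Shapiro-Wilk conclusion for each variable:
# Shapiro-Wilk: p > alpha means we fail to reject normality
alpha = 0.05
for col in ['GPA', 'Salary', 'Spending', 'Text Messages']:
    p = stats.shapiro(df[col]).pvalue
    print(col, 'looks approximately normal' if p > alpha else 'deviates from normality', f'(p = {p:.3g})')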
In [72]:
# ABC Asphalt Shingles
In [77]:
df = pd.read_csv('A+&+B+shingles.csv')
In [78]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 36 non-null float64
1 B 31 non-null float64
dtypes: float64(2)
memory usage: 704.0 bytes
In [79]:
# There are 36 entries with 2 float columns; A has 36 non-null values and B has 31 non-null values.
In [80]:
df.describe()
Out[80]:
A B
count 36.000000 31.000000
mean 0.316667 0.273548
std 0.135731 0.137296
min 0.130000 0.100000
25% 0.207500 0.160000
50% 0.290000 0.230000
75% 0.392500 0.400000
max 0.720000 0.580000
In [81]:
df.head()
Out[81]:
A B
0 0.44 0.14
1 0.61 0.15
2 0.47 0.31
3 0.30 0.16
4 0.15 0.37
In [82]:
stats.ttest_1samp(df['A'],0.35,alternative='two-sided')
#One-sided claim, so halve the two-sided p-value
Out[82]:
Ttest_1sampResult(statistic=-1.4735046253382782, pvalue=0.14955266289815025)
In [83]:
stats.ttest_1samp(df['B'],0.35,alternative='two-sided',nan_policy='omit')
#One-sided claim, so halve the two-sided p-value
Out[83]:
Ttest_1sampResult(statistic=-3.1003313069986995, pvalue=0.004180954800638365)
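Rather than halving the two-sided p-value, the one-sided alternative can be passed directly (a sketch, assuming the claim being tested is that mean moisture content is below 0.35):
# One-sided (lower-tail) versions of the same one-sample t-tests
stats.ttest_1samp(df['A'], 0.35, alternative='less')
stats.ttest_1samp(df['B'], 0.35, alternative='less', nan_policy='omit')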
In [84]:
stats.levene(df.A,df.B.dropna())
Out[84]:
LeveneResult(statistic=0.23808965111555147, pvalue=0.6272312061867605)
In [85]:
stats.ttest_ind(df.A,df.B,equal_var=True,nan_policy='omit',alternative='two-sided')
Out[85]:
Ttest_indResult(statistic=1.2896282719661123, pvalue=0.2017496571835306)
In [ ]: