import pandas as pd
import numpy as np
# Read the mtcars CSV into a DataFrame, preview it, and print the mean of the
# mpg column as computed by the standard-library statistics module.
import statistics

data = pd.read_csv("C:/Users/LENOVO/Downloads/mtcars.csv")
data.head()  # notebook-style preview; the returned frame is not used
mean = statistics.mean(data["mpg"])
print(mean)
OUTPUT: 20.090625
# Jackknife (leave-one-out) replicates of the mean of mpg.
# For every row, the mean of mpg over all *other* rows is stored in the
# 'jackknife_mean_mpg' column at that row, and printed.
data['jackknife_mean_mpg'] = np.nan
# Iterate over the actual index labels: drop(index=...) and .at[...] are
# label-based, so positions from range(len(data)) are only correct when the
# frame happens to carry a default 0..n-1 index.
for label in data.index:
    data_minus_one = data.drop(index=label)
    mean_value_minus_one = data_minus_one['mpg'].mean()
    data.at[label, 'jackknife_mean_mpg'] = mean_value_minus_one
    print(data.at[label, 'jackknife_mean_mpg'])
20.06129032258064 20.26451612903226
20.06129032258064 19.693548387096776
20.00322580645161 19.758064516129032
20.048387096774192 19.64516129032258
20.135483870967743 20.045161290322575
20.154838709677417 20.23870967741935
20.277419354838706 20.24838709677419
19.951612903225808 20.309677419354834
20.00322580645161 20.119354838709675
20.119354838709675 19.85806451612903
20.164516129032254 19.899999999999995
20.20967741935484 19.75806451612903
20.180645161290318 20.229032258064514
20.248387096774195 20.103225806451608
# Classical and jackknife standard errors for the mean of mpg, followed by a
# 95% t-based confidence interval for that mean.
import scipy.stats as stats

# Number of observations, read from the data instead of being hard-coded.
n = len(data)

# Standard error of the mean: sample standard deviation over sqrt(n).
# ddof=1 matches the n-1 degrees of freedom of the t quantile used below.
# NOTE: the OUTPUT transcript pasted after this cell was recorded with the
# earlier population (ddof=0) formula, so its first and last values differ
# slightly from what this code prints.
standard_error = data['mpg'].std(ddof=1) / np.sqrt(n)

# Mean of the leave-one-out means held in the 'jackknife_mean_mpg' column.
jackknife_mean_mpg = np.mean(data['jackknife_mean_mpg'])

# Jackknife standard error: sqrt((n-1)/n * sum of squared deviations of the
# replicates from their own mean).
jackknife_standard_error = np.sqrt(
    ((n - 1) / n)
    * np.sum((data['jackknife_mean_mpg'] - jackknife_mean_mpg) ** 2)
)

# 0.975 quantile of Student's t with n-1 degrees of freedom (two-sided 95%).
t_value = stats.t.ppf(0.975, df=n - 1)

# [lower, upper] = mean -/+ t * standard error.
confidence_interval = [
    np.mean(data['mpg']) + factor * t_value * standard_error
    for factor in [-1, 1]
]

print(standard_error)
print(jackknife_mean_mpg)
print(jackknife_standard_error)
print(t_value)
print(confidence_interval)
OUTPUT:
1.0486445806577978
20.090624999999996
1.0654239593728139
2.0395134463964077
[17.951900277257703, 22.229349722742302]
# Print the mean of the hp column, computed with the statistics module.
import statistics

mean = statistics.mean(data["hp"])
print(mean)
OUTPUT: 146.6875
# Jackknife (leave-one-out) replicates of the mean of hp.
# For every row, the mean of hp over all *other* rows is stored in the
# 'jackknife_mean_hp' column at that row, and printed.
data['jackknife_mean_hp'] = np.nan
# drop(index=...) and .at[...] are label-based, so iterate over the real index
# labels rather than over positions from range(len(data)).
for label in data.index:
    data_pseudo = data.drop(index=label)
    mean_value_pseudo = data_pseudo['hp'].mean()
    data.at[label, 'jackknife_mean_hp'] = mean_value_pseudo
    print(data.at[label, 'jackknife_mean_hp'])
147.8709677419355 149.41935483870967
147.8709677419355 148.3548387096774
148.41935483870967 147.4516129032258
147.8709677419355 147.4516129032258
145.7741935483871 145.61290322580646
148.03225806451613 145.61290322580646
143.51612903225808
# Classical and jackknife standard errors for the mean of hp, followed by a
# 95% t-based confidence interval for that mean.
import scipy.stats as stats

# Number of observations, read from the data instead of being hard-coded.
n = len(data)

# Standard error of the mean: sample standard deviation over sqrt(n).
# ddof=1 matches the n-1 degrees of freedom of the t quantile used below.
# NOTE: the OUTPUT transcript pasted after this cell was recorded with the
# earlier population (ddof=0) formula, so its first and last values differ
# slightly from what this code prints.
standard_error = data['hp'].std(ddof=1) / np.sqrt(n)

# Mean of the leave-one-out means held in the 'jackknife_mean_hp' column.
jackknife_mean_hp = np.mean(data['jackknife_mean_hp'])

# Jackknife standard error: sqrt((n-1)/n * sum of squared deviations of the
# replicates from their own mean).
jackknife_standard_error = np.sqrt(
    ((n - 1) / n)
    * np.sum((data['jackknife_mean_hp'] - jackknife_mean_hp) ** 2)
)

# 0.975 quantile of Student's t with n-1 degrees of freedom (two-sided 95%).
t_value = stats.t.ppf(0.975, df=n - 1)

# [lower, upper] = mean -/+ t * standard error.
confidence_interval = [
    np.mean(data['hp']) + factor * t_value * standard_error
    for factor in [-1, 1]
]

print(standard_error)
print(jackknife_mean_hp)
print(jackknife_standard_error)
print(t_value)
print(confidence_interval)
OUTPUT:
11.929434243382522
146.6875
12.120317311599985
2.0395134463964077
[122.35725845271959, 171.01774154728042]
MEDIAN
# Print the median of the mpg column, computed with the statistics module.
import statistics

median = statistics.median(data["mpg"])
print(median)
OUTPUT: 19.2
# Jackknife (leave-one-out) replicates of the median of mpg.
# The replicates are written to the 'jackknife_mean_mpg' column, overwriting
# whatever that column held before; the name is kept because the cell that
# follows reads the replicates from this column.
data['jackknife_mean_mpg'] = np.nan
# drop(index=...) and .at[...] are label-based, so iterate over the real index
# labels rather than over positions from range(len(data)).
for label in data.index:
    data_minus_one = data.drop(index=label)
    median_value_minus_one = data_minus_one['mpg'].median()
    data.at[label, 'jackknife_mean_mpg'] = median_value_minus_one
    print(data.at[label, 'jackknife_mean_mpg'])
19.2 19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2
19.2
19.2
# Jackknife standard error and 95% t interval for the median of mpg.
# The leave-one-out medians are read from the 'jackknife_mean_mpg' column.
import scipy.stats as stats

# Number of observations, read from the data instead of being hard-coded.
n = len(data)

# Classical standard error of the *mean* of mpg (sample std / sqrt(n));
# printed for reference only, it is not a standard error of the median.
standard_error = data['mpg'].std(ddof=1) / np.sqrt(n)

# Median of the leave-one-out medians.
jackknife_median_mpg = np.median(data['jackknife_mean_mpg'])

# The jackknife formula centres the replicates on their own mean. The earlier
# code subtracted the scalar `jackknife_mean_mpg` left over from the mean
# analysis, which is unrelated to these median replicates.
replicate_center_mpg = np.mean(data['jackknife_mean_mpg'])
jackknife_standard_error = np.sqrt(
    ((n - 1) / n)
    * np.sum((data['jackknife_mean_mpg'] - replicate_center_mpg) ** 2)
)

# 0.975 quantile of Student's t with n-1 degrees of freedom (two-sided 95%).
t_value = stats.t.ppf(0.975, df=n - 1)

# Interval around the sample median using the jackknife standard error of the
# median, not the standard error of the mean.
# NOTE(review): the recorded replicates are all 19.2, which makes this
# standard error 0 and collapses the interval to a point. The jackknife is
# generally considered unreliable for non-smooth statistics such as the
# median; a bootstrap interval may be more appropriate.
confidence_interval = [
    np.median(data['mpg']) + factor * t_value * jackknife_standard_error
    for factor in [-1, 1]
]

# NOTE: the numbers pasted after this cell were produced by the earlier
# formulas and will not match what this code prints.
print(standard_error)
print(jackknife_median_mpg)
print(jackknife_standard_error)
print(t_value)
print(confidence_interval)
1.0486445806577978
19.2
4.958790135645469
2.0395134463964077
[17.0612752772577, 21.3387247227423]
# Print the median of the hp column, computed with the statistics module.
import statistics

# Bind the result to `median`: the earlier code assigned it to `mean` and then
# printed `median`, which still held the mpg median (19.2).
median = statistics.median(data.hp)
print(median)
# OUTPUT: 123.0
# Jackknife (leave-one-out) replicates of the median of hp.
# The replicates are written to the 'jackknife_mean_hp' column, overwriting
# whatever that column held before; the name is kept because the cell that
# follows reads the replicates from this column.
data['jackknife_mean_hp'] = np.nan
# drop(index=...) and .at[...] are label-based, so iterate over the real index
# labels rather than over positions from range(len(data)).
for label in data.index:
    data_pseudo = data.drop(index=label)
    median_value_pseudo = data_pseudo['hp'].median()
    data.at[label, 'jackknife_mean_hp'] = median_value_pseudo
    print(data.at[label, 'jackknife_mean_hp'])
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0
# Jackknife standard error and 95% t interval for the median of hp.
# The leave-one-out medians are read from the 'jackknife_mean_hp' column.
import scipy.stats as stats

# Number of observations, read from the data instead of being hard-coded.
n = len(data)

# Classical standard error of the *mean* of hp (sample std / sqrt(n));
# printed for reference only, it is not a standard error of the median.
standard_error = data['hp'].std(ddof=1) / np.sqrt(n)

# Median of the leave-one-out medians.
jackknife_median_hp = np.median(data['jackknife_mean_hp'])

# The jackknife formula centres the replicates on their own mean. The earlier
# code subtracted the scalar `jackknife_mean_hp` left over from the mean
# analysis, which is unrelated to these median replicates.
replicate_center_hp = np.mean(data['jackknife_mean_hp'])
jackknife_standard_error = np.sqrt(
    ((n - 1) / n)
    * np.sum((data['jackknife_mean_hp'] - replicate_center_hp) ** 2)
)

# 0.975 quantile of Student's t with n-1 degrees of freedom (two-sided 95%).
t_value = stats.t.ppf(0.975, df=n - 1)

# Interval around the sample median using the jackknife standard error of the
# median, not the standard error of the mean.
# NOTE(review): the recorded replicates are all 123.0, which makes this
# standard error 0 and collapses the interval to a point. The jackknife is
# generally considered unreliable for non-smooth statistics such as the
# median; a bootstrap interval may be more appropriate.
confidence_interval = [
    np.median(data['hp']) + factor * t_value * jackknife_standard_error
    for factor in [-1, 1]
]

# NOTE: the numbers pasted after this cell were produced by the earlier
# formulas and will not match what this code prints.
print(standard_error)
print(jackknife_median_hp)
print(jackknife_standard_error)
print(t_value)
print(confidence_interval)
11.929434243382522
123.0
131.88641834453614
2.0395134463964077
[98.66975845271959, 147.33024154728042]
# Average the values currently stored in the 'jackknife_mean_hp' column
# and print the result.
jackknife_median_hp = np.mean(
    data['jackknife_mean_hp']
)
print(jackknife_median_hp)
OUTPUT: 123.0
RANGE
# Largest and smallest mpg values and their difference (the sample range).
# The extremes are stored under descriptive names so that the builtin
# functions max() and min() are not shadowed for the rest of the script.
max_mpg = data['mpg'].max()
min_mpg = data['mpg'].min()
print(f"Maximum MPG: {max_mpg}")
print(f"Minimum MPG: {min_mpg}")
# OUTPUT:
# Maximum MPG: 33.9
# Minimum MPG: 10.4
range_mpg = max_mpg - min_mpg
print(range_mpg)
OUTPUT:23.5
# Jackknife (leave-one-out) replicates of the range (max - min) of mpg.
# For every row, the range of mpg over all *other* rows is stored in the
# 'jackknife_range_mpg' column at that row, and printed.
data['jackknife_range_mpg'] = np.nan
# drop(index=...) and .at[...] are label-based, so iterate over the real index
# labels rather than over positions from range(len(data)).
for label in data.index:
    data_minus_one = data.drop(index=label)
    max_value_minus_one = data_minus_one['mpg'].max()
    min_value_minus_one = data_minus_one['mpg'].min()
    range_value_minus_one = max_value_minus_one - min_value_minus_one
    data.at[label, 'jackknife_range_mpg'] = range_value_minus_one
    print(data.at[label, 'jackknife_range_mpg'])
23.5 23.5
23.5 22.0
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5
23.5
# Jackknife standard error and 95% t interval for the range of mpg.
# The leave-one-out ranges are read from the 'jackknife_range_mpg' column.
import scipy.stats as stats

# Number of observations, read from the data instead of being hard-coded.
n = len(data)

# Classical standard error of the *mean* of mpg (sample std / sqrt(n));
# printed for reference only, it is not a standard error of the range.
standard_error = data['mpg'].std(ddof=1) / np.sqrt(n)

# Mean of the leave-one-out ranges.
jackknife_range_mpg = np.mean(data['jackknife_range_mpg'])

# Jackknife standard error: sqrt((n-1)/n * sum of squared deviations of the
# replicates from their own mean). The column name is written on one line;
# the earlier literal was split by a line break inside the quotes.
jackknife_standard_error1 = np.sqrt(
    ((n - 1) / n)
    * np.sum((data['jackknife_range_mpg'] - jackknife_range_mpg) ** 2)
)

# 0.975 quantile of Student's t with n-1 degrees of freedom (two-sided 95%).
t_value1 = stats.t.ppf(0.975, df=n - 1)

# Interval for the range: centred on the observed range and built from this
# cell's own t quantile and jackknife standard error. The earlier code centred
# it on the median of mpg and reused `t_value` and the standard error of the
# mean from a previous cell.
observed_range_mpg = data['mpg'].max() - data['mpg'].min()
confidence_interval1 = [
    observed_range_mpg + factor * t_value1 * jackknife_standard_error1
    for factor in [-1, 1]
]

# NOTE: the OUTPUT transcript pasted after this cell was produced by the
# earlier formulas; its first and last values will not match.
print(standard_error)
print(jackknife_range_mpg)
print(jackknife_standard_error1)
print(t_value1)
print(confidence_interval1)
OUTPUT:
1.0486445806577978
23.453125
1.453125
2.0395134463964077
[17.0612752772577, 21.3387247227423]
BOOTSTRAPPING
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import glm
# Load mtcars from disk again for the bootstrap section and preview it.
data = pd.read_csv(
    "C:/Users/LENOVO/Downloads/mtcars.csv"
)
data.head()  # notebook-style preview; the returned frame is not used
model mpg cyl disp hp drat wt qsec vs am gear
\
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4
1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4
2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4
3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3
4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3
carb
0 4
1 4
2 1
3 1
4 2
# Draw one bootstrap resample of hp: as many values as there are rows,
# sampled with replacement. Print the resample and then its mean.
sample_size = len(data)
data_bootstrap_sample = data['hp'].sample(
    n=sample_size,
    replace=True,
)
print("Samples:", data_bootstrap_sample)
sample_mean = data_bootstrap_sample.mean()
print(sample_mean)
Samples: 2 93 8 95
1 110 17 66
31 109 31 109
30 335 5 105
15 215 8 95
4 175 5 105
21 150 28 264
8 95 26 91
25 66 8 95
12 180 16 230
22 150 11 180
18 52 15 215
4 175 22 150
10 123 25 66
9 123 10 123
20 97
10 123
Name: hp, dtype: int64
136.25
def create_bootstrap_samples(sample_size=None, n_sample=1000, column='hp',
                             frame=None):
    """Return the means of repeated bootstrap resamples of one column.

    Each resample draws ``sample_size`` values with replacement from
    ``frame[column]`` and contributes its mean to the result.

    Args:
        sample_size: Number of values per resample. ``None`` (the default)
            means "as many as the frame has rows", resolved at call time.
        n_sample: Number of bootstrap resamples to draw.
        column: Name of the column to resample; defaults to ``'hp'``.
        frame: DataFrame to resample from. ``None`` (the default) uses the
            module-level ``data`` frame.

    Returns:
        A ``pd.Series`` of length ``n_sample`` holding one mean per resample.
    """
    # Resolve defaults lazily so that a later reload of `data` is honoured
    # instead of freezing its length at function-definition time.
    source = data if frame is None else frame
    if sample_size is None:
        sample_size = len(source)
    values = source[column]
    sample_means = [
        values.sample(n=sample_size, replace=True).mean()
        for _ in range(n_sample)
    ]
    return pd.Series(sample_means)
# Draw the bootstrap means and show their distribution as a 20-bin histogram.
sample_means = create_bootstrap_samples()
# The title is kept on a single line: a string literal in single quotes cannot
# contain a raw line break.
sample_means.plot(
    kind='hist',
    bins=20,
    title='Distribution of bootstrap sample means',
)
<Axes: title={'center': 'Distribution of bootstrap sample means'},
ylabel='Frequency'>
# Percentile bootstrap interval: the 2.5% and 97.5% quantiles of the
# bootstrap means bound the central 95% of the resampled means.
ci_lower = sample_means.quantile(0.025)
ci_upper = sample_means.quantile(0.975)
print(ci_lower)
print(ci_upper)
124.084375
171.44453124999998
# Histogram of the bootstrap means with the interval bounds marked as dashed
# vertical lines (green = lower bound, red = upper bound).
import matplotlib.pyplot as plt

# The title is kept on a single line: a string literal in single quotes cannot
# contain a raw line break.
sample_means.plot(
    kind='hist',
    bins=20,
    title='confidence interval of the sample means',
)
plt.axvline(ci_lower, color='green', ls='--')
plt.axvline(ci_upper, color='red', ls='--')
# for disp
# Draw one bootstrap resample of disp: as many values as there are rows,
# sampled with replacement. Print the resample and then its mean.
sample_size = len(data)
data_bootstrap_sample = data['disp'].sample(
    n=sample_size,
    replace=True,
)
print("Samples:", data_bootstrap_sample)
sample_mean = data_bootstrap_sample.mean()
print(sample_mean)
Samples: 4 360.0 20 120.1
24 400.0 22 304.0
9 167.6 19 71.1
3 258.0 19 71.1
26 120.3 13 275.8
5 225.0 24 400.0
20 120.1 13 275.8
31 121.0 12 275.8
30 301.0 19 71.1
21 318.0 16 440.0
18 75.7 28 351.0
5 225.0 26 120.3
25 79.0 18 75.
def create_bootstrap_samples(sample_size=None, n_sample=1000, column='disp',
                             frame=None):
    """Return the means of repeated bootstrap resamples of one column.

    Each resample draws ``sample_size`` values with replacement from
    ``frame[column]`` and contributes its mean to the result.

    Args:
        sample_size: Number of values per resample. ``None`` (the default)
            means "as many as the frame has rows", resolved at call time.
        n_sample: Number of bootstrap resamples to draw.
        column: Name of the column to resample; defaults to ``'disp'``.
        frame: DataFrame to resample from. ``None`` (the default) uses the
            module-level ``data`` frame.

    Returns:
        A ``pd.Series`` of length ``n_sample`` holding one mean per resample.
    """
    # Resolve defaults lazily so that a later reload of `data` is honoured
    # instead of freezing its length at function-definition time.
    source = data if frame is None else frame
    if sample_size is None:
        sample_size = len(source)
    values = source[column]
    sample_means = [
        values.sample(n=sample_size, replace=True).mean()
        for _ in range(n_sample)
    ]
    return pd.Series(sample_means)
# Bootstrap the means, plot their distribution, compute the 95% percentile
# interval, and plot the distribution again with the interval bounds marked.
sample_means = create_bootstrap_samples()
# Titles are kept on a single line: a string literal in single quotes cannot
# contain a raw line break.
sample_means.plot(
    kind='hist',
    bins=20,
    title='Distribution of bootstrap sample means',
)

# 2.5% and 97.5% quantiles of the bootstrap means.
ci_lower = sample_means.quantile(q=0.025)
ci_upper = sample_means.quantile(q=0.975)
print(ci_lower)
print(ci_upper)

sample_means.plot(
    kind='hist',
    bins=20,
    title='confidence interval of the sample means',
)
# Dashed vertical lines: yellow = lower bound, blue = upper bound.
# `plt` is the matplotlib.pyplot alias imported earlier in the script.
plt.axvline(ci_lower, color='yellow', ls='--')
plt.axvline(ci_upper, color='blue', ls='--')