0% found this document useful (0 votes)
16 views8 pages

Sem 4.1

The document details a data analysis process using the 'mtcars' dataset, focusing on calculating various statistics such as mean, median, and range for the 'mpg' and 'hp' columns. It employs techniques like jackknife resampling and bootstrapping to estimate standard errors and confidence intervals. Additionally, it includes visualizations of bootstrap sample means to illustrate the distribution of these estimates.
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOC, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
16 views8 pages

Sem 4.1

The document details a data analysis process using the 'mtcars' dataset, focusing on calculating various statistics such as mean, median, and range for the 'mpg' and 'hp' columns. It employs techniques like jackknife resampling and bootstrapping to estimate standard errors and confidence intervals. Additionally, it includes visualizations of bootstrap sample means to illustrate the distribution of these estimates.
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOC, PDF, TXT or read online on Scribd
You are on page 1/ 8

import pandas as pd

import numpy as np

data=pd.read_csv("C:/Users/LENOVO/Downloads/mtcars.csv")
data.head()

import statistics
mean=statistics.mean(data.mpg)
print(mean)

OUTPUT: 20.090625

data['jackknife_mean_mpg']=np.nan
for i in range (len(data)):
data_minus_one=data.drop(index=i)
mean_value_minus_one=data_minus_one['mpg'].mean()
data.at[i,'jackknife_mean_mpg']=mean_value_minus_one
print(data.at[i,'jackknife_mean_mpg'])

20.06129032258064 20.26451612903226
20.06129032258064 19.693548387096776
20.00322580645161 19.758064516129032
20.048387096774192 19.64516129032258
20.135483870967743 20.045161290322575
20.154838709677417 20.23870967741935
20.277419354838706 20.24838709677419
19.951612903225808 20.309677419354834
20.00322580645161 20.119354838709675
20.119354838709675 19.85806451612903
20.164516129032254 19.899999999999995
20.20967741935484 19.75806451612903
20.180645161290318 20.229032258064514
20.248387096774195 20.103225806451608

import scipy.stats as stats


n=32
standard_error=np.std(data['mpg']/np.sqrt(n))
jackknife_mean_mpg=np.mean(data['jackknife_mean_mpg'])
jackknife_standard_error=np.sqrt(((n-1)/n)*np.sum((data['jackknife_mean_mpg']
-jackknife_mean_mpg)**2))
t_value=stats.t.ppf(0.975,df=n-1)
confidence_interval=[np.mean(data['mpg'])+factor*t_value*standard_error for
factor in[-1,1]]
print(standard_error)
print(jackknife_mean_mpg)
print(jackknife_standard_error)
print(t_value)
print(confidence_interval)

OUTPUT:

1.0486445806577978
20.090624999999996
1.0654239593728139
2.0395134463964077
[17.951900277257703, 22.229349722742302]

import statistics
mean=statistics.mean(data.hp)
print(mean)

OUTPUT: 146.6875

data['jackknife_mean_hp']=np.nan
for i in range (len(data)):
data_pseudo=data.drop(index=i)
mean_value_pseudo=data_pseudo['hp'].mean()
data.at[i,'jackknife_mean_hp']=mean_value_pseudo
print(data.at[i,'jackknife_mean_hp'])

147.8709677419355 149.41935483870967
147.8709677419355 148.3548387096774
148.41935483870967 147.4516129032258
147.8709677419355 147.4516129032258
145.7741935483871 145.61290322580646
148.03225806451613 145.61290322580646
143.51612903225808

import scipy.stats as stats


n=32
standard_error=np.std(data['hp']/np.sqrt(n))
jackknife_mean_hp=np.mean(data['jackknife_mean_hp'])
jackknife_standard_error=np.sqrt(((n-1)/n)*np.sum((data['jackknife_mean_hp']-
jackknife_mean_hp)**2))
t_value=stats.t.ppf(0.975,df=n-1)
confidence_interval=[np.mean(data['hp'])+factor*t_value*standard_error for
factor in[-1,1]]
print(standard_error)
print(jackknife_mean_hp)
print(jackknife_standard_error)
print(t_value)
print(confidence_interval)

OUTPUT:
11.929434243382522
146.6875
12.120317311599985
2.0395134463964077
[122.35725845271959, 171.01774154728042]

MEDIAN
import statistics
median=statistics.median(data.mpg)
print(median)

OUTPUT: 19.2

data['jackknife_mean_mpg']=np.nan
for i in range (len(data)):
data_minus_one=data.drop(index=i)
median_value_minus_one=data_minus_one['mpg'].median()
data.at[i,'jackknife_mean_mpg']=median_value_minus_one
print(data.at[i,'jackknife_mean_mpg'])

19.2 19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2 19.2
19.2
19.2
19.2

import scipy.stats as stats


n=32
standard_error=np.std(data['mpg']/np.sqrt(n))
jackknife_median_mpg=np.median(data['jackknife_mean_mpg'])
jackknife_standard_error=np.sqrt(((n-1)/n)*np.sum((data['jackknife_mean_mpg']
-jackknife_mean_mpg)**2))
t_value=stats.t.ppf(0.975,df=n-1)
confidence_interval=[np.median(data['mpg'])+factor*t_value*standard_error for
factor in[-1,1]]
print(standard_error)
print(jackknife_median_mpg)
print(jackknife_standard_error)
print(t_value)
print(confidence_interval)

1.0486445806577978
19.2
4.958790135645469
2.0395134463964077
[17.0612752772577, 21.3387247227423]

import statistics
mean=statistics.median(data.hp)
print(median)

OUTPUT: 19.2

data['jackknife_mean_hp']=np.nan
for i in range (len(data)):
data_pseudo=data.drop(index=i)
median_value_pseudo=data_pseudo['hp'].median()
data.at[i,'jackknife_mean_hp']=median_value_pseudo
print(data.at[i,'jackknife_mean_hp'])

123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0 123.0
123.0

import scipy.stats as stats


n=32
standard_error=np.std(data['hp']/np.sqrt(n))
jackknife_median_hp=np.median(data['jackknife_mean_hp'])
jackknife_standard_error=np.sqrt(((n-1)/n)*np.sum((data['jackknife_mean_hp']-
jackknife_mean_hp)**2))
t_value=stats.t.ppf(0.975,df=n-1)
confidence_interval=[np.median(data['hp'])+factor*t_value*standard_error for
factor in[-1,1]]
print(standard_error)
print(jackknife_median_hp)
print(jackknife_standard_error)
print(t_value)
print(confidence_interval)

11.929434243382522
123.0
131.88641834453614
2.0395134463964077
[98.66975845271959, 147.33024154728042]
jackknife_median_hp=np.mean(data['jackknife_mean_hp'])
print(jackknife_median_hp)

OUTPUT: 123.0

RANGE
max=data['mpg'].max()
min=data['mpg'].min()
print(f"Maximum MPG: {max}")
print(f"Minimum MPG: {min}")

OUTPUT:
Maximum MPG: 33.9
Minimum MPG: 10.4
range_mpg=max-min
print(range_mpg)

OUTPUT:23.5

data['jackknife_range_mpg']=np.nan
for i in range (len(data)):
data_minus_one=data.drop(index=i)
max_value_minus_one=data_minus_one['mpg'].max()
min_value_minus_one=data_minus_one['mpg'].min()
range_value_minus_one=max_value_minus_one-min_value_minus_one
data.at[i,'jackknife_range_mpg']=range_value_minus_one
print(data.at[i,'jackknife_range_mpg'])

23.5 23.5
23.5 22.0
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5 23.5
23.5
23.5

import scipy.stats as stats


n=32
standard_error=np.std(data['mpg']/np.sqrt(n))
jackknife_range_mpg=np.mean(data['jackknife_range_mpg'])
jackknife_standard_error1=np.sqrt(((n-1)/n)*np.sum((data['jackknife_range_mpg
']-jackknife_range_mpg)**2))
t_value1=stats.t.ppf(0.975,df=n-1)
confidence_interval1=[np.median(data['mpg'])+factor*t_value*standard_error
for factor in[-1,1]]
print(standard_error)
print(jackknife_range_mpg)
print(jackknife_standard_error1)
print(t_value1)
print(confidence_interval1)

OUTPUT:

1.0486445806577978
23.453125
1.453125
2.0395134463964077
[17.0612752772577, 21.3387247227423]
BOOTSTRAPPING
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import glm

data=pd.read_csv("C:/Users/LENOVO/Downloads/mtcars.csv")
data.head()

model mpg cyl disp hp drat wt qsec vs am gear


\
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4

1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4

2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4

3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3

4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3

carb
0 4
1 4
2 1
3 1
4 2

sample_size=len(data)
data_bootstrap_sample=data['hp'].sample(n=sample_size, replace=True)
print("Samples:", data_bootstrap_sample)
sample_mean=data_bootstrap_sample.mean()
print(sample_mean)

Samples: 2 93 8 95
1 110 17 66
31 109 31 109
30 335 5 105
15 215 8 95
4 175 5 105
21 150 28 264
8 95 26 91
25 66 8 95
12 180 16 230
22 150 11 180
18 52 15 215
4 175 22 150
10 123 25 66
9 123 10 123
20 97
10 123

Name: hp, dtype: int64


136.25

def create_bootstrap_samples(sample_size=len(data), n_sample=1000):


sample_means=[]
for i in range(n_sample):
data_bootstrap_sample=data['hp'].sample(n=sample_size, replace=True)
sample_mean=data_bootstrap_sample.mean()
sample_means.append(sample_mean)
return pd.Series(sample_means)

sample_means=create_bootstrap_samples()
sample_means.plot(kind='hist', bins=20, title='Distribution of bootstrap
sample means')
<Axes: title={'center': 'Distribution of bootstrap sample means'},
ylabel='Frequency'>

ci_lower=sample_means.quantile(q=0.025)
ci_upper=sample_means.quantile(q=0.975)
print(ci_lower)
print(ci_upper)

124.084375
171.44453124999998

import matplotlib.pyplot as plt

sample_means.plot(kind='hist', bins=20, title='confidence interval of the


sample means')
plt.axvline(ci_lower,color='green', ls='--')
plt.axvline(ci_upper, color='red', ls='--')
# for disp
sample_size=len(data)
data_bootstrap_sample=data['disp'].sample(n=sample_size, replace=True)
print("Samples:", data_bootstrap_sample)
sample_mean=data_bootstrap_sample.mean()
print(sample_mean)

Samples: 4 360.0 20 120.1


24 400.0 22 304.0
9 167.6 19 71.1
3 258.0 19 71.1
26 120.3 13 275.8
5 225.0 24 400.0
20 120.1 13 275.8
31 121.0 12 275.8
30 301.0 19 71.1
21 318.0 16 440.0
18 75.7 28 351.0
5 225.0 26 120.3
25 79.0 18 75.

def create_bootstrap_samples(sample_size=len(data), n_sample=1000):


sample_means=[]
for i in range(n_sample):
data_bootstrap_sample=data['disp'].sample(n=sample_size,
replace=True)
sample_mean=data_bootstrap_sample.mean()
sample_means.append(sample_mean)
return pd.Series(sample_means)
sample_means=create_bootstrap_samples()
sample_means.plot(kind='hist', bins=20, title='Distribution of bootstrap
sample means')
ci_lower=sample_means.quantile(q=0.025)
ci_upper=sample_means.quantile(q=0.975)
print(ci_lower)
print(ci_upper)
sample_means.plot(kind='hist', bins=20, title='confidence interval of the
sample means')
plt.axvline(ci_lower,color='yellow', ls='--')
plt.axvline(ci_upper, color='blue', ls='--')

You might also like