week 3 lecture
2025-08-25
library(tidyverse)
library(magrittr) #pipping operator %>%
library(tinytex) #knit to pdf
library(readxl) #read excel files
library(resampledata) #data sets from Chihara and Hesterberg's book
df <- read.csv("birthweight.csv", stringsAsFactors = T)
head(df)
## id age tobacco alcohol gender weight gestation smoker
## 1 1 30-34 No No Male 3827 40 No
## 2 2 30-34 No No Male 3629 38 No
## 3 3 35-39 No No Female 3062 37 No
## 4 4 20-24 No No Female 3430 39 No
## 5 5 25-29 No No Male 3827 38 No
## 6 6 35-39 No No Female 3119 39 No
ori.sample <- df %>% summarise(n = n(),
Mean = mean(weight),
SD = sd(weight))
ori.sample
## n Mean SD
## 1 1009 3448.26 487.736
set.seed(123)
weight <- df$weight
n <- length(weight)
N <- 10ˆ4
weight.boot <- numeric(N)
for (i in 1:N) {
samp <- sample(weight, size = n, replace = T) # draw resample
weight.boot[i] <- mean(samp) # compute mean and store it as my.boot
}
Shape
1
resamples<-as.data.frame(weight.boot)
ggplot(data=resamples, mapping=aes(x=weight.boot)) +
geom_histogram(binwidth=10, fill="salmon",color="black") +
labs(x="BirthWeight(ingrams)",y="Frequency",title="DistributionofBirthWeight") +
theme_classic()
DistributionofBirthWeight
2500
2000
1500
Frequency
1000
500
3400 3440 3480 3520
BirthWeight(ingrams)
Spread
boot.sample <- resamples %>% summarise(n = n(),
Mean = mean(weight.boot),
SD = sd(weight.boot))
boot.sample
## n Mean SD
## 1 10000 3448.297 15.36148
a <- rbind(ori.sample, boot.sample)
rownames(a) <- c("Original Sample", "Bootstrap Sample")
a %>% format(scientific = F)
## n Mean SD
## Original Sample 1009 3448.260 487.73604
## Bootstrap Sample 10000 3448.297 15.36148
2
Percentile Interval
quantile(weight.boot, c(0.025,0.975))
## 2.5% 97.5%
## 3418.475 3478.640
dat <- read.csv("arsenic.csv", stringsAsFactors = T)
highcontam <- dat %>% filter(Arsenic > 10)
100*(nrow(highcontam)/nrow(dat))
## [1] 57.56458
Shape, Center & Spread for means
set.seed(123)
arsenic <- dat$Arsenic
n.arsenic <- length(arsenic)
N <- 10ˆ4
boot.arsenic <- numeric(n.arsenic)
for (i in 1:N) {
x <- sample(arsenic, size = n.arsenic, replace = T)
boot.arsenic[i] <- mean(x)
}
hist(boot.arsenic)
3
Histogram of boot.arsenic
2000
1500
Frequency
1000
500
0
60 80 100 120 140 160 180 200
boot.arsenic
95% CI
paste("Bootstrap Mean =", round(mean(boot.arsenic),2))
## [1] "Bootstrap Mean = 125.32"
paste("Bootstrap SE =", round(sd(boot.arsenic),2))
## [1] "Bootstrap SE = 18.16"
quantile(boot.arsenic, c(0.025,0.975))
## 2.5% 97.5%
## 92.80887 163.96348
Permutation testing & Bootstrapping
dive <- Diving2017 # data from library(resampledata)
dive
## Name Country Semifinal Final
## 1 CHEONG Jun Hoong Malaysia 325.50 397.50
## 2 SI Yajie China 382.80 396.00
4
## 3 REN Qian China 367.50 391.95
## 4 KIM Mi Rae North Korea 346.00 385.55
## 5 WU Melissa Australia 318.70 370.20
## 6 KIM Kuk Hyang North Korea 360.85 360.00
## 7 ITAHASHI Minami Japan 313.70 357.85
## 8 BENFEITO Meaghan Canada 355.15 331.40
## 9 PAMG Pandelela Malaysia 322.75 322.40
## 10 CHAMANDY Olivia Canada 320.55 307.15
## 11 PARRATTO Jessica USA 322.75 302.35
## 12 MURILLO URREA Carolina Colombia 325.75 283.35
mean(dive$Semifinal)
## [1] 338.5
mean(dive$Final)
## [1] 350.475
mean(dive$Final)-mean(dive$Semifinal)
## [1] 11.975
set.seed(987)
Diff <- dive$Final-dive$Semifinal #differenceoftwoscores
obs <- mean(Diff) #meandifference
N <- 10ˆ5-1
result <- numeric(N)
for(i in 1:N) {
Sign <- sample(c(-1,1),size=12,replace=T) #randomvectorof1'sand-1's
Diff2 <- Diff*Sign #randompairsof(a-b)or(b-a)
result[i] <- mean(Diff2) #meanofthedifference
}
hist(result,col="steelblue")
abline(v=mean(obs),col="red",lty=2)
5
Histogram of result
15000
Frequency
10000
5000
0
−30 −20 −10 0 10 20 30
result
2 *(sum(result >=obs) + 1)/ (N+1) #P-value
## [1] 0.25664
dive.tidy <- dive %>%
select(Semifinal, Final) %>%
pivot_longer(cols=everything(), names_to = "round", values_to = "time")
head(dive.tidy,3)
## # A tibble: 3 x 2
## round time
## <chr> <dbl>
## 1 Semifinal 326.
## 2 Final 398.
## 3 Semifinal 383.
set.seed(987)
obs.ind <- mean(dive$Final)- mean(dive$Semifinal)
time <- dive.tidy$time
N.ind <- 10ˆ5
result.ind <- numeric(N.ind)
6
for (i in 1:N.ind) {
index.ind <- sample(length(time), size = 0.5*length(time), replace = F)
result.ind[i] <- mean(time[-index.ind])- mean(time[index.ind])
}
hist(result.ind, col = "steelblue")
abline(v=mean(obs.ind), col = "red", lty = 2)
Histogram of result.ind
12000
Frequency
8000
4000
0
−40 −20 0 20 40
result.ind
2*(sum(result.ind >= obs.ind) + 1) / (N+1)
## [1] 0.37566
Bootstrap
dive.boot <- dive %>% mutate(difference=dive$Final-dive$Semifinal)
head(dive.boot)
## Name Country Semifinal Final difference
## 1 CHEONG Jun Hoong Malaysia 325.50 397.50 72.00
## 2 SI Yajie China 382.80 396.00 13.20
## 3 REN Qian China 367.50 391.95 24.45
## 4 KIM Mi Rae North Korea 346.00 385.55 39.55
## 5 WU Melissa Australia 318.70 370.20 51.50
## 6 KIM Kuk Hyang North Korea 360.85 360.00 -0.85
7
set.seed(987)
N.boot<-10ˆ5
result.boot<-numeric(N.boot)
dif<-dive.boot$difference
for(i in 1:N.boot) {
x.boot <-sample(dif,size= length(dif),replace=T)
result.boot[i]<-mean(x.boot)
}
hist(result.boot)
Histogram of result.boot
10000 15000 20000
Frequency
5000
0
−20 0 20 40
result.boot
95% CI
quantile(result.boot, c(0.025,0.975))
## 2.5% 97.5%
## -6.61250 31.04583
Since 0 is part of the interval, we cannot conclude that the mean scores for divers differ between the semifinal
and final rounds.
Quiz 3 #“‘{r} #quiz3 <- read.csv(” .csv”)
#“‘