0% found this document useful (0 votes)
8 views8 pages

Week 3 Lecture

The document discusses statistical methods applied to datasets, including bootstrapping and permutation testing, using R programming with libraries such as tidyverse and readxl. It presents analyses on birth weights and arsenic levels, showcasing original and bootstrap sample statistics, histograms, and confidence intervals. Additionally, it examines diving scores to assess differences between semifinal and final rounds, concluding that the mean scores do not significantly differ.

Uploaded by

christelsiaw
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
8 views8 pages

Week 3 Lecture

The document discusses statistical methods applied to datasets, including bootstrapping and permutation testing, using R programming with libraries such as tidyverse and readxl. It presents analyses on birth weights and arsenic levels, showcasing original and bootstrap sample statistics, histograms, and confidence intervals. Additionally, it examines diving scores to assess differences between semifinal and final rounds, concluding that the mean scores do not significantly differ.

Uploaded by

christelsiaw
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

week 3 lecture

2025-08-25

library(tidyverse)
library(magrittr) #pipping operator %>%
library(tinytex) #knit to pdf
library(readxl) #read excel files
library(resampledata) #data sets from Chihara and Hesterberg's book

df <- read.csv("birthweight.csv", stringsAsFactors = T)


head(df)

## id age tobacco alcohol gender weight gestation smoker


## 1 1 30-34 No No Male 3827 40 No
## 2 2 30-34 No No Male 3629 38 No
## 3 3 35-39 No No Female 3062 37 No
## 4 4 20-24 No No Female 3430 39 No
## 5 5 25-29 No No Male 3827 38 No
## 6 6 35-39 No No Female 3119 39 No

ori.sample <- df %>% summarise(n = n(),


Mean = mean(weight),
SD = sd(weight))

ori.sample

## n Mean SD
## 1 1009 3448.26 487.736

set.seed(123)

weight <- df$weight


n <- length(weight)

N <- 10ˆ4
weight.boot <- numeric(N)

for (i in 1:N) {
samp <- sample(weight, size = n, replace = T) # draw resample
weight.boot[i] <- mean(samp) # compute mean and store it as my.boot
}

Shape

1
resamples<-as.data.frame(weight.boot)

ggplot(data=resamples, mapping=aes(x=weight.boot)) +
geom_histogram(binwidth=10, fill="salmon",color="black") +
labs(x="BirthWeight(ingrams)",y="Frequency",title="DistributionofBirthWeight") +
theme_classic()

DistributionofBirthWeight

2500

2000

1500
Frequency

1000

500

3400 3440 3480 3520


BirthWeight(ingrams)

Spread

boot.sample <- resamples %>% summarise(n = n(),


Mean = mean(weight.boot),
SD = sd(weight.boot))
boot.sample

## n Mean SD
## 1 10000 3448.297 15.36148

a <- rbind(ori.sample, boot.sample)


rownames(a) <- c("Original Sample", "Bootstrap Sample")
a %>% format(scientific = F)

## n Mean SD
## Original Sample 1009 3448.260 487.73604
## Bootstrap Sample 10000 3448.297 15.36148

2
Percentile Interval

quantile(weight.boot, c(0.025,0.975))

## 2.5% 97.5%
## 3418.475 3478.640

dat <- read.csv("arsenic.csv", stringsAsFactors = T)


highcontam <- dat %>% filter(Arsenic > 10)
100*(nrow(highcontam)/nrow(dat))

## [1] 57.56458

Shape, Center & Spread for means

set.seed(123)

arsenic <- dat$Arsenic


n.arsenic <- length(arsenic)

N <- 10ˆ4
boot.arsenic <- numeric(n.arsenic)

for (i in 1:N) {
x <- sample(arsenic, size = n.arsenic, replace = T)
boot.arsenic[i] <- mean(x)
}

hist(boot.arsenic)

3
Histogram of boot.arsenic
2000
1500
Frequency

1000
500
0

60 80 100 120 140 160 180 200

boot.arsenic

95% CI

paste("Bootstrap Mean =", round(mean(boot.arsenic),2))

## [1] "Bootstrap Mean = 125.32"

paste("Bootstrap SE =", round(sd(boot.arsenic),2))

## [1] "Bootstrap SE = 18.16"

quantile(boot.arsenic, c(0.025,0.975))

## 2.5% 97.5%
## 92.80887 163.96348

Permutation testing & Bootstrapping

dive <- Diving2017 # data from library(resampledata)


dive

## Name Country Semifinal Final


## 1 CHEONG Jun Hoong Malaysia 325.50 397.50
## 2 SI Yajie China 382.80 396.00

4
## 3 REN Qian China 367.50 391.95
## 4 KIM Mi Rae North Korea 346.00 385.55
## 5 WU Melissa Australia 318.70 370.20
## 6 KIM Kuk Hyang North Korea 360.85 360.00
## 7 ITAHASHI Minami Japan 313.70 357.85
## 8 BENFEITO Meaghan Canada 355.15 331.40
## 9 PAMG Pandelela Malaysia 322.75 322.40
## 10 CHAMANDY Olivia Canada 320.55 307.15
## 11 PARRATTO Jessica USA 322.75 302.35
## 12 MURILLO URREA Carolina Colombia 325.75 283.35

mean(dive$Semifinal)

## [1] 338.5

mean(dive$Final)

## [1] 350.475

mean(dive$Final)-mean(dive$Semifinal)

## [1] 11.975

set.seed(987)

Diff <- dive$Final-dive$Semifinal #differenceoftwoscores


obs <- mean(Diff) #meandifference

N <- 10ˆ5-1
result <- numeric(N)
for(i in 1:N) {
Sign <- sample(c(-1,1),size=12,replace=T) #randomvectorof1'sand-1's
Diff2 <- Diff*Sign #randompairsof(a-b)or(b-a)
result[i] <- mean(Diff2) #meanofthedifference
}

hist(result,col="steelblue")
abline(v=mean(obs),col="red",lty=2)

5
Histogram of result
15000
Frequency

10000
5000
0

−30 −20 −10 0 10 20 30

result

2 *(sum(result >=obs) + 1)/ (N+1) #P-value

## [1] 0.25664

dive.tidy <- dive %>%


select(Semifinal, Final) %>%
pivot_longer(cols=everything(), names_to = "round", values_to = "time")

head(dive.tidy,3)

## # A tibble: 3 x 2
## round time
## <chr> <dbl>
## 1 Semifinal 326.
## 2 Final 398.
## 3 Semifinal 383.

set.seed(987)

obs.ind <- mean(dive$Final)- mean(dive$Semifinal)


time <- dive.tidy$time
N.ind <- 10ˆ5
result.ind <- numeric(N.ind)

6
for (i in 1:N.ind) {
index.ind <- sample(length(time), size = 0.5*length(time), replace = F)
result.ind[i] <- mean(time[-index.ind])- mean(time[index.ind])
}

hist(result.ind, col = "steelblue")


abline(v=mean(obs.ind), col = "red", lty = 2)

Histogram of result.ind
12000
Frequency

8000
4000
0

−40 −20 0 20 40

result.ind

2*(sum(result.ind >= obs.ind) + 1) / (N+1)

## [1] 0.37566

Bootstrap

dive.boot <- dive %>% mutate(difference=dive$Final-dive$Semifinal)


head(dive.boot)

## Name Country Semifinal Final difference


## 1 CHEONG Jun Hoong Malaysia 325.50 397.50 72.00
## 2 SI Yajie China 382.80 396.00 13.20
## 3 REN Qian China 367.50 391.95 24.45
## 4 KIM Mi Rae North Korea 346.00 385.55 39.55
## 5 WU Melissa Australia 318.70 370.20 51.50
## 6 KIM Kuk Hyang North Korea 360.85 360.00 -0.85

7
set.seed(987)

N.boot<-10ˆ5
result.boot<-numeric(N.boot)
dif<-dive.boot$difference

for(i in 1:N.boot) {
x.boot <-sample(dif,size= length(dif),replace=T)
result.boot[i]<-mean(x.boot)
}

hist(result.boot)

Histogram of result.boot
10000 15000 20000
Frequency

5000
0

−20 0 20 40

result.boot

95% CI

quantile(result.boot, c(0.025,0.975))

## 2.5% 97.5%
## -6.61250 31.04583

Since 0 is part of the interval, we cannot conclude that the mean scores for divers differ between the semifinal
and final rounds.
Quiz 3 #“‘{r} #quiz3 <- read.csv(” .csv”)
#“‘

You might also like