# Mindanao State University
# General Santos City
# Exploratory Data Analysis
# Prepared by: Prof. Carlito O. Daarol
# Math Department
# March 16, 2023
# -------------------------------------------------------------------
# Load the file that defines every helper function used in this script.
# Change the location below: it points to a folder on my own laptop.
# -------------------------------------------------------------------
drive <- "D:/"
drive
folder_functions <- "Research/thesisfunctions"
filename <- "fn_More_Correlations.R"
# file.path() inserts the "/" between the folder and the file name
source(file.path(paste0(drive, folder_functions), filename))
# --------------------------------------------------------
# Using function 1: Read the dataset using function call
# --------------------------------------------------------
# Set pointer to the location of my data (do not use the setwd command for
# data retrieval). Every line of this note must start with "#": a wrapped,
# uncommented continuation would be parsed as code and stop the script.
folder_data <- "C:/Users/Admin/Desktop/Class Lectures/BLecture 0 Graphics in R"
filename <- "Cancer.csv"
# readcsv() comes from the sourced functions file; it receives the folder
# and the file name as two separate arguments
data <- readcsv(folder_data, filename)
# Check contents: dimensions, column names, and the first rows
dim(data)
colnames(data)
head(data)
# ------------------------------------------
# Using an R package (DT) to display a large dataset
# Output is visible only if the output format is HTML
# ------------------------------------------
# DT is attached here, after the helper file has already been sourced
library(DT)
datatable(data)
# --------------------------------------------------------
# Using function 3: Compute correlation using two columns
# from the dataset
# --------------------------------------------------------
# `[[` extracts a column by its exact name; `$` would also accept a
# partial match of the name
X <- data[["breastcancer"]]
Y <- data[["co2emissions"]]
corXY <- correlation(X, Y)
corXY
# Result: NA
# This means the correlation cannot be computed because of the
# presence of missing values.
# Omitting the NA values from X and from Y separately is not a good
# solution, because X and Y may end up with different lengths and the
# remaining values would no longer be paired.
# Using function 4: put X and Y into one data frame, then drop whole rows.
# data.frame() keeps each column's own type, whereas
# as.data.frame(cbind(X, Y)) first builds a matrix and forces both
# columns to a common type.
dataXY <- data.frame(X = X, Y = Y)
dim(dataXY)
# keep only the rows with no missing value
dataXY <- na.omit(dataXY)
dim(dataXY)
corXY <- Correcorre(dataXY)
corXY
# Using functions 5 and 2: build two sets of variables from the data
# Look up the variable types first
str(data)
# First set: three columns from the data
Set1 <- data[, c("breastcancer", "alcconsumption", "internetuserate")]
anyNA(Set1)
# Second set: another three columns from the data
Set2 <- data[, c("co2emissions", "employrate", "lifeexpectancy")]
anyNA(Set2)
# Deleting NA values from Set1 and Set2 separately could leave them with
# unequal numbers of rows, so bind them side by side and drop incomplete
# rows from the combined data frame instead
tmpdat <- na.omit(cbind(Set1, Set2))
dim(tmpdat)
# Pull Set1 and Set2 out again, selecting their columns by name
Set1 <- tmpdat[, colnames(Set1)]
Set2 <- tmpdat[, colnames(Set2)]
# Feed the two sets to the 5th function to get the pairwise correlations
Pearsonr <- pairwiseCor(Set1, Set2, "pearson")
Spearmanr <- pairwiseCor(Set1, Set2, "spearman")
Pearsonr
Spearmanr
# The plain tables are not good enough for distribution:
# call function 2 (NiceTable) to enhance their appearance
NiceTable(Pearsonr, "Pearson Correlation Analysis")
NiceTable(Spearmanr, "Spearman Correlation Analysis")
# Using function 6: Compute correlation using only 1 set of data
# tmpdat holds all six selected columns with incomplete rows removed
Pearson1 <- singlesetCor(tmpdat, "pearson")
Spearman1 <- singlesetCor(tmpdat, "spearman")
# Display the unformatted tables
Pearson1
Spearman1
# Display better tables; each title must name the method of its own table
NiceTable(Pearson1, "Pearson Correlation Analysis")
NiceTable(Spearman1, "Spearman Correlation Analysis")
# Using function 7: Correlation coefficients in table format
# Both sets are shown with the same method and the same title
pearson_title <- "Pearson Correlation Coefficients"
CorrsjPlot(Set1, "pearson", pearson_title)
CorrsjPlot(Set2, "pearson", pearson_title)
# Using function 8: Scatter Plot
# Single plot of the first two variables of each set
Set1name <- colnames(Set1)
CorrePlotXY(Set1, Set1name[1], Set1name[2], "blue", "XAxis", "YAxis", "pearson")
Set2name <- colnames(Set2)
CorrePlotXY(Set2, Set2name[1], Set2name[2], "blue", "XAxis", "YAxis", "pearson")

# Draw one scatter plot for every unordered pair of columns of `dat`,
# using the column names as axis labels.
#   dat    : data frame whose columns are plotted against each other
#   color  : colour passed on to CorrePlotXY
#   method : correlation method passed on to CorrePlotXY
# Pairs are visited in the order (1,2), (1,3), ..., (2,3), ...
# Nothing is drawn when `dat` has fewer than two columns; an index loop
# written as 1:(ncol(dat) - 1) would count backwards in that case.
plot_all_pairs <- function(dat, color = "blue", method = "pearson") {
  vars <- colnames(dat)
  if (length(vars) < 2L) {
    return(invisible(NULL))
  }
  for (pair in combn(vars, 2L, simplify = FALSE)) {
    # Explicit print(): results are not auto-printed inside loops/functions
    print(CorrePlotXY(dat, pair[1], pair[2], color, pair[1], pair[2], method))
  }
  invisible(NULL)
}

# Generate all plots for Set 1
plot_all_pairs(Set1)
# Generate all plots for Set 2
plot_all_pairs(Set2)
# Remark: plots created inside a for loop or a function will not appear
# without the print command
# Using function 9: How to verify if the data
# satisfies the normal distribution using the Shapiro-Wilk test
# NOTE(review): no call to function 9 appears in this script -- TODO add it
# Using function 10: How to verify if the data
# satisfies the normal distribution using graphs
# (this section used to appear twice verbatim; it is now run only once)
NiceTable(Set1, "Dataset in wide original format")
# Convert the data to long format first.
# NOTE(review): melt() is not attached by this script; presumably the sourced
# functions file loads the package that provides it -- confirm
data_long <- melt(Set1)
NiceTable(data_long, "Dataset in long format")
QQNormality_Plot(data_long)
# Points must fall inside the confidence band
# for the data to be called normally distributed.
# If not, call the distribution non-normal (skewed)
# Using function 11: call PlotHistDensity on the variables of Set1
# (presumably draws a histogram with a density curve for each variable --
# verify against the definition in the sourced functions file)
PlotHistDensity(Set1)
# Using function 12: For a given set of correlation coefficients, generate the
# corresponding raw data X and Y.
# First run
sampleCor <- c(0.214, 0.4, 0.617, 0.742, 0.851, 0.915)
Simulate_XY_From_Correlations(sampleCor)
# Second run; the second and fourth coefficients differ from the first run
sampleCor <- c(0.214, 0.3, 0.617, 0.76, 0.851, 0.915)
Simulate_XY_From_Correlations(sampleCor)
# View generated data
# NOTE(review): "DatXY.csv" is read from the current working directory and is
# presumably written by Simulate_XY_From_Correlations -- confirm; if so, it
# likely holds only the output of the most recent call
gendata <- read.csv("DatXY.csv")
NiceTable(gendata,"Generated Datasets")