Probability and Statistics
INSTALL NECESSARY LIBRARIES
# Load the packages used by this analysis.
# ggplot2 is installed only when it is missing, so re-running the script
# does not reinstall the package (or need network access) every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# caret provides confusionMatrix(), used later to evaluate the models.
library(caret)
DATA READING & PRE-PROCESSING
# Read the water potability data set.
# The path is kept on a single line: a line break inside the quoted string
# would become part of the file name and the file would not be found.
water_quality <- read.csv(
  "C:/Probability and Statistics/Project/water_potability.csv"
)
head(water_quality)

# Treat Potability as a categorical variable (factor) instead of a number.
water_quality$Potability <- as.factor(water_quality$Potability)

# Number of observations in each Potability class.
table(water_quality$Potability)

# Number of missing values in each column.
colSums(is.na(water_quality))

# Replace missing values with the overall mean of the column.
# na.rm = TRUE is required, otherwise the mean itself would be NA.
water_quality$ph[is.na(water_quality$ph)] <-
  mean(water_quality$ph, na.rm = TRUE)
water_quality$Sulfate[is.na(water_quality$Sulfate)] <-
  mean(water_quality$Sulfate, na.rm = TRUE)
water_quality$Trihalomethanes[is.na(water_quality$Trihalomethanes)] <-
  mean(water_quality$Trihalomethanes, na.rm = TRUE)
# Replace missing values with the mean of the matching Potability class.
# Means of class 0 are stored in the *_np variables, means of class 1 in
# the *_p variables.
# NOTE(review): if missing values in ph, Sulfate and Trihalomethanes were
# already replaced by the overall column mean, no NA remains and the
# assignments below change nothing -- run only one of the two strategies.
# NOTE(review): these means depend on the Potability label and are computed
# on the whole data set, before any train/test split -- confirm that this
# is intended.

# Row masks for the two classes (Potability is not modified in this section).
is_class_0 <- water_quality$Potability == 0
is_class_1 <- water_quality$Potability == 1

# pH
ph_np <- mean(water_quality[is_class_0, "ph"], na.rm = TRUE)
ph_p <- mean(water_quality[is_class_1, "ph"], na.rm = TRUE)
water_quality[is_class_0 & is.na(water_quality$ph), "ph"] <- ph_np
water_quality[is_class_1 & is.na(water_quality$ph), "ph"] <- ph_p

# Sulfate
Sulfate_np <- mean(water_quality[is_class_0, "Sulfate"], na.rm = TRUE)
Sulfate_p <- mean(water_quality[is_class_1, "Sulfate"], na.rm = TRUE)
water_quality[is_class_0 & is.na(water_quality$Sulfate), "Sulfate"] <-
  Sulfate_np
water_quality[is_class_1 & is.na(water_quality$Sulfate), "Sulfate"] <-
  Sulfate_p

# Trihalomethanes
Trihalomethanes_np <-
  mean(water_quality[is_class_0, "Trihalomethanes"], na.rm = TRUE)
Trihalomethanes_p <-
  mean(water_quality[is_class_1, "Trihalomethanes"], na.rm = TRUE)
water_quality[
  is_class_0 & is.na(water_quality$Trihalomethanes), "Trihalomethanes"
] <- Trihalomethanes_np
water_quality[
  is_class_1 & is.na(water_quality$Trihalomethanes), "Trihalomethanes"
] <- Trihalomethanes_p
# Shuffle the rows of the data set.
# The seed is set first so that the shuffled order is reproducible.
# NOTE(review): shuffled_water_quality is not used by the split below, which
# samples row indices from water_quality directly -- confirm whether the
# shuffle is still needed.
set.seed(232)
shuffled_water_quality <-
  water_quality[sample(nrow(water_quality)), ]

# Set the seed again so that the split does not depend on whether the
# shuffle above was run.
set.seed(232)

# Split: 80% of the rows (rounded down) go to the training set, the
# remaining rows to the testing set. Indices are sorted to keep row order.
dt <- sort(sample(nrow(water_quality), floor(nrow(water_quality) * 0.8)))
train <- water_quality[dt, ]
test <- water_quality[-dt, ]
# Fit a logistic regression model: Potability explained by all other columns
# of the training set.
log_model <- glm(Potability ~ ., data = train, family = "binomial")

# Display the summary of the model
summary(log_model)

# Use the model to predict probabilities on the testing set
predicted_probability <- predict(
  log_model,
  newdata = test,
  type = "response"
)

# Convert probabilities to binary predictions (0 or 1) with a 0.5 cut-off
predicted_labels <- ifelse(predicted_probability > 0.5, 1, 0)

# The actual labels are in the "Potability" column of the testing set
actual_labels <- test$Potability

# Create a confusion matrix to evaluate model performance
confusion_matrix <- table(
  Actual = actual_labels,
  Predicted = predicted_labels
)

# Calculate accuracy, 95% CI, no-information rate, and p-value.
# The predicted factor is built with the levels of the reference: if every
# prediction falls in one class, as.factor() would yield a single level that
# does not match the reference levels.
actual_factor <- as.factor(actual_labels)
predicted_factor <- factor(predicted_labels, levels = levels(actual_factor))
model_performance <- confusionMatrix(
  data = predicted_factor,
  reference = actual_factor
)

# Display the model performance summary
print(model_performance)
# Load the randomForest package.
# It is installed only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Fit a random forest on the training set with the nine listed predictors
randomforest_model <- randomForest(
  Potability ~ ph + Conductivity + Trihalomethanes + Hardness + Solids +
    Chloramines + Sulfate + Turbidity + Organic_carbon,
  data = train
)

# Print the model
print(randomforest_model)

# Predict the class of each row of the testing set
predicted_randomforest <- predict(randomforest_model, newdata = test)

# Cross-tabulate predictions (rows) against actual labels (columns)
confusion_matrix2 <- table(predicted_randomforest, test$Potability)
print(confusion_matrix2)

# Print the model performance
model_performance_randomforest <- confusionMatrix(
  data = as.factor(predicted_randomforest),
  reference = as.factor(test$Potability)
)
print(model_performance_randomforest)
Đánh giá tổng quan:
- Còn thiếu phần 1 Introduction, Kết luận
- Các chương chưa được trình bày theo dạng báo cáo mà chỉ là giải
thích code: Thiếu nhận xét, đánh giá, kết luận
Nhiệm vụ:
- Hoàn thành nội dung theo dạng báo cáo
- Hoàn thành file Giải thích code
- Lưu ý số trang tối đa