# Load the airline review data set, then list its column names and structure.
# NOTE(review): the reader call was garbled in this file; read.csv() is
# assumed from the ".csv" extension -- confirm against the original script.
Data <- read.csv("C:/Users/admin/Downloads/airline.csv")
names(Data)
str(Data)
# ---- Impute missing values with the median of each variable ----
# Number of missing values per column, before imputation.
colSums(is.na(Data))

# Rating columns whose missing values are replaced by the column median.
impute_cols <- c(
  "overall_rating",
  "seat_comfort_rating",
  "cabin_staff_rating",
  "food_beverages_rating",
  "inflight_entertainment_rating",
  "value_money_rating"
)
Data[impute_cols] <- lapply(Data[impute_cols], function(x) {
  # The median is computed over the observed (non-missing) values only.
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
})

# Disabled in the original script -- these columns are left un-imputed:
# Data$author_country[is.na(Data$author_country)] <-
#   median(Data$author_country, na.rm = TRUE)
# Data$ground_service_rating[is.na(Data$ground_service_rating)] <-
#   median(Data$ground_service_rating, na.rm = TRUE)
# Data$wifi_connectivity_rating[is.na(Data$wifi_connectivity_rating)] <-
#   median(Data$wifi_connectivity_rating, na.rm = TRUE)
# ---- Outlier check ----
# Each call draws a box plot of one rating column; the `out` component of
# the value returned by boxplot() holds the points plotted as outliers.
boxplot(Data[["overall_rating"]])$out
boxplot(Data[["seat_comfort_rating"]])$out
boxplot(Data[["cabin_staff_rating"]])$out
boxplot(Data[["food_beverages_rating"]])$out
boxplot(Data[["inflight_entertainment_rating"]])$out
# Disabled:
# boxplot(Data[["ground_service_rating"]])$out
# boxplot(Data[["wifi_connectivity_rating"]])$out
boxplot(Data[["value_money_rating"]])$out
# ---- Drop the wifi and ground-service rating columns ----
# Original note: wifi rating and ground service have outliers, so both
# columns are dropped.
df <- subset(
  Data,
  select = -c(ground_service_rating, wifi_connectivity_rating)
)

# ---- Drop reviews with no author country ----
# Mark blank country values as missing. Comparing the values themselves
# works for both character and factor columns and does not rely on the
# blank entry being the first factor level.
blank_country <- !is.na(df$author_country) &
  as.character(df$author_country) == ""
df$author_country[blank_country] <- NA
if (is.factor(df$author_country)) {
  # Remove the now-unused blank level so it cannot appear as a group.
  df$author_country <- droplevels(df$author_country)
}
# Keep only rows with a known country. Filtering on this one column avoids
# na.omit() on a data frame, which drops rows with an NA in *any* column.
df <- df[!is.na(df$author_country), , drop = FALSE]
# ---- Correlation among the variables ----
# NOTE(review): columns are selected by position (11 to 17) -- confirm that
# these are the numeric rating columns of `df`.
round(cor(df[, 11:17]), digits = 2) # rounded to 2 decimals

# overall_rating and value_money_rating are correlated, so overall_rating is
# dropped from the model.

# Mean overall rating per airline. The dplyr calls are namespace-qualified
# (and the pipe is avoided) because dplyr is only attached further down the
# script.
pos <- dplyr::summarise(
  dplyr::group_by(df, airline_name),
  AVG = mean(overall_rating)
)
# Linear regression of `recommended` on traveller type, cabin class and the
# rating variables.
model1 <- lm(
  recommended ~ type_traveller + cabin_flown + seat_comfort_rating +
    cabin_staff_rating + food_beverages_rating +
    inflight_entertainment_rating + value_money_rating,
  data = df
)
summary(model1)

# Logistic regression (binomial family, logit link) with the same predictors.
model2 <- glm(
  recommended ~ type_traveller + cabin_flown + seat_comfort_rating +
    cabin_staff_rating + food_beverages_rating +
    inflight_entertainment_rating + value_money_rating,
  family = binomial(link = "logit"),
  data = df
)
summary(model2)
# ---- Ans1 ----
# Both models suggest that value_money_rating and cabin_staff_rating are of
# utmost importance to the customers.

# ---- Ans2: country-specific decision-making variables ----
library(dplyr)
# Fit one linear model per author country. do() stores each fitted model in
# the list-column `model`, one row per country.
fitted_models <- dplyr::group_by(df, author_country) %>%
  dplyr::do(
    model = lm(
      recommended ~ seat_comfort_rating + cabin_staff_rating +
        food_beverages_rating + inflight_entertainment_rating +
        value_money_rating,
      data = .
    )
  )
# Open the list of fitted models in the data viewer.
View(fitted_models$model)
# ---- Ans3: country-specific decision-making variables for Lufthansa ----
# Mean of each rating per author country, using Lufthansa reviews only.
# Missing ratings are ignored when averaging.
pos3 <- df %>%
  filter(airline_name == "lufthansa") %>%
  group_by(author_country) %>%
  summarise(
    AVG = mean(overall_rating, na.rm = TRUE),
    CSR = mean(cabin_staff_rating, na.rm = TRUE),
    SCR = mean(seat_comfort_rating, na.rm = TRUE),
    IER = mean(inflight_entertainment_rating, na.rm = TRUE),
    VMR = mean(value_money_rating, na.rm = TRUE),
    FBR = mean(food_beverages_rating, na.rm = TRUE)
  )
# For example, for Ireland the overall average is 4 but FBR is only 1; hence
# Lufthansa should focus on food_beverages_rating in Ireland.