Data Analytics Programs

The document contains multiple R programming scripts for various data analysis tasks including numerical operations, statistical operations, linear regression, web scraping, matrix operations, data pre-processing, PCA, decision tree classification and regression, movie recommendations, time series analysis, data visualization, and K-means clustering. Each section provides code snippets for performing specific tasks such as calculating statistics, building models, visualizing data, and handling datasets. The document serves as a comprehensive guide for executing common data analysis techniques in R.

1. NUMERICAL OPERATIONS
input <- readline(prompt="Enter numbers separated by spaces:")
numbers <- as.numeric(unlist(strsplit(input, " ")))
max_value <- max(numbers)
print(paste("Maximum:", max_value))
min_value <- min(numbers)
print(paste("Minimum:", min_value))
avg_value <- mean(numbers)
print(paste("Average:", avg_value))
sum_value <- sum(numbers)
print(paste("Sum:", sum_value))
sqrt_values <- sqrt(numbers)
print("Square roots")
print(sqrt_values)
rounded_values <- round(numbers, 2)
print("Rounded values")
print(rounded_values)
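If the input contains a non-numeric token, as.numeric() turns it into NA and every statistic above becomes NA. A minimal guard, offered as an optional addition rather than part of the original program:
# Drop NA entries produced by non-numeric tokens before computing statistics
numbers <- numbers[!is.na(numbers)]
Alternatively, pass na.rm = TRUE to max(), min(), mean() and sum().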

2. STAT OPERATIONS
# Read numeric vector from user
cat("Enter the elements:\n")
vector_elements <- as.numeric(unlist(strsplit(readline(), " ")))
# Calculate mean
mean_value <- mean(vector_elements)
cat("Mean:\n")
print(mean_value)
# Calculate median
median_value <- median(vector_elements)
cat("Median:\n")
print(median_value)
# Define the mode helper before its first use (base R has no built-in mode function)
calculate_mode <- function(x) {
uniqx <- unique(x)
uniqx[which.max(tabulate(match(x, uniqx)))]}
# Calculate mode
mode_value <- calculate_mode(vector_elements)
cat("Mode:\n")
print(mode_value)
# Calculate standard deviation
sd_value <- sd(vector_elements)
cat("Standard deviation:\n")
print(sd_value)
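A sample run, assuming the elements 4 2 2 7 5 are entered: mean 4, median 4, mode 2, and standard deviation about 2.12 (sd() uses the sample formula with an n - 1 denominator).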

3. SIMPLE LINEAR REGRESSION
data <- read.csv("C:/Users/suhan/Downloads/SimpleLinearRegressionData.csv")
print(data)
model <- lm(y ~ x, data = data)
summary(model)
plot(data$x, data$y, main = "Linear Regression of x and y", xlab = "x", ylab = "y", pch = 19)
abline(model, col = "blue")
IN CSV:
x→(3,7,9,6,2)
y→(1,9,8,5,4)
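Once fitted, the model can score new x values with predict(); a short sketch, assuming the model object above (new_points is a hypothetical input):
# Predict y for new x values from the fitted line
new_points <- data.frame(x = c(4, 8))
predict(model, newdata = new_points)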

4. WEB SCRAPING
install.packages('rvest')
library(rvest)
webpage = read_html("https://www.geeksforgeeks.org/web-scraping-using-r-language/")
heading = html_node(webpage, '.entry-title')
text = html_text(heading)
print(text)
paragraph = html_nodes(webpage, 'p')
pText = html_text(paragraph)
print(head(pText))
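Selectors such as '.entry-title' are tied to the page's current HTML and can break when the site changes. The same rvest pattern extracts other elements; a small sketch pulling link URLs from the page already read above:
# Extract all hyperlink URLs from the page
links <- html_nodes(webpage, 'a')
urls <- html_attr(links, 'href')
print(head(urls))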

5. MATRIX OPERATIONS
n <- as.integer(readline(prompt="Enter the number of rows: "))
m <- as.integer(readline(prompt="Enter the number of columns: "))
# Read matrix elements from user
cat("Enter the elements of the first matrix (row-wise), separated by spaces:\n")
matrix_elements_1 <- as.numeric(unlist(strsplit(readline(), " ")))
matrix_1 <- matrix(matrix_elements_1, nrow = n, ncol = m, byrow = TRUE)
cat("Enter the elements of the second matrix (row-wise), separated by spaces:\n")
matrix_elements_2 <- as.numeric(unlist(strsplit(readline(), " ")))
matrix_2 <- matrix(matrix_elements_2, nrow = n, ncol = m, byrow = TRUE)
# Display the matrices
cat("First Matrix:\n")
print(matrix_1)
cat("Second Matrix:\n")
print(matrix_2)
# Matrix Addition
matrix_add <- matrix_1 + matrix_2
cat("Matrix Addition:\n")
print(matrix_add)
# Matrix Subtraction
matrix_sub <- matrix_1 - matrix_2
cat("Matrix Subtraction:\n")
print(matrix_sub)
# Matrix Multiplication
# %*% needs the first matrix's column count to equal the second's row count;
# for two n x m matrices this holds only when n == m
if (n == m) {
matrix_mul <- matrix_1 %*% matrix_2
cat("Matrix Multiplication:\n")
print(matrix_mul)} else {
cat("Matrix Multiplication:\nNot defined: the matrices are not conformable.\n")}
# Matrix Inverse
if (n == m && det(matrix_1) != 0) {
matrix_inv <- solve(matrix_1)
cat("Matrix Inverse:\n")
print(matrix_inv)} else {
cat("Matrix Inverse:\nThe matrix is not square or is singular, so the inverse does not exist.\n")}
# Matrix Transpose
matrix_transpose <- t(matrix_1)
cat("Matrix Transpose:\n")
print(matrix_transpose)
# Matrix Division
# For matrix division, we'll assume element-wise division (Hadamard division)
matrix_div <- matrix_1 / matrix_2
cat("Matrix Division (element-wise):\n")
print(matrix_div)
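solve() also solves linear systems directly without forming the inverse; a brief sketch, assuming matrix_1 is square and non-singular (b is a hypothetical right-hand side):
# Solve matrix_1 %*% x = b for x
b <- rep(1, n)
x <- solve(matrix_1, b)
print(x)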

6. DATA PRE-PROCESSING
install.packages("readr")
library(readr)
# Read data from CSV file
iris_data <- read_csv("C:/Users/suhan/Downloads/archive/Iris.csv")
# Display first few rows of dataset
cat("Original Iris dataset")
head(iris_data)
# Check structure of dataset
str(iris_data)
# Summary
summary(iris_data)
# Check for missing values
cat("Checking for missing values:")
columns <- colSums(is.na(iris_data))
print(columns)
# Remove rows with missing values
data_no_na <- na.omit(iris_data)
cat("Data after removing rows with missing values:\n")
print(data_no_na)
# Normalization using min-max scaling
min_max_normalise <- function(x) {
return((x - min(x)) / (max(x) - min(x)))}
# Apply normalization to the numeric feature columns
# (Kaggle's Iris.csv begins with an Id column; adjust the indices if your file differs)
iris_normalized <- iris_data
iris_normalized[,1:4] <- sapply(iris_data[,1:4], min_max_normalise)
cat("Dataset after min-max normalization:\n")
head(iris_normalized)
# Encode the categorical Species column as integer labels
iris_encoded <- cbind(iris_normalized[,1:4], Species = as.integer(factor(iris_data$Species)))
# Inspect encoded data
cat("Inspecting encoded data:\n")
data <- iris_encoded
print(data)
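If a model needs indicator (one-hot) columns rather than integer labels, base R's model.matrix() can build them; a small sketch under that assumption:
# One-hot encode Species: one 0/1 column per level, no intercept column
onehot <- model.matrix(~ Species - 1, data = iris_data)
head(onehot)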

7. PCA FOR HOUSING DATA


install.packages("ggplot2")
install.packages("caret")
library(ggplot2)
library(caret)
housing_data <- read.csv("C:/Users/suhan/Downloads/dd/Housing.csv") # read but not used below
# A small in-line dataset is analyzed instead of the CSV
data <- data.frame(
Price = c(300000, 450000, 500000, 600000, 700000),
SqFt = c(1500, 2000, 1800, 2200, 2500),
Bedrooms = c(3, 4, 3, 5, 4),
Bathrooms = c(2, 3, 2, 3, 4))
# View the first few rows of the dataset
cat("Original Data:\n")
print(head(data))
# Remove non-numeric columns if any
data_numeric <- data[, sapply(data, is.numeric)]
# Standardize the data (important for PCA)
data_scaled <- scale(data_numeric)
# Perform PCA (center/scale. are redundant on already-standardized data, but harmless)
pca_result <- prcomp(data_scaled, center = TRUE, scale. = TRUE)
# Summary of PCA results
cat("PCA Summary:\n")
print(summary(pca_result))
# Print principal components
cat("Principal Components:\n")
print(pca_result$rotation)
# Print the scores of the principal components
cat("PCA Scores:\n")
print(pca_result$x)
# Plotting the PCA results (optional)
# Scree plot to visualize variance explained by each principal component
screeplot(pca_result, main="Scree Plot")
# Biplot to visualize the principal components
biplot(pca_result, main="PCA Biplot")
# Generate Scree Plot for the First Two Principal Components
# Calculate percentage of variance explained by each component
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2) * 100
barplot(var_explained[1:2],
main = "Scree Plot (First Two Principal Components)",
xlab = "Principal Component",
ylab = "Percentage of Variance Explained",
names.arg = c("PC1", "PC2"),
col = "steelblue")
# Generate Biplot for the First Two Principal Components
biplot(pca_result,
choices = c(1,2), # Select first two principal components
main = "Biplot of the First Two Principal Components",
col = c("red", "blue")) # Different colors for points and vectors
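New observations can be projected onto the fitted components, provided they are standardized with the same means and standard deviations as the training data; a sketch with a hypothetical new house:
# Project a hypothetical new observation onto the principal components
new_house <- data.frame(Price = 550000, SqFt = 2100, Bedrooms = 4, Bathrooms = 3)
centers <- attr(data_scaled, "scaled:center")
sds <- attr(data_scaled, "scaled:scale")
new_scaled <- scale(new_house, center = centers, scale = sds)
predict(pca_result, newdata = new_scaled)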

8. DECISION TREE CLASSIFICATION


install.packages("caret")
install.packages("rpart.plot")
library(caret)
library(rpart.plot)
irisdata <- datasets::iris
class <- irisdata$Species
set.seed(3033)
inTrain <- createDataPartition(y=class, p=0.8, list=FALSE)
training <- irisdata[inTrain,]
testing <- irisdata[-inTrain,]
class_test <- class[-inTrain]
dim(training)
dim(testing)
fit.tree <- rpart(Species ~ ., data = training, method = "class", parms = list(split = "information"))
fit.tree
rpart.plot(fit.tree)
pred.tree <- predict(fit.tree, testing, type="class")
table(pred.tree, class_test)
printcp(fit.tree)
pruned.tree <- prune(fit.tree, cp = 0.5) # such a large cp collapses the tree to very few splits
rpart.plot(pruned.tree, box.palette = "Blues")
pred.pruned <- predict(pruned.tree, testing, type = "class")
table(pred.pruned, class_test)
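Beyond the raw contingency tables, caret's confusionMatrix() reports accuracy and per-class statistics; a one-line addition using the unpruned predictions:
# Accuracy, kappa, and per-class sensitivity/specificity
confusionMatrix(pred.tree, class_test)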

9. DECISION TREE REGRESSION


install.packages("rpart")
install.packages("rpart.plot")
library(rpart)
library(rpart.plot)
library(ISLR) # the same Carseats data is also available as ISLR::Carseats
carseats_data <- read.csv("C:/Users/suhan/Downloads/Carseats.csv")
head(carseats_data)
# Build the tree model using rpart
tree.model <- rpart(Sales ~ Price + Advertising + ShelveLoc, data = carseats_data, method = "anova")
summary(tree.model)
printcp(tree.model)
# Plot the decision tree
rpart.plot(tree.model, type=3, extra=101, fallen.leaves=TRUE)
# Prepare new data and predict
new_data <- data.frame(Price=120, Advertising=10, ShelveLoc="Good")
predicted_sales <- predict(tree.model, new_data)
print(predicted_sales)
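The cp table from printcp() can guide pruning; a short sketch, assuming tree.model from this section:
# Plot cross-validated error against cp, then prune at the cp with the lowest xerror
plotcp(tree.model)
best_cp <- tree.model$cptable[which.min(tree.model$cptable[, "xerror"]), "CP"]
pruned.model <- prune(tree.model, cp = best_cp)
rpart.plot(pruned.model)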

10. MOVIE RECOMMENDATION


install.packages("recommenderlab")
install.packages("ggplot2")
library(recommenderlab)
library(ggplot2)
data("MovieLense")
ml_data <- MovieLense
print(ml_data)
summary(ml_data)
cat("Sparsity of the data:", sum(is.na(as(ml_data, "matrix"))) / prod(dim(ml_data)), "\n")
hist(getRatings(ml_data), main = "Rating Distribution", col = "blue", xlab = "Ratings", ylab = "Frequency")
set.seed(123)
split <- sample(x = c(TRUE, FALSE), size = nrow(ml_data), replace = TRUE, prob = c(0.8, 0.2))
train_data <- ml_data[split, ]
test_data <- ml_data[!split, ]
recommender <- Recommender(train_data, method = "UBCF")
predictions <- predict(recommender, newdata = test_data, n = 5)
cat("Predicted Recommendations:\n")
print(as(predictions, "list"))
evaluation_scheme <- evaluationScheme(ml_data, method = "split", train = 0.8, given = 10, goodRating = 4)
results <- evaluate(evaluation_scheme, method = "UBCF", type = "topNList", n = c(1, 3, 5))
plot(results, annotate = TRUE, main = "ROC Curve")
top_recommendations <- as(predictions, "list")
recommendation_df <- data.frame(
User = rep(names(top_recommendations), lengths(top_recommendations)),
Movie = unlist(top_recommendations))
print("Top Recommendations Data Frame:")
print(recommendation_df)
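Rating-prediction error can also be measured against the held-out portion of the evaluation scheme; a hedged sketch reusing evaluation_scheme:
# Train on the scheme's training split and score predicted ratings on the unknown items
rec_eval <- Recommender(getData(evaluation_scheme, "train"), method = "UBCF")
rating_pred <- predict(rec_eval, getData(evaluation_scheme, "known"), type = "ratings")
calcPredictionAccuracy(rating_pred, getData(evaluation_scheme, "unknown"))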

11. TIME SERIES ARIMA


library(zoo)
library(ggplot2)
library(forecast)
library(Metrics)
data <- c(10,23,36,40,51,63,72,84,93,102,113,125)
series_ts = ts(data)
test_set <- c(85,92)
nval = length(test_set)
ar_model <- Arima(series_ts, order = c(1, 0, 0))
ma_model <- Arima(series_ts, order = c(0, 0, 1))
arma_model <- Arima(series_ts, order = c(1, 0, 1))
arima_model <- Arima(series_ts, order = c(1, 1, 1))
auto_arima_no_season_model <- auto.arima(series_ts, seasonal = FALSE)
auto_arima_season_model <- auto.arima(series_ts, seasonal = TRUE)
ar_forecasts <- forecast(ar_model, h = nval)
ma_forecasts <- forecast(ma_model, h = nval)
arma_forecasts <- forecast(arma_model, h = nval)
arima_forecasts <- forecast(arima_model, h = nval)
auto_arima_no_season_forecasts <- forecast(auto_arima_no_season_model, h = nval)
auto_arima_season_forecasts <- forecast(auto_arima_season_model, h = nval)
arima_forecast_df <- data.frame(
AR = ar_forecasts$mean,
MA = ma_forecasts$mean,
ARMA = arma_forecasts$mean,
ARIMA = arima_forecasts$mean,
AutoARIMANoSeason = auto_arima_no_season_forecasts$mean,
AutoARIMASeason = auto_arima_season_forecasts$mean)
arima_forecast_df
mae(test_set,ar_forecasts$mean)
mape(test_set,ar_forecasts$mean)
rmse(test_set,ar_forecasts$mean)
ar_forecasts$model$coef
#ar_forecasts$mean
summary(arima_forecasts$model)
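The forecast package can score and plot a forecast object directly; a short sketch, assuming the objects above:
# Training- and test-set error measures in one call
accuracy(arima_forecasts, test_set)
# Plot the ARIMA forecast with its prediction intervals (ggplot2-based)
autoplot(arima_forecasts)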

12. DATA VISUALIZATION


#Histograms
# load the data
data(iris)
# create histograms for each attribute
par(mfrow=c(1,4))
for(i in 1:4) {
hist(iris[,i], main=names(iris)[i])}
#Density Plots
# load packages
library(lattice)
# load dataset
data(iris)
# create a panel of simpler density plots by attribute
par(mfrow=c(1,4))
for(i in 1:4) {
plot(density(iris[,i]), main=names(iris)[i])}
#Box And Whisker Plots
data(iris)
# Create separate boxplots for each attribute
par(mfrow=c(1,4))
for(i in 1:4) {
boxplot(iris[,i], main=names(iris)[i])}
#Bar Plots
data(iris)
# Create separate bar plots for each attribute (one bar per observation)
par(mfrow=c(1,4))
for(i in 1:4) {
barplot(iris[,i], main=names(iris)[i])}
#Missing Plot
install.packages("Amelia")
library(Amelia)
# create a missing map
missmap(iris, col=c("black", "grey"), legend=FALSE)
#Multivariate Visualization
#Correlation Plot
install.packages("corrplot")
library(corrplot)
# calculate correlations
correlations <- cor(iris[,1:4])
# create correlation plot
corrplot(correlations, method="circle")
#Scatterplot Matrix
# pair-wise scatterplots of all 4 attributes
pairs(iris)
#Scatterplot Matrix By Class
# pair-wise scatterplots colored by class
pairs(Species~., data=iris, col=iris$Species)
#Density Plots By Class
install.packages("caret")
# load the package
library(caret)
x <- iris[,1:4]
y <- iris[,5]
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x, y=y, plot="density", scales=scales)
#Box And Whisker Plots By Class
# box and whisker plots for each attribute by class value
# load the package
library(caret)
x <- iris[,1:4]
y <- iris[,5]
featurePlot(x=x, y=y, plot="box")

13. K-MEANS
# Define the data points
data <- data.frame(
x = c(2, 2, 8, 5, 7, 6, 1, 4),
y = c(10, 5, 4, 8, 5, 4, 2, 9))
rownames(data) <- c("A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2")
# Initial cluster centers (A1, B1, C1)
initial_centers <- matrix(
c(2, 10, # A1
5, 8, # B1
1, 2), # C1
nrow = 3,
byrow = TRUE)
# Perform K-Means with given initial centers
set.seed(123) # For reproducibility
kmeans_result <- kmeans(data, centers = initial_centers, iter.max = 10, nstart = 1)
# Output the results
cat("Cluster Assignments:\n")
print(kmeans_result$cluster)
cat("\nCluster Centers:\n")
print(kmeans_result$centers)
# Visualize the clusters
library(ggplot2)
data$Cluster <- factor(kmeans_result$cluster)
ggplot(data, aes(x = x, y = y, color = Cluster)) +
geom_point(size = 4) +
geom_point(data = as.data.frame(kmeans_result$centers),
aes(x = x, y = y, color = factor(1:3)),
shape = 4, size = 6, stroke = 2) +
labs(title = "K-Means Clustering", x = "X", y = "Y") +
theme_minimal()
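Here the three initial centers are fixed by the exercise; when k is unknown, plotting the total within-cluster sum of squares over a range of k (the elbow method) is a common heuristic. A sketch under that assumption:
# Elbow method: total within-cluster SS for k = 1..6 (exclude the Cluster column added above)
wss <- sapply(1:6, function(k) kmeans(data[, c("x", "y")], centers = k, nstart = 10)$tot.withinss)
plot(1:6, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster SS")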

14. IMPORT/EXPORT
IMPORT
# Importing CSV using base R
data <- read.csv("path/to/your/file.csv")
# Importing CSV using readr
install.packages("readr")
library(readr)
data <- read_csv("path/to/your/file.csv")
# Importing Excel files using readxl
install.packages("readxl")
library(readxl)
data <- read_excel("path/to/your/file.xlsx", sheet = "Sheet1")
# Importing TXT using base R
data <- read.table("path/to/your/file.txt", header = TRUE, sep = "\t")
# Importing TXT using readr
library(readr)
data <- read_tsv("path/to/your/file.txt")
EXPORT
# Exporting CSV using base R
write.csv(data, "path/to/save/your/file.csv", row.names = FALSE)
# Exporting CSV using readr
library(readr)
write_csv(data, "path/to/save/your/file.csv")
# Exporting Excel files using writexl
install.packages("writexl")
library(writexl)
write_xlsx(data, "path/to/save/your/file.xlsx")
# Exporting TXT using base R
write.table(data, "path/to/save/your/file.txt", sep = "\t", row.names = FALSE)
# Exporting TXT using readr
library(readr)
write_tsv(data, "path/to/save/your/file.txt")
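For R objects that are not plain tables (fitted models, lists), base R's RDS format round-trips them exactly; a brief addition in the same spirit as the examples above:
# Exporting/importing arbitrary R objects using base R
saveRDS(data, "path/to/save/your/file.rds")
data <- readRDS("path/to/save/your/file.rds")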

15. MULTIPLE LINEAR REGRESSION
install.packages("MASS")
install.packages("ggplot2")
install.packages("dplyr")
install.packages("caret")
library(MASS)
library(ggplot2)
library(dplyr)
library(caret)
data <- Boston
head(data)
# Check for missing values
colSums(is.na(data))
# Visualize relationships
pairs(data)
# Select features and target ("medv" is the median value of owner-occupied homes)
target <- "medv" # Median value of owner-occupied homes in $1000s
predictors <- data %>% select(-medv) # kept for reference; the formula below uses all columns
# Split the data
set.seed(42)
train_index <- createDataPartition(data[[target]], p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# Build the linear regression model
model <- lm(as.formula(paste(target, "~ .")), data = train_data)
summary(model)
# Make predictions
predictions <- predict(model, newdata = test_data)
# Evaluate the model
mse <- mean((test_data[[target]] - predictions)^2) # test-set mean squared error
r_squared <- summary(model)$r.squared # note: R^2 is computed on the training data
cat("Mean Squared Error:", mse, "\n")
cat("R^2 Score:", r_squared, "\n")
# Visualize the results
ggplot(data = test_data, aes(x = test_data[[target]], y = predictions)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(x = "Actual Prices", y = "Predicted Prices", title = "Actual vs Predicted Prices")
# Interpret the coefficients
coef(summary(model))
# Save the model
saveRDS(model, "boston_housing_model.rds")
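The R^2 printed above is a training-set figure; an out-of-sample counterpart can be sketched from the test predictions (an added suggestion, not part of the original script):
# Squared correlation between actual and predicted test values, plus RMSE
test_r2 <- cor(test_data[[target]], predictions)^2
rmse <- sqrt(mse)
cat("Test R^2:", test_r2, " RMSE:", rmse, "\n")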
