0% found this document useful (0 votes)
166 views10 pages

R Data Analysis for Beginners

The document discusses various data transformation and modeling techniques in R including: 1) Loading and preparing the iris dataset for analysis by reading in the CSV file, specifying column types, and filtering rows. 2) Performing exploratory data analysis using dplyr verbs like select, filter, mutate, and transform to explore and manipulate the data. 3) Building regression, correlation, and decision tree models using tools like lm(), cor(), rpart(), and ctree() and plotting the results. 4) Discussing additional R packages like ggpubr and rpart.plot that can help visualize and interpret the models.

Uploaded by

aqib ahmed
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
166 views10 pages

R Data Analysis for Beginners

The document discusses various data transformation and modeling techniques in R including: 1) Loading and preparing the iris dataset for analysis by reading in the CSV file, specifying column types, and filtering rows. 2) Performing exploratory data analysis using dplyr verbs like select, filter, mutate, and transform to explore and manipulate the data. 3) Building regression, correlation, and decision tree models using tools like lm(), cor(), rpart(), and ctree() and plotting the results. 4) Discussing additional R packages like ggpubr and rpart.plot that can help visualize and interpret the models.

Uploaded by

aqib ahmed
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd

https://r4ds.had.co.nz/transform.html ---link for R commands

install.packages("dplyr");
install.packages("plyr")
install.packages("readr")
install.packages("FSelector");
library(dplyr);
library(readr)
library(FSelector)
library(plyr)
library(dplyr);
library(FSelector);

> setwd("C:\\Users\\aqiba\\OneDrive\\Desktop\\Data Analytics\\IRIS dataset");


> mydata <- read_csv("iris_new.csv");

-- Column specification --------------------------------------------------------


cols(
sepal.length = col_double(),
sepal.width = col_double(),
petal.length = col_double(),
petal.width = col_double(),
variety = col_character()
)

> mydata
> filter(mydata, variety =="Setosa" & sepal.length > 4);
> summary(mydata)
sepal.length sepal.width petal.length petal.width
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
Median :5.800 Median :3.000 Median :4.350 Median :1.300
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
variety
Length:150
Class :character
Mode :character

> transform(mydata, variety =="Setose");


> show(mydata)
> View(mydata)
> select(mydata, variety, sepal.length);

> mutate(mydata, total = sepal.width + petal.width);


> select(mydata, total);
> View(mydata)
> transform(mydata, sepal.width = -sepal.width);
> View(mydata)
> v_change_sepal <- transform(mydata, sepal.width = -sepal.width);
> v_change_sepal
rename(iris_date_new, c("variety" = "species")); ------to rename any column of the data set

sum(is.na(iris_null_data)); ---for finding null values in dataset


[1] 5
OR

any(is.na(data)) -----for checking null values if any in entire dataframe

> for (i in which(sapply(iris_null_data, is.numeric))) {


+ iris_null_data[is.na(iris_null_data[, i]), i] <- mean(iris_null_data[, i],
na.rm = TRUE) + } -----for replacing null values
> newdata <- na.omit(iris_null_data)

------------------------links ------------------------------
https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781785286544/1/ch01lvl1sec11/data-preprocessing-techniques ---- preprocessing steps
http://dataanalyticsedge.com/2018/05/02/data-cleaning-using-r/ ------for functions that check for missing values
-----------------------------------regression----------------------------------------------------------------
https://www.datacamp.com/community/tutorials/linear-regression-R

?USJudgeRatings
head(USJudgeRatings)
USJUDGE_DATE <- USJudgeRatings
X <- as.matrix([-10]);
X <- as.matrix(USJUDGE_DATE[-10]);
X
X <- as.matrix(USJUDGE_DATE[-7]);
X
X <- as.matrix(USJUDGE_DATE[1]);
X
X <- as.matrix(USJUDGE_DATE[-8]);
X
V_REG <- lm(RTEN ~ CONT + INTG, data = USJUDGE_DATE);
V_REG
plot(V_REG);
plot(V_REG);
abline(V_REG);
V_REG <- lm(RTEN ~ CONT data = USJUDGE_DATE);
V_REG <- lm(RTEN ~ CONT, data = USJUDGE_DATE);
abline(V_REG);
v_plot <- abline(V_REG);
v_plot
abline(V_REG);
plot(V_REG, pch = 16, col = "blue");

---------------------------------CORRELATION--------------------------------------------------------------

head(iris_new)

> x <- iris_new[1:1];


> y <- iris_new[2:2];
> y
# A tibble: 150 x 1
sepal.width
<dbl>
1 3.5
2 3
3 3.2
4 3.1
5 3.6
6 3.9
7 3.4
8 3.4
9 2.9
10 3.1
# ... with 140 more rows
> x
# A tibble: 150 x 1
sepal.length
<dbl>
1 5.1
2 4.9
3 4.7
4 4.6
5 5
6 5.4
7 4.6
8 5
9 4.4
10 4.9
# ... with 140 more rows
> v_get_correlation <- cor(x,y);
> v_get_correlation
sepal.width
sepal.length -0.1175698
> v_get_correlation <- cor(x,y, method = "spearman");
> v_get_correlation
sepal.width
sepal.length -0.1667777
> install.packages("ggpubr");
Error in install.packages : Updating loaded packages
> install.packages("ggpubr")
WARNING: Rtools is required to build R packages but is not currently installed.
Please download and install the appropriate version of Rtools before proceeding:

https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/aqiba/OneDrive/Documents/R/win-library/4.0’
(as ‘lib’ is unspecified)

>
>
>

> library(ggpubr);

Attaching package: ‘ggpubr’

The following object is masked from ‘package:plyr’:

mutate

> plot(v_get_correlation);
> ggqqplot(iris_new$sepal.length,iris_new$petal.width);
Error in data[, x] : incorrect number of dimensions
> ggqqplot(iris_new$sepal.length, ylab = "sepal_length");
> ggqqplot(iris_new$petal.width, ylab = "petal_width");
------------------------------------------------------------------

install.packages("rpart.plot");
WARNING: Rtools is required to build R packages but is not currently installed.
Please download and install the appropriate version of Rtools before proceeding:

https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/aqiba/OneDrive/Documents/R/win-library/4.0’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/rpart.plot_3.0.9.zip'
Content type 'application/zip' length 1034182 bytes (1009 KB)
downloaded 1009 KB

package ‘rpart.plot’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in


C:\Users\aqiba\AppData\Local\Temp\RtmpcVOHBJ\downloaded_packages
> install.packages("data.tree");
> install.packages("party");
> library(dplyr);
> library(readr);
> library(plyr);
-------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
-------------------------------------------------------------------------------
> setwd("C:\\Users\\aqiba\\OneDrive\\Desktop\\Data Analytics\\IRIS dataset");
> iris_data <- read.csv("iris_new.csv");
> iris_data;
> library(rpart);
> library(rpart.plot);
> create_tree <- rpart(variety ~ sepal.length, data = iris_data);
> create_tree
n= 150

node), split, n, loss, yval, (yprob)


* denotes terminal node

1) root 150 100 Setosa (0.33333333 0.33333333 0.33333333)


2) sepal.length< 5.45 52 7 Setosa (0.86538462 0.11538462 0.01923077) *
3) sepal.length>=5.45 98 49 Virginica (0.05102041 0.44897959 0.50000000)
6) sepal.length< 6.15 43 15 Versicolor (0.11627907 0.65116279 0.23255814) *
7) sepal.length>=6.15 55 16 Virginica (0.00000000 0.29090909 0.70909091) *
> rpart.plot(create_tree, extra = 7);
Warning message:
extra=7 but the response has 3 levels (only the 2nd level is displayed)
> rpart.plot(create_tree, extra = 3);
> View(iris_data)
> create_tree <- rpart(variety ~ sepal.length + sepal.width, data = iris_data);
> create_tree
n= 150

node), split, n, loss, yval, (yprob)


* denotes terminal node

1) root 150 100 Setosa (0.33333333 0.33333333 0.33333333)


2) sepal.length< 5.45 52 7 Setosa (0.86538462 0.11538462 0.01923077)
4) sepal.width>=2.8 45 1 Setosa (0.97777778 0.02222222 0.00000000) *
5) sepal.width< 2.8 7 2 Versicolor (0.14285714 0.71428571 0.14285714) *
3) sepal.length>=5.45 98 49 Virginica (0.05102041 0.44897959 0.50000000)
6) sepal.length< 6.15 43 15 Versicolor (0.11627907 0.65116279 0.23255814)
12) sepal.width>=3.1 7 2 Setosa (0.71428571 0.28571429 0.00000000) *
13) sepal.width< 3.1 36 10 Versicolor (0.00000000 0.72222222 0.27777778) *
7) sepal.length>=6.15 55 16 Virginica (0.00000000 0.29090909 0.70909091) *
> rpart.plot(create_tree, extra = 7);
Warning message:
extra=7 but the response has 3 levels (only the 2nd level is displayed)

> rpart.plot(create_tree, extra = 3);

-----------------------another method for building a tree (ctree)---------------------------------------------

create_tree <- ctree(sepal.width ~ sepal.length, data = iris_data);


> create_tree

Model formula:
sepal.width ~ sepal.length

Fitted party:
[1] root: 3.057 (n = 150, err = 28.3)

Number of inner nodes: 0


Number of terminal nodes: 1
> plot(create_tree);
> create_tree <- ctree(petal.length ~ sepal.length, data = iris_data);
> create_tree

Model formula:
petal.length ~ sepal.length

Fitted party:
[1] root
| [2] sepal.length <= 5.5
| | [3] sepal.length <= 5.4: 1.769 (n = 52, err = 34.1)
| | [4] sepal.length > 5.4: 3.229 (n = 7, err = 10.2)
| [5] sepal.length > 5.5
| | [6] sepal.length <= 6.2
| | | [7] sepal.length <= 5.8: 3.924 (n = 21, err = 25.8)
| | | [8] sepal.length > 5.8: 4.711 (n = 19, err = 3.3)
| | [9] sepal.length > 6.2
| | | [10] sepal.length <= 7: 5.169 (n = 39, err = 8.9)
| | | [11] sepal.length > 7: 6.300 (n = 12, err = 1.4)

Number of inner nodes: 5


Number of terminal nodes: 6
> plot(create_tree);
> create_tree <- rpart(petal.length ~ sepal.length, data = iris_data);
> plot(create_tree);
> rpart.plot(create_tree, extra = 3);

The 'extra' argument:


0 No extra information
1 Number of observations in the node
2 Class models: Classification rate (ncorrect/nobservations)
Poisson and exp models: number of events
3 Class models: Misclassification rate
4 Class models: Probability per class
5 Class models: Like 4 but don't display the fitted class
6 Class models: Probability of second class only
7 Class models: Like 6 but don't display the fitted class
8 Class models: Probability of the fitted class
9 Class models: Probability relative to all observations
10 Class models: like 9 but display the probability of the second class only

Add 100 to also display the percentage of observations in the node

Error: extra=3 is legal only for "class" models (you have an "anova" model)
> rpart.plot(create_tree);
> rpart.plot(create_tree, extra = 2);

The 'extra' argument:


0 No extra information
1 Number of observations in the node
2 Class models: Classification rate (ncorrect/nobservations)
Poisson and exp models: number of events
3 Class models: Misclassification rate
4 Class models: Probability per class
5 Class models: Like 4 but don't display the fitted class
6 Class models: Probability of second class only
7 Class models: Like 6 but don't display the fitted class
8 Class models: Probability of the fitted class
9 Class models: Probability relative to all observations
10 Class models: like 9 but display the probability of the second class only

Add 100 to also display the percentage of observations in the node

Error: extra=2 is legal only for "class", "poisson" and "exp" models (you have an
"anova" model)
> create_tree
n= 150

node), split, n, deviance, yval


* denotes terminal node

1) root 150 464.325400 3.758000


2) sepal.length< 5.55 59 57.404070 1.942373
4) sepal.length< 5.45 52 34.090770 1.769231 *
5) sepal.length>=5.45 7 10.174290 3.228571 *
3) sepal.length>=5.55 91 86.327470 4.935165
6) sepal.length< 6.25 40 35.249750 4.297500
12) sepal.length< 5.85 21 25.778100 3.923810 *
13) sepal.length>=5.85 19 3.297895 4.710526 *
7) sepal.length>=6.25 51 22.056470 5.435294
14) sepal.length< 7.05 39 8.923077 5.169231 *
15) sepal.length>=7.05 12 1.400000 6.300000 *
> rpart.plot(create_tree, extra = 1);
----------------------------------------------------------------------------neural network-----------------

v_neural <- read.csv("neural_net.csv");


> v_neural
TCK CSS placed
1 20 90 1
2 10 20 0
3 30 40 0
4 20 50 0
5 80 50 1
6 30 80 1

> nn=neuralnet(placed~TCK+CSS,data=v_neural, hidden=3,act.fct = "logistic",


+ linear.output = FALSE);
> plot(nn)

--------------------------------------------------------------------clustering---------------------------------------------

>iris_data
> Iris_cluster = iris_data;
> Iris_cluster
> Iris_cluster$variety = NULL;
> Iris_cluster
> create_cluster <- kmeans(Iris_cluster,3);
> create_cluster
K-means clustering with 3 clusters of sizes 38, 50, 62

Cluster means:
sepal.length sepal.width petal.length petal.width
1 6.850000 3.073684 5.742105 2.071053
2 5.006000 3.428000 1.462000 0.246000
3 5.901613 2.748387 4.393548 1.433871

Clustering vector:
[1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[39] 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[77] 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1 1 3 1 1 1 1 1 1 3
[115] 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1 1 1 3 1 1 1 3 1 1 3

Within cluster sum of squares by cluster:


[1] 23.87947 15.15100 39.82097
(between_SS / total_SS = 88.4 %)

Available components:

[1] "cluster" "centers" "totss" "withinss" "tot.withinss"


[6] "betweenss" "size" "iter" "ifault"
> create_cluster <- kmeans(Iris_cluster,2);
> create_cluster
K-means clustering with 2 clusters of sizes 97, 53

Cluster means:
sepal.length sepal.width petal.length petal.width
1 6.301031 2.886598 4.958763 1.695876
2 5.005660 3.369811 1.560377 0.290566

Clustering vector:
[1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[39] 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[77] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[115] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Within cluster sum of squares by cluster:


[1] 123.79588 28.55208
(between_SS / total_SS = 77.6 %)
Available components:

[1] "cluster" "centers" "totss" "withinss" "tot.withinss"


[6] "betweenss" "size" "iter" "ifault"
> create_cluster <- kmeans(Iris_cluster,3);
> create_cluster
K-means clustering with 3 clusters of sizes 38, 62, 50

Cluster means:
sepal.length sepal.width petal.length petal.width
1 6.850000 3.073684 5.742105 2.071053
2 5.901613 2.748387 4.393548 1.433871
3 5.006000 3.428000 1.462000 0.246000

Clustering vector:
[1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[39] 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[77] 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 1 1 1 1 1 2
[115] 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 2

Within cluster sum of squares by cluster:


[1] 23.87947 39.82097 15.15100
(between_SS / total_SS = 88.4 %)

Available components:

[1] "cluster" "centers" "totss" "withinss" "tot.withinss"


[6] "betweenss" "size" "iter" "ifault"
> create_cluster$size
[1] 38 62 50
>
>
>
>
> table(iris_data$variety, create_cluster);
Error in table(iris_data$variety, create_cluster) :
all arguments must have the same length
> table(iris_data$variety, create_cluster$cluster);

1 2 3
Setosa 0 0 50
Versicolor 2 48 0
Virginica 36 14 0
> plot(iris_data[c("petal.length", "petal.width")], col = create_cluster$cluster);
> plot(iris_data[c("sepal.length", "sepal.width")], col = create_cluster$cluster);
>

---------------------------to check whether there are any null (NA) values in the entire data set----------------------------------
any(is.na(v_iris_data));

---------------------------to remove all rows that have a null or NA value in any column----------------------------
v_get_new_iris_data <- na.omit(v_iris_data);
> v_get_new_iris_data;
-----------------------------------principal component analysis---------------------------------
----https://www.datacamp.com/community/tutorials/pca-analysis-r
------https://aaronschlegel.me/principal-component-analysis-r-example.html

v_get_pca <- prcomp(v_get_new_iris_data[,1:3]);


> v_get_pca
>install.packages("ggfortify");
pca.plot <- autoplot(v_get_pca, data = v_get_new_iris_data, colour = 'Group');
Error in autoplot(v_get_pca, data = v_get_new_iris_data, colour = "Group") :
could not find function "autoplot"
> library(ggfortify);
> pca.plot <- autoplot(v_get_pca, data = v_get_new_iris_data, colour = 'Group');
> pca.plot
Error: Unknown colour name: Group
> pca.plot <- autoplot(v_get_pca, data = v_get_new_iris_data, colour = 'Red');
> pca.plot

-----------------------------------------create histogram------------------------------
> hist(v_get_new_iris_data$petal.length);

-----------------------------str command, similar to summary (shows the structure of the data)---------------------------
str(v_get_new_iris_data);
----------------------------------correlation-------------------------------------------

x <- v_get_new_iris_data[1:1];
> y <- v_get_new_iris_data[3:3];
> v_find_correlation <- cor(x,y);
> v_find_correlation;
petal.length
sepal.length 0.8679478
> v_find_correlation <- cor(x,y, method = "spearman");
> v_find_correlation;
petal.length
sepal.length 0.8800297
> v_find_correlation <- cor(x,y, method = "kendal");
> v_find_correlation <- cor(x,y, method = "kendal");
> v_find_correlation;
petal.length
sepal.length 0.7157654

> library(ggplot2)
> plot(v_find_correlation);

?mtcar
No documentation for ‘mtcar’ in specified packages and libraries:
you could try ‘??mtcar’
> mtcars
> v_find_correlation <- cor(mtcars);
> v_find_correlation
> library(corrplot)
> plot(mtcars, method="circle");
> plot(mtcars, method="pie");

> install.packages("PerformanceAnalytics");
> library("PerformanceAnalytics");
> my_data <- mtcars[, c(1,3,4,5,6,7)]
> chart.Correlation(my_data, histogram=TRUE, pch=19);

------------------------------------regression line---------------------------------
abline(lm(mpg~wt), col="red") # regression line (y~x)
lines(lowess(wt,mpg), col="blue") # lowess line (x,y)

> install.packages("ggcorrplot");
> v_linear_regression <- lm(iris$Petal.Length ~ iris$Sepal.Length +
iris$Sepal.Width, data = iris);
> v_linear_regression;

Call:
lm(formula = iris$Petal.Length ~ iris$Sepal.Length + iris$Sepal.Width,
data = iris)

Coefficients:
(Intercept) iris$Sepal.Length iris$Sepal.Width
-2.525 1.776 -1.339

> library(ggplot2);
> plot(v_linear_regression);

You might also like