# R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)
#### COPY ENTIRE DOCUMENT CONTENT IN R STUDIO #####
#===================================================
#----------------basics-------------------------
#===================================================
#### NUMERIC VARIABLES ####
# Assign value to a variable (three equivalent assignment forms)
x = 2
x <- 2
3 -> y
# Assign value to multiple variables (chained assignment: y becomes 4 too)
z <- y <- 4
# Remove variable z
rm(z)
# Check type of variable x
class(x)
# Change type of variable x from numeric to integer
x <- as.integer(x)
# Check x type again
class(x)
# Check if x is a numeric variable now or not
is.numeric(x) # TRUE because integer is a subset of numeric
# Check if x is an integer variable now or not
is.integer(x)
#===================================================
#### Character Variables ####
# Assign "A Grade" text to a variable a
a <- "A Grade"
# Check length (number of characters) of a
nchar(a)
# Check length of the number 3000 (it is coerced to the string "3000")
nchar(3000)
#===================================================
#### Date Variable ####
# Assign a date to variable b, written as YYYY-MM-DD.
# NOTE(review): the name of the conversion function was lost when these notes
# were exported; as.Date() is assumed from the section title. as.Date() keeps
# only the calendar date and ignores the "10:30" part -- switch to
# as.POSIXct() if the time of day must be preserved.
b <- as.Date("2019-11-15 10:30")
# Check class of variable b
class(b)
#===================================================
#### Logical Variable ####
# Store one logical constant in each of c and d
c <- TRUE
d <- FALSE
# Inspect the class of both variables
class(c)
class(d)
# Is x strictly greater than y?
x > y
# Is y less than or equal to x?
y <= x
# Are x and y equal?
x == y
# Are x and y different?
x != y
#### VECTORS ####
# Make a vector "grades" holding grades for 10 students
grades <- c("a", "b", "a", "c", "d", "a", "b", "a", "c", "d")
# Make a vector "marks" holding marks of 10 students
marks <- c(90, 85, 98, 72, 54, 93, 86, 90, 70, 45)
# Make a vector "rno" holding roll numbers of 10 students
rno <- 1:10
# Add bonus marks = 2 to every student (vectorized) and update marks
marks <- marks + 2
marks
# Convert marks to a 10-point scale and assign to slab variable
slab <- marks / 10
slab
# What's the grade of the 3rd student?
grades[3]
# What are the slabs for the 3rd and 5th students?
slab[c(3, 5)]
# What are the marks for the last three roll numbers?
marks[8:10]
#===================================================
#### Factors ####
# View grades as a factor (distinct values become levels)
as.factor(grades)
# Check the numeric codes behind the factor levels
gf <- as.factor(grades)
as.numeric(gf)
#===================================================
#### NA ####
# Add roll no 11 with marks NA and grades NA
rno <- c(rno, 11)
marks <- c(marks, NA)
grades <- c(grades, NA)
# Update slab (NA propagates through the division)
slab <- marks / 10
slab
# Mean class marks?
mean(marks) ## Returns NA because the missing value is included
mean(marks, na.rm = TRUE) ## na.rm = TRUE strips NA before calculation
#===================================================
#### Pipes ####
# Attach package magrittr, which provides the %>% pipe
library(magrittr)
# Find mean marks
marks %>% mean ## Returns NA as NA is included
# Find mean after stripping NA
marks %>% is.na %>% mean # wrong: this is the proportion of NA values
marks %>% mean(na.rm = TRUE)
#===================================================
#### data.frame ####
# Make a data frame with variables rno, marks, slab and grades
myclass <- data.frame(
  Roll_Number = rno,
  Marks = marks,
  CGPA = slab,
  Grades = grades
)
myclass
# Dimensions of myclass
dim(myclass) ## Rows and columns
# Number of rows
nrow(myclass)
# Number of columns
ncol(myclass)
# List of row names
rownames(myclass)
# List of column names
colnames(myclass)
# Head of data frame
head(myclass)
# Tail of data frame
tail(myclass)
# Grades column, selected by position and by name
myclass[, 4]
myclass[, "Grades"]
# Grade of 5th roll number
myclass[5, 4]
# Performance (entire row) of 5th roll number
myclass[5, ]
# Roll-number-wise grades (columns 1 and 4)
myclass[, c(1, 4)]
#==========-{ GGPLOT }=======
library(ggplot2)
data(diamonds)
mydata <- diamonds
head(mydata)
# Base histogram
hist(mydata$carat, main = "Carat Histogram", xlab = "Carat")
# Base scatterplot
plot(price ~ carat, data = mydata)
# Base boxplot
boxplot(mydata$carat)
##### Using ggplot2#####################################
# Histogram of carat
ggplot(data = mydata) +
  geom_histogram(aes(x = carat))
# Density for continuous measurement
ggplot(data = mydata) +
  geom_density(aes(x = carat), fill = "skyblue")
# Scatterplot
ggplot(mydata, aes(x = carat, y = price)) +
  geom_point()
# Save the base plot to variable g so that layers can be added to it
g <- ggplot(mydata, aes(x = carat, y = price))
# Add points colored by the diamond's color grade
g + geom_point(aes(color = color))
# Make faceted plots
g + geom_point(aes(color = color)) + facet_wrap(~color)
g + geom_point(aes(color = color)) + facet_grid(cut ~ clarity)
# Facet with histogram
ggplot(mydata, aes(x = carat)) +
  geom_histogram() +
  facet_wrap(~color)
# Boxplots: a single box (constant x), then one box per cut
ggplot(mydata, aes(y = carat, x = 1)) +
  geom_boxplot()
ggplot(mydata, aes(y = carat, x = cut)) +
  geom_boxplot()
# Line plot
ggplot(economics, aes(x = date, y = pop)) +
  geom_line(color = "red")
#### AREA PLOT ######
library(ggplot2)
# Create data: a random walk built from 50 normal draws
xValue <- 1:50
yValue <- cumsum(rnorm(50))
data <- data.frame(xValue, yValue)
# Area plot
ggplot(data) +
  geom_area(aes(x = xValue, y = yValue))
#### BAR PLOT######
# Data: one length value per dose
df <- data.frame(dose = c("D0.5", "D1", "D2"), len = c(4.2, 10, 29.5))
# Bar plot with the value printed inside each bar
ggplot(data = df, aes(x = dose, y = len)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = len), vjust = 1.6, color = "white", size = 3.5) +
  theme_minimal()
# Data: one length value per dose and supplement
df2 <- data.frame(
  supp = rep(c("VC", "OJ"), each = 3),
  dose = rep(c("D0.5", "D1", "D2"), 2),
  len = c(6.8, 15, 33, 4.2, 10, 29.5)
)
# Stacked barplot with multiple groups
ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_bar(stat = "identity")
# Use position = position_dodge() to place the groups side by side
ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_bar(stat = "identity", position = position_dodge())
###########dot plot#####################
dd <- ToothGrowth
head(dd)
class(dd$dose)
class(dd)
# Treat dose as a categorical variable so each dose gets its own x position
dd$dose <- as.factor(dd$dose)
ggplot(data = dd) +
  geom_dotplot(
    aes(x = dose, y = len, color = supp),
    binaxis = "y",
    stackdir = "center"
  )
### PIE CHART ####
# Data
df <- data.frame(dose = c("D0.5", "D1", "D2"), len = c(4.2, 10, 29.5))
# A single stacked bar ...
bp <- ggplot(data = df, aes(x = "", y = len, fill = dose)) +
  geom_bar(stat = "identity")
# ... drawn in polar coordinates on the y axis becomes a pie chart
pie <- bp + coord_polar("y", start = 0)
pie
##############correlogram###########
##### install.packages("GGally")
# Install corrplot only when it is missing, instead of on every run
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
# Correlation matrix of columns 2, 3 and 5 of mtcars, rounded to 2 decimals
h <- cor(mtcars[, c(2, 3, 5)])
corr <- head(round(h, 2))
# Same matrix drawn with four different glyph styles
corrplot(corr, method = "circle")
corrplot(corr, method = "pie")
corrplot(corr, method = "color")
corrplot(corr, method = "number")
#####################################################################
#=========-{ DPLYR }========
library(dplyr)
## Use of pipes
diamonds %>% head(4) %>% dim
class(diamonds)
head(diamonds)
diamonds
## Print carat and price columns with and without pipes
select(diamonds, carat, price)
diamonds %>% select(carat, price)
diamonds %>% select(1, 7)
## Print all except carat and price columns
diamonds %>% select(-c(carat, price))
## Filter where cut is Ideal
diamonds %>% filter(cut == "Ideal")
diamonds[diamonds$cut == "Ideal", ] ### Base R equivalent
## Filter where cut is Ideal or Good
## (%in% tests set membership; `==` against a 2-element vector would recycle
## it along the rows and silently return the wrong subset)
diamonds %>% filter(cut %in% c("Ideal", "Good"))
## Filter where price >= 1000
diamonds %>% filter(price >= 1000)
## Multiple "and" conditions, carat > 2 and price < 14000, using , or &
diamonds %>% filter(carat > 2, price < 14000)
diamonds %>% filter(carat > 2 & price < 14000)
## "or" condition: carat < 1 or carat > 5
diamonds %>% filter(carat < 1 | carat > 5)
## Comparison with a literal value
diamonds %>% filter(cut == "Ideal")
## Comparison using a variable
ia <- "Ideal"
diamonds %>% filter(cut == ia)
## Display selected rows 1 to 5, 8 and 15 to 20
diamonds %>% slice(c(1:5, 8, 15:20))
## Display all rows except the 1st row
diamonds %>% slice(-1)
## Add one more column ratio = price/carat
diamonds %>% mutate(ratio = price / carat)
## A newly created variable can be used in the same mutate() call
diamonds %>%
  select(carat, price) %>%
  mutate(ratio = price / carat, double = ratio * 2)
### The compound assignment pipe %<>% comes from the magrittr package
library(magrittr)
# Modify a copy: piping `diamonds` itself through %<>% would overwrite it
# with only carat/price/ratio/double, and the grouped summaries below need
# the cut and color columns.
diamonds2 <- diamonds
diamonds2 %<>%
  select(carat, price) %>%
  mutate(ratio = price / carat, double = ratio * 2)
diamonds2
###
## Summary
summarize(diamonds, mean(price))
diamonds %>% summarize(mean(price))
## Group by one key, then by two keys
diamonds %>%
  group_by(cut) %>%
  summarize(AvgPr = mean(price), SumCarat = sum(carat))
diamonds %>%
  group_by(cut, color) %>%
  summarize(AvgPr = mean(price), SumCarat = sum(carat))
##################time series & forecasting #######
# Create a vector
rainfall <- c(12, 43, 32, 23, 23, 31, 56, 20, 30, 12, 45, 78)
# Convert to a time series object starting June 2017;
# frequency = number of data points per year
rats <- ts(rainfall, start = c(2017, 6), frequency = 12)
rats
# Create another vector
rainfall1 <- c(12, 21, 33, 24, 23, 15, 22, 12, 12, 12, 13, 70)
# Merge the 2 vectors into a 12 x 2 matrix (one column per series)
comr <- matrix(c(rainfall, rainfall1), nrow = 12)
# Convert the matrix to a multivariate ts object
rats1 <- ts(comr, start = c(2017, 6), frequency = 12)
library(forecast)
# Save the data to be analysed in a variable
dd <- rats ## Also try AirPassengers
start(dd)
end(dd)
frequency(dd)
class(dd)
# Fit an ARIMA model, letting forecast::auto.arima() choose the order
d1 <- auto.arima(dd)
# Predict for the next 3 periods
d2 <- predict(d1, n.ahead = 3)
# Plot actual (solid blue) and predicted (dotted red) values together
ts.plot(dd, d2$pred, col = c("blue", "red"), lty = c(1, 3))
########## Machine Learning
# NOTE(review): several identifiers in this section (the corpus and LDA
# variable names, and some dotted function/argument names) were lost when the
# notes were exported. They have been reconstructed from how each value is
# used -- confirm against the original notes.

# Install the text-mining packages only when they are missing
text_pkgs <- c(
  "tm", "SnowballC", "topicmodels", "wordcloud", "sentimentr", "syuzhet"
)
missing_pkgs <- text_pkgs[
  !vapply(text_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(tm)
library(SnowballC)
library(topicmodels)
library(wordcloud)
# NOTE(review): if dplyr is already attached, attaching plyr afterwards masks
# dplyr's summarize()/mutate(); call dplyr::summarize() explicitly if needed.
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(reshape2)
library(sentimentr)
library(scales)
library(RCurl)
library(syuzhet)
getwd()
# `pattern` is a regular expression (not a glob), so match a literal ".txt"
# suffix
filenames <- list.files(getwd(), pattern = "\\.txt$")
files <- lapply(filenames, readLines)
# Create corpus from the list of files (one document per file)
articles_corpus <- Corpus(VectorSource(files))
class(articles_corpus)
## Text preprocessing
# Make each letter lowercase; content_transformer() wraps a plain character
# function so it can be applied through tm_map()
articles_corpus <- tm_map(articles_corpus, content_transformer(tolower))
# Remove punctuation
articles_corpus <- tm_map(articles_corpus, removePunctuation)
# Remove numbers
articles_corpus <- tm_map(articles_corpus, removeNumbers)
# Remove generic and custom stopwords
stopwords()
articles_corpus <- tm_map(articles_corpus, removeWords, stopwords())
articles_corpus <- tm_map(
  articles_corpus, removeWords,
  c("and", "the", "have", "was", "with")
)
# Visualization - wordcloud
library(wordcloud)
wordcloud(articles_corpus, random.order = FALSE)
# Create TDM
# A term document matrix is a way of representing the words in the text as a
# table (or matrix) of numbers. The rows of the matrix represent the words
# (terms) from the text, and the columns represent the documents to be
# analysed; this is why rowSums() below gives the total count of each term.
tdm <- TermDocumentMatrix(articles_corpus)
class(tdm)
tdm <- as.matrix(tdm)
tdm
termfreq <- rowSums(as.matrix(tdm))
termfreq
# Subsetting TDM: keep terms that occur at least 4 times
termfreqsubset <- subset(termfreq, termfreq >= 4)
class(termfreqsubset)
# Creating a data frame of terms and their frequencies
library(ggplot2)
tdmdf <- data.frame(term = names(termfreqsubset), freq = termfreqsubset)
# View() opens a data viewer, so only call it in an interactive session
if (interactive()) {
  View(tdmdf)
}
tdmplot <- ggplot(tdmdf, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 6))
tdmplot
# Wordcloud
wc <- as.matrix(tdm) # making a matrix
wordfreq <- sort(rowSums(wc), decreasing = TRUE)
# Colors: drop the 4 lightest shades of the 9-step "BuGn" palette
# (brewer.pal() is from RColorBrewer -- presumably attached together with
# wordcloud; add library(RColorBrewer) if it is not found)
pal <- brewer.pal(9, "BuGn")[-(1:4)]
colors()
nwc <- wordcloud(
  words = names(wordfreq), freq = wordfreq, min.freq = 3,
  random.order = FALSE, colors = pal
)
################################################
# Sentiment Analysis
library(sentimentr)
class(articles_corpus)
a <- as.character(articles_corpus)
class(a)
mysentiment <- get_nrc_sentiment(a)
# Total score per sentiment column
SentimentScores <- data.frame(colSums(mysentiment[, ]))
SentimentScores
# Giving a name to the scores column
names(SentimentScores) <- "Score"
SentimentScores
# Copying the row names into a "sentiment" column
SentimentScores <- cbind(
  "sentiment" = rownames(SentimentScores),
  SentimentScores
)
SentimentScores
# Removing row names
rownames(SentimentScores) <- NULL
SentimentScores
# Plotting the sentiment scores
ggplot(SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") +
  ylab("Score") +
  ggtitle("Total Sentiment Score")
# Topic Modeling
# Latent Dirichlet allocation (LDA) models are a widely used topic modeling
# technique.
# Create DTM, keeping only words of at least 3 characters
# (wordLengths replaces the obsolete minWordLength control option)
articleDtm <- DocumentTermMatrix(
  articles_corpus,
  control = list(wordLengths = c(3, Inf))
)
k <- 3
SEED <- 1234
article_lda <- LDA(
  articleDtm, k,
  method = "Gibbs",
  control = list(seed = SEED)
)
# Most likely topic for each document
lda_topics <- as.matrix(topics(article_lda))
lda_topics
# Most likely term for each topic
lda_terms <- terms(article_lda)
lda_terms