0% found this document useful (0 votes)
22 views20 pages

Data Science

The document contains a comprehensive guide on using R programming for data manipulation and analysis, covering topics such as variable management, data structures (vectors, lists, matrices, data frames), and basic arithmetic operations. It also includes instructions for installing and updating packages, applying functions, and subsetting data. Additionally, the document demonstrates the use of user-defined functions and data input/output methods.

Uploaded by

algobeetrading
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
22 views20 pages

Data Science

The document contains a comprehensive guide on using R programming for data manipulation and analysis, covering topics such as variable management, data structures (vectors, lists, matrices, data frames), and basic arithmetic operations. It also includes instructions for installing and updating packages, applying functions, and subsetting data. Additionally, the document demonstrates the use of user-defined functions and data input/output methods.

Uploaded by

algobeetrading
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd

# Workspace housekeeping ----

# List the objects currently defined in the workspace.
ls()

# Remove one object; guarded because rm() warns when `x` does not exist.
if (exists("x")) rm(x)

# Remove every object in the workspace.
rm(list = ls())

help(rm)

# c() coerces mixed input to a single type: here everything becomes character.
x <- c(1:4, 10, "Ram")
x

y <- 3:10

rm(x)

# Setting the working directory ----

getwd()

# Machine-specific path: only switch when the directory actually exists.
work_dir <- "E:\\DataScience\\Week1"
if (dir.exists(work_dir)) setwd(work_dir)

history()

# NOTE(review): the function name on the next line was destroyed by the text
# extraction (it survives only as "[Link]"); restore it from the original.
# [Link]()

### ctrl+L clears the console

version

# Installing and updating R via installr ----

install.packages("installr")

# require() returns FALSE (with a warning) when the package is missing;
# library() raises an error instead.
require(installr)

library(installr)

# Install on demand, then load.
if (!require(installr)) {
  install.packages("installr")
  require(installr)
}

updateR()

### Installing and using packages

install.packages("reshape2")

library(reshape2)

# NOTE(review): the function name on the next line was destroyed by the text
# extraction, and the package name is spelled "installR" while the lines above
# use "installr". Left commented out until the original is confirmed.
# [Link]("installR")

# Basic arithmetic operators ----

# Addition (the result is kept in `x` for the math functions below)
x <- 3 + 7

# Subtraction
7 - 3

# Multiplication
3 * 7

# Division
7 / 3

# Exponentiation
2^3

# Modulo: remainder of 8 divided by 3
8 %% 3

# Common math functions applied to x ----
log2(x)   # base-2 logarithm
log10(x)  # base-10 logarithm
exp(x)    # exponential
cos(x)    # cosine
sin(x)    # sine
tan(x)    # tangent
abs(x)    # absolute value
sqrt(x)   # square root

# Logical vectors and type checks ----

logi <- c(TRUE, FALSE, TRUE, TRUE)

class(logi)

age <- c(34, 35, 36, 34, 34, 35, 36, 35)

class(age)

# NOTE(review): the five function names below were destroyed by the text
# extraction (each survived only as "[Link]"). The is.*/as.* calls are a
# best-guess reconstruction from the surrounding class() checks -- confirm
# against the original document.
is.numeric(age)
is.integer(age)
is.character(age)
is.logical(logi)
age <- as.integer(age)

# `=` inside a call only names the argument; `<-` inside a call also assigns
# `x` in the calling environment as a side effect.
median(x = 1:10)

median(x <- 1:10)

# x <- y <- 5

# x <- y = 5

# Vectors ----

car_name <- c("Honda", "BMW", "Ferrari")
car_color <- c("Black", "Blue", "Red")
car_cc <- c(2000, 3400, 4000)

# Lists ----

# Components of a list may differ in length (`cc` holds five values here).
cars <- list(
  name = c("Honda", "BMW", "Ferrari"),
  color = c("Black", "Blue", "Red"),
  cc = c(2000, 3400, 4000, 6000, 5000)
)

# An unnamed list built from the three vectors above.
list(car_name, car_color, car_cc)

cars

class(cars)
# Matrix ----

# Values fill column by column because byrow = FALSE. FALSE is spelled out:
# `F` is an ordinary variable that can be reassigned.
mdat <- matrix(
  c(1, 2, 3, 11, 12, 13),
  nrow = 2,
  ncol = 3,
  byrow = FALSE,
  dimnames = list(c("row1", "row2"), c("C.1", "C.2", "C.3"))
)
mdat

# Transpose
t(mdat)

# Deconstruction: flatten the matrix back into a vector
c(mdat)

rowSums(mdat)

colSums(mdat)

# Data frame ----

# Unlike a list, every column of a data frame must have the same length.
cars <- data.frame(
  name = c("Honda", "BMW", "Ferrari"),
  color = c("Black", "Blue", "Red"),
  cc = c(2000, 3400, 4000)
)
cars

# Factors ----

apple_colors <- c("green", "green", "yellow", "red", "red", "red", "green")
class(apple_colors)

# factor() keeps the distinct values as levels.
factor_apple <- factor(apple_colors)
nlevels(factor_apple)
levels(factor_apple)

# Strings ----

x <- "Hello World!"
print(x)
class(x)

# length() counts vector elements; nchar() counts characters.
length(x)
nchar(x)

# Characters 2 to 4
substr(x, 2, 4)

# substring() is vectorised over the end position (4, 5 and 6 here)
substring(x, 2, 4:6)

# Indexing ----

# A sample vector
v <- c(1, 4, 4, 3, 2, 2, 3)

# Append two more values to a copy of v
y <- c(v, c(1, 23))

# Select by position
v[c(2, 3, 4)]
v[c(1, 3, 6)]

# Negative positions drop elements
v[-1]
v[c(-1, -3)]

# A range of positions
v[2:4]

# Number of elements, and the last element
length(v)
v[length(v)]

# Create a sample data frame ----

# read.table() parses the inline text; header = TRUE takes the column names
# from the first non-blank line.
data <- read.table(header = TRUE, text = "
subject gender size
      1      M    7
      2      F    6
      3      F    9
      4      M   11
")
data

# Names and dimensions
names(data)
colnames(data)
dimnames(data)
rownames(data)
dim(data)

# Get the element at row 1, column 3 (by position, then by column name)
data[1, 3]
data[1, "size"]

# Get rows 1 and 2, and all columns
data[1:2, ]

# Get all rows, and columns 1 and 2
data[, 1:2]

dimnames(data)
dimnames(data)[[1]]

# Get rows 1 and 3, and all columns
data[c(1, 3), ]

# Get rows 1 and 2, and columns 1 and 2
data[1:2, c(1, 2)]

# Get rows 1 and 3, and only column 2 (the result drops to a plain vector)
data[c(1, 3), 2]

# Get rows 1 and 2, and only the columns named "gender" and "size"
data[1:2, c("gender", "size")]
data[c(1, 2), c(2, 3)]

# Extract a single column as a vector
data[["size"]]

# Indexing with a boolean vector ----
# Uses `v` and `data` created earlier in this script.

v > 2
v[v > 2]

# TRUE/FALSE are spelled out: `T` and `F` are ordinary, reassignable variables.
v[c(FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE)]

# A boolean vector selecting data frame rows
data$subject < 3
data[data$subject < 3, ]
data[c(TRUE, TRUE, FALSE, FALSE), ]

# which() converts the boolean vector into row positions
which(data$subject < 3)

data

# Negative indexing ----

# Drop the first element
v[-1]

# Drop first three
v[-1:-3]

# Drop just the last element
v[-length(v)]

# Getting a subset of a data structure ----
# Uses `v` and `data` created earlier in this script.

subset(v, v < 3)
v[v < 3]

# Another vector
# (`t` shadows the name of base::t(); calls such as t(m) still find the function)
t <- c("small", "small", "large", "medium")

# Remove "small" entries
subset(t, t != "small")
t[t != "small"]

# One important difference between the two methods is that you can assign
# values to elements with square bracket indexing, but you cannot with subset().
v[v > 3] <- 9

# The statement below is kept as a comment because it stops the script:
# R has no `subset<-` replacement function.
# subset(v, v < 3) <- 9

subset(data, subject < 3)
data[data$subject < 3, ]

# Subset of particular rows and columns
subset(data, subject < 3, select = -subject)
subset(data, subject < 3, select = c(gender, size))
subset(data, subject < 3, select = gender:size)
data[data$subject < 3, c("gender", "size")]

# Logical AND of two conditions
subset(data, subject < 3 & gender == "M")
data[data$subject < 3 & data$gender == "M", ]

# Logical OR of two conditions
subset(data, subject < 3 | gender == "M")
data[data$subject < 3 | data$gender == "M", ]

# Condition based on transformed data
subset(data, log2(size) > 3)
data[log2(data$size) > 3, ]

# Subset if elements are in another vector
# subset(data, subject %in% c(1,3))
# data[data$subject %in% c(1,3), ]

# Vectors filled with values ----

rep(1, 50)

# FALSE is spelled out: `F` is an ordinary, reassignable variable.
rep(FALSE, 20)

# Repeat the whole sequence four times: 1 2 3 4 5 1 2 3 4 5 ...
rep(1:5, 4)

# Repeat each element four times: 1 1 1 1 2 2 2 2 ...
rep(1:5, each = 4)

# Use it on a factor
rep(factor(LETTERS[1:3]), 5)

seq(0, 10, by = 2)

# `length.out` is written in full instead of relying on partial matching of
# `length`.
seq(0, 10, length.out = 20)

round(seq(0, 10, length.out = 20), 2)

# Information about variables ----

n <- 1:4
let <- LETTERS[1:4]
df <- data.frame(n, let)

# List currently defined variables
ls()

# Check if a variable named "x" exists
exists("x")

# Delete variable x (guarded: rm() warns when the object does not exist)
if (exists("x")) rm(x)

# Information about size/structure ----

# Get information about structure
str(n)
str(df)

# Get the length of a vector
length(n)

# Length probably doesn't give us what we want here:
# for a data frame it is the number of columns.
length(df)

# Number of rows
nrow(df)

# Number of columns
ncol(df)

# Get rows and columns
dim(df)

# Working with NULL, NA, and NaN ----

x <- 5
x > 2

# Comparisons against NA or NaN yield NA rather than TRUE/FALSE.
y <- NA
y > 5

z <- NaN
z > 5

# Dedicated predicates for each kind of missing value
is.null(x)
is.na(y)
is.nan(z)

# NA propagates through sum() unless na.rm = TRUE is given.
vy <- c(1, 2, 3, NA, 5)
sum(vy)
sum(vy, na.rm = TRUE)

# na.rm = TRUE drops NaN as well.
vz <- c(1, 2, 3, NaN, 5)
sum(vz, na.rm = TRUE)

# NULL is dropped by c(), so vx has only four elements.
vx <- c(1, 2, 3, NULL, 5)
sum(vx)

# Keep only the non-missing values
vy[!is.na(vy)]
vz[!is.nan(vz)]

# R datasets ----

# With no arguments, data() lists the available data sets.
data()

# Load mtcars, then inspect its structure and documentation.
data("mtcars")
str(mtcars)
help("mtcars")

# First and last rows
head(mtcars)
tail(mtcars)

# Shape, column names and per-column summaries
dim(mtcars)
colnames(mtcars)
summary(mtcars)

# Apply functions ----

# The same 20 values laid out column-wise, then row-wise; the second
# assignment replaces the first.
data <- matrix(c(1:10, 21:30), nrow = 5, ncol = 4)
data <- matrix(c(1:10, 21:30), nrow = 5, ncol = 4, byrow = TRUE)
data

# apply: margin 1 works over rows, margin 2 over columns
apply(data, 1, sum)
apply(data, 2, mean)

# lapply: apply a function to each list element; the result is a list
data <- list(x = 1:5, y = 6:10, z = 11:15)
lapply(data, mean)

# sapply: like lapply, but simplifies the result to a vector or matrix
# when it can
sapply(data, mean)

# The sequences have different lengths, so i39 stays a list.
i39 <- sapply(3:9, seq)

# fivenum() returns five numbers per element, so this simplifies to a matrix.
sapply(i39, fivenum)
# tapply ----

# tapply splits the array based on specified data, usually factor levels,
# and then applies the function to each group.

library(datasets)

data()
data(mtcars)
help(mtcars)
str(mtcars)
head(mtcars)

# Column means (apply() coerces the data frame to a matrix first)
apply(mtcars, 2, mean)

# Mean weight for each cylinder count
tapply(mtcars$wt, mtcars$cyl, mean)

# NOTE(review): the function name here was destroyed by the text extraction
# (it survived only as "[Link]"); as.factor() is inferred from the
# factor-level grouping described above -- confirm against the original.
groups <- as.factor(c(1, 1, 1, 3, 4, 4, 5, 6, 2, 3, 5))

# Calculate the number of times each number repeats
tapply(groups, groups, length)

# The output is similar to the function table
table(groups)

# mapply ----

# mapply is a multivariate version of sapply. It will apply the specified
# function to the first element of each argument first, followed by the
# second element, and so on.

# x <- 1:5
# b <- 6:10

# mapply(sum, x, b)

# # vapply

# vapply(i39, fivenum,c("Min." =0, "1st Qu." =0, "Median" =0, "3rd Qu." =0, "Max." =0))

# Row bind ----

x <- 1:10
y <- 11:20
z <- 21:30

# Each vector becomes one row of a 3 x 10 matrix.
a <- rbind(x, y, z)
a

# Column bind ----

# Each vector becomes one column of a 10 x 3 matrix.
b <- cbind(x, y, z)
b

# Data description ----

data(mtcars)

# Columns are reached through the data frame ...
head(mtcars$mpg)

help(mtcars)

# ... a bare `head(mpg)` at this point fails with "object 'mpg' not found",
# because the columns are not on the search path yet. with() evaluates the
# expression inside the data frame instead.
with(mtcars, head(mpg))

# attach() puts the columns on the search path until detach() is called.
attach(mtcars)

head(disp)

detach(mtcars)

dim(mtcars)
str(mtcars)

class(mtcars)

head(mtcars)

tail(mtcars)

# User-defined functions (UDF) ----

# Sum the elements of a vector.
#
# x: a vector accepted by sum().
# Returns the total of all elements of `x`.
totalsum <- function(x) {
  sum(x)
}

x <- 1:130

totalsum(x)

# Fill positions with a Fibonacci-style recurrence.
#
# Fibonacci_terms: numeric vector whose VALUES are also used as the positions
#   to fill, so every value must be >= 3 (each step reads the two positions
#   before it).
# Returns a copy of the input in which, for each i taken from the input in
#   order, element i is the sum of elements i - 2 and i - 1. The result grows
#   when a position lies beyond the end of the input.
fib <- function(Fibonacci_terms) {
  # Work on a copy: the original wrote to `Fibonacci_ter1` without ever
  # creating it, and read from the unmodified input, so nothing accumulated.
  Fibonacci_ter1 <- Fibonacci_terms
  for (i in Fibonacci_terms) {
    Fibonacci_ter1[i] <- Fibonacci_ter1[i - 2] + Fibonacci_ter1[i - 1]
  }
  Fibonacci_ter1
}

xterms <- seq(3, 15, by = 1)

fib(xterms)
n <- 1:4
let <- LETTERS[1:4]
df <- data.frame(n, let)

# Subtract 5 from every element.
#
# x: numeric vector.
# Returns a vector of the same length as `x` holding x - 5.
myfunction <- function(x) {
  x - 5
}

x <- 1:10

myfunction(x)

x <- 1:10
y <- 11:20
z <- 21:30

# Element-wise sum of three vectors.
#
# x, y, z: numeric vectors (recycled by R's usual rules if lengths differ).
# Returns x + y + z.
myfunction <- function(x, y, z) {
  x + y + z
}

# Combine the inputs with their element-wise total.
mytable <- data.frame(x, y, z, total = myfunction(x, y, z))

mytable

x <- 1:10
y <- 11:20
z <- 21:30

total <- myfunction(x, y, z)

# Flag which values exceed 5.
#
# x: numeric vector.
# Returns a character vector of the same length: "yes" where x > 5, else "no".
check_total <- function(x) {
  # Test the argument itself; the original ignored `x` and read the global
  # variable `total` instead.
  ifelse(x > 5, "yes", "no")
}

# Uses `x`, `y`, `z` and `total` created earlier in this script.
# `status` flags totals above 5.
mydata <- data.frame(x, y, z, total, status = ifelse(total > 5, "yes", "no"))

mydata

# A second flag with a higher threshold
mydata$status2 <- ifelse(total > 40, "yes", "no")

mydata

# Split the rows by status; subset() looks up `status` inside mydata.
mydata1 <- subset(mydata, status == "yes")
mydata2 <- subset(mydata, status == "no")

mydata1

mydata2

xterm <- seq(3, 15, by = 1)
xterm

# Fill positions with a Fibonacci-style recurrence, in place on the argument.
#
# fibonacii_terms: numeric vector whose VALUES are also used as the positions
#   to fill, so every value must be >= 3 (each step reads the two positions
#   before it).
# Returns the vector after setting, for each i taken from the original input
#   in order, element i to the sum of elements i - 2 and i - 1. The result
#   grows when a position lies beyond the end of the input.
fib <- function(fibonacii_terms) {
  # The loop sequence is evaluated once, before the body modifies the vector.
  for (i in fibonacii_terms) {
    fibonacii_terms[i] <- fibonacii_terms[i - 2] + fibonacii_terms[i - 1]
  }
  fibonacii_terms
}

y <- fib(xterm)

mydata

# Write mydata as a tab-separated file with a header row and no row names.
# NOTE(review): the function name, both argument names and the output file
# name were destroyed by the text extraction (each survived only as "[Link]").
# write.table(), row.names and col.names are inferred from the sep = "\t"
# call shape; "d:/mydata.txt" is a placeholder file name -- confirm all four
# against the original.
write.table(
  mydata,
  "d:/mydata.txt",
  row.names = FALSE,
  col.names = TRUE,
  sep = "\t"
)

str(mtcars)

# Arrange (sort) rows ----
# The original had the bare word `arrange` here, which fails with
# "object 'arrange' not found" unless a package defining it is loaded;
# it reads as a section heading.

# order() returns row positions, so index mtcars with them to get the sorted
# rows: descending mpg, ties broken by descending cyl.
# NOTE(review): the original stored the bare index vector in `mydata`; sorted
# rows are assumed to be the intent -- confirm.
mydata <- mtcars[order(mtcars$mpg, mtcars$cyl, decreasing = TRUE), ]

head(mydata)

You might also like