The common data structures in R are
vectors, lists, arrays, matrices and data frames.
###Vector Operations
# Integer sequence 1 through 5
vec <- 1:5
# Random permutation of 1 through 5
vec <- sample(5)
# Five draws from a normal distribution with mean 2.5 and sd 2
vec <- rnorm(5, mean = 2.5, sd = 2)
# The first five lowercase letters
vec <- letters[1:5]
# Named vector with one value per name
vec_name <- c(id = 1, age = 21)
# A name given to a multi-element argument is expanded with an index suffix
vec_name <- c(id = 1:5, age = 21:25)
id1 id2 id3 id4 id5 age1 age2 age3 age4 age5
1 2 3 4 5 21 22 23 24 25
# Membership test: is the value 2 present in vec?
2 %in% vec
# Filtering on several conditions at once: keep the elements of l that
# are greater than 40 or that also occur in d.
l <- sample(1:50, 10)
d <- sample(1:50, 10)
l[(l > 40) | (l %in% d)]
# Position of the first element of vec equal to 3 (NA when there is none)
match(3, vec)
Selecting elements
# Keep only elements 1 and 3
vec[c(1, 3)]
# Negative indices drop elements: everything except elements 1 and 3
vec[-c(1, 3)]
# Logical index: keep the positions flagged TRUE by a random two-value mask
vec[sample(rep(c(TRUE, FALSE), times = 2), 2)]
# Keep the elements that are greater than 3
vec[vec > 3]
# Overwrite elements 1 and 4
vec[c(1, 4)] <- c(10, 14)
# Same idea with negative indices: overwrite every element except 2, 3 and 5
vec[-c(2, 3, 5)] <- c(10, 14)
List
A list is a collection of objects, which may be of different types.
# A two-element list holding an integer vector and a character vector
list(1:5, letters[1:5])
Array and Matrices
# 2 x 2 integer matrix; matrix() fills it column by column
mat <- matrix(seq_len(4), nrow = 2, ncol = 2)
# 2 x 3 integer array built from the values 1 through 6
arr <- array(seq_len(6), dim = c(2, 3))
Data frames
# Before R 4.0.0, data.frame() converted character columns to factors by
# default; since R 4.0.0 the default is stringsAsFactors = FALSE. Passing the
# argument explicitly keeps `category` a character column on every version.
# The result is assigned to `df` so that it can be inspected with str(df).
df <- data.frame(
  id = 1:10,
  category = letters[1:10],
  stringsAsFactors = FALSE
)
> str( df)
'data.frame' : 10 obs. of 2 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10
$ category: chr "a" "b" "c" "d" ...
Since a data frame is both a list and a rectangular structure, you can access its values in two ways:
You can use the list operators: df[i], df[[i]] , df$col
You can use the matrix notation: df[i, j] df[i , ] df[ , j]
data.frame vs as.data.frame
If your data is captured in several vectors and/or factors, use the data.frame function to assemble them into a data frame:
> df <- data.frame(v1, v2, v3, f1, f2)
If your data is captured in a list that contains vectors and/or factors, use instead
as.data.frame:
> dfrm <- as.data.frame(list.of.vectors)
data.frame calls as.data.frame internally
Operations on Dataframe
# Adding a column: bind a one-column data frame (cat_id) to df
cbind(df, data.frame(cat_id = paste0(6, "_", letters[6])))
# Adding a row: the new row must use the same column names as df
rbind(df, data.frame(id = 6, category = letters[6]))
Check the structure of a data frame
str(mtcars)
# Append a `temp` column holding a shuffled mix of "a", "b", "c" and NA
# (8 copies of each, 32 values in total).
mtcars_df <- cbind(
  mtcars,
  data.frame(temp = sample(rep(c("a", "b", NA, "c"), 8), 32))
)
Subsetting dataframe
# First five rows
mtcars[1:5, ]
# Keep only the rows whose temp value is not NA
mtcars_df[!is.na(mtcars_df$temp), ]
# Three ways to select the rows whose carb value is 1 or 2:
# logical indexing with %in%, subset(), and match() combined with is.na().
mtcars[mtcars$carb %in% c(1, 2), ]
subset(mtcars, carb %in% c(1, 2))
mtcars[!is.na(match(mtcars$carb, c(1, 2))), ]
#There are various ways of doing subsetting operations. But the important thing is to know the tricks of using the operators so that when the need arises you can intelligently combine them.
# Group a data frame by the levels of a factor.
# ToothGrowth (note the capital G) is the built-in dataset.
df <- ToothGrowth
# Split on the factor column itself, which has one value per row. Passing
# levels(df$supp) instead would recycle the two level names across the rows
# and group them by position rather than by their supp value.
split(df, df$supp) # a list of data frames, one for each level of supp
split(mtcars, mtcars$gear)
# Apply a function to each group of rows.
# Here: the average mpg for each value of gear.
by(mtcars, mtcars$gear, FUN = function(group) mean(group$mpg))
mtcars$gear : 3
[ 1] 16.10667
mtcars$gear : 4
[ 1] 24.53333
mtcars$gear : 5
[ 1] 21.38
# Cross-tabulate two columns of a data frame.
# Work on a copy so that mtcars itself is left unchanged.
f <- mtcars
f$cyl <- as.factor(f$cyl)
f$gear <- as.factor(f$gear)
table(f$cyl, f$gear)
3 4 5
4 1 8 2
6 2 4 1
8 12 0 2
Binning of data
You have a vector, and you want to split the data into groups according to intervals.
# Draw 20 values from 1:5 with unequal sampling weights
vec <- sample(1:5, size = 20, replace = TRUE, prob = c(10, 15, 45, 25, 5))
# Boundaries of the intervals; four intervals need five break points
breaks <- c(0, 2, 3, 4, 6)
# Assign every value to its interval and label the intervals
groups <- cut(vec, breaks = breaks, labels = c("0-2", "2-3", "3-4", "4-6"))
> table( groups)
groups
0-2 2-3 3-4 4-6
4 5 10 1
Operations on model object
Given a model, we can update it with new features
# Add the extra term z to the existing model m.
# In the new formula, `.` means "what was previously in this part of the
# formula".
update(m, ~ . + z)
# Refit m1 with an unchanged formula, taking the data from the data frame df
update(m1, . ~ ., data = df)
# Remove the term named by fieldRemove from the model
update(m1, as.formula(paste(".~.-", fieldRemove)))