R PROGRAMMING SEP 2022

DATA ANALYSIS WITH R

DAY 1: 10 SEP 2022

# Compiler

# Interpreter

# print() writes a single value to the console.
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print(5 + 3)    # an expression is evaluated first, so this prints 8
print("5 + 3")  # a quoted string is printed exactly as written
hello <- 5
print(hello)

print(4) # comment

# Data types - what is the data
# Basic data type: single value

# Logical: TRUE / FALSE
# The next three lines are equivalent ways of assigning TRUE to var1.
var1 = TRUE  # FALSE
var1 <- TRUE
TRUE -> var1
print(class(var1))

# Integer: positive or negative numbers without decimal part
# (the L suffix marks the literal as an integer)
var1 <- 3L
print(class(var1))

# Numeric: can take decimal values
var1 <- 3.5
print(class(var1))

# Character
var1 <- "HEllo"
print(class(var1))

# Complex: square root of -1
var1 <- 5i   # the imaginary unit (iota) is written with the suffix i
print(var1 * var1)  # 5i * 5i is -25+0i
print(class(var1))

# Raw
print(charToRaw("l"))

#### Data structures

# Vector: same type of values
hello <- 68
# "hello" in quotes is a string, not the variable hello. Because a vector
# holds one type only, the numbers are coerced to character as well.
var1 <- c(34, 45, 67, "hello")
print(var1)
print(class(var1))

# Lists: elements may have different types
var1 <- list(3, 5, "Hello", TRUE, c(2, 4, 8, 2, 4, 6, 8, 2, 4, 6, 8))
print(var1)
# cat() joins several values into one line of output; print() shows a
# single object, which is why the call below is commented out.
cat("Hello", "there")
# print("Hello", "there")

# Matrices
# byrow = TRUE fills the matrix row by row.
mat1 <- matrix(
  c(1, 3, 5, 7, 9, 11, 13, 15, 18),
  nrow = 3, ncol = 3, byrow = TRUE
)
print(mat1)

# byrow = FALSE (the default) fills the matrix column by column.
mat1 <- matrix(
  c(1, 3, 5, 7, 9, 11, 13, 15, 18),
  nrow = 3, ncol = 3, byrow = FALSE
)
print(mat1)

# Arrays: any number of dimensions
# NOTE: 18 values are supplied but a 2 x 2 x 2 x 2 array holds only 16,
# so the last two values (25 and 28) are silently dropped.
var1 <- array(
  c(1, 3, 5, 7, 9, 11, 13, 15, 18, 9, 11, 13, 15, 18, 21, 22, 25, 28),
  dim = c(2, 2, 2, 2)
)
print(var1)

# Factor: categorical data stored as levels
color <- c(
  "Red", "Green", "Blue", "Green", "Blue",
  "Green", "Blue", "Green", "Blue", "Red"
)
color_f <- factor(color)
print(color_f)

# Data Frames: a table whose columns may have different types
employee <- data.frame(
  Name = c("Sachin", "Virat", "Rohit"),
  City = c("Mumbai", "Delhi", "Chennai"),
  Avg = c(113, 24, 85)
)
print(employee)

 

DAY 2: 11 SEP 2022

# Arithmetic operators (applied element by element to vectors)
v1 <- c(1, 3, 5, 7)
v2 <- c(2, 4, 6, 8)
print(v1 + v2)
print(v1 - v2)
print(v1 * v2)
print(v1 / v2)

# %% is for remainder
num <- 15
rem <- num %% 2
print(rem)

# Integer division or quotient: %/%
qt <- 15 %/% 4
print(qt)

# 5 ^ 3 : cube power of 5
print(5^3)

# Relational operators: bigger/smaller relation - output is logical
var1 <- 55
var2 <- 66
print(var1 > var2)  # is var1 greater than var2?
print(var1 < var2)
print(var1 >= var2)
print(var1 <= var2)
print(var1 == var2)
print(var1 != var2)

# Logical operators: input is logical and output is also logical
# Prediction: Sachin and Laxman will open the batting
# Actual: Sachin and Rahul opened the batting

# Prediction: Sachin or Laxman will open the batting
# Actual: Sachin and Rahul opened the batting

# & for and, | for or
a <- 5
b <- 6
c <- 7
print(a > b | b < c)  # for OR - even one TRUE will make it TRUE

# T & T = T  F & F = F   T & F = F    F & T = F  (multiplication)
# T | T = T  F | F = F   T | F = T    F | T = T  (addition)
print(!TRUE)

# Assignment operators:
a = 5
a <- 5
a <<- 5  # left assignment (<<- assigns in an enclosing environment)
# Right assignment:
100 -> b
200 ->> b
c = 6

b -> c  # copies the value of b into c
print(b)
print(c)

 

 

####################################################
## CONDITIONS

# If avg >= 90 I want to print Congratulations
avg <- 90
if (avg >= 90) {
  print("Congratulations")
}

avg <- 40
if (avg >= 50) {
  print("You have passed")
} else {
  print("Sorry, You have failed")
}

# if - else if - else

# avg >= 90: Grade A, avg >= 80: Grade B, avg >= 70: C, avg >= 60: D,
# avg >= 50: E, otherwise: F
# val records the position of the grade (1 = A ... 6 = F) for use below.
avg <- 90

if (avg >= 90) {
  print("Grade A")
  val <- 1
} else if (avg >= 80) {
  print("Grade B")
  val <- 2
} else if (avg >= 70) {
  print("Grade C")
  val <- 3
} else if (avg >= 60) {
  print("Grade D")
  val <- 4
} else if (avg >= 50) {
  print("Grade E")
  val <- 5
} else {
  print("Grade F")
  val <- 6
}

## switch
# switch(expression, case1, case2, ...)
# With a numeric expression, switch() returns the alternative at that
# position. Only four alternatives are listed here, so val = 5 or 6 matches
# nothing and result is NULL.
result <- switch(
  val,
  "Hello",
  "How are you?",
  "Where are you?",
  "Hows going?"
)
print(result)

# Loops - repeat block
## repeat: exit check
## while : entry check
## for : when we know how many times to repeat

 

TABLE OF CONTENTS

Unit 1: Getting Started with R.. 2

Getting Started. 2

R Objects and Data Types. 5

R Operators. 9

Decision Making in R. 12

LOOPS in R. 14

STRINGS in R. 15

Unit 2: FUNCTIONS in R.. 17

Built-in Function. 17

User-defined Function. 17

Unit 3: VECTORS, LISTS, ARRAYS & MATRICES. 19

VECTORS. 19

LISTS. 22

MATRICES. 25

ARRAYS. 27

Factors. 29

Data Frames. 34

Unit 4: Working with Files. 45

Working with Excel Files. 46

Unit 5: Working with MSAccess Database. 48

Unit 6: Working with Graphs. 51

Unit 7: Overview of R Packages. 64

Unit 8: Programming Examples. 68

Unit 1: Getting Started with R

R is a free software environment for statistical computing and graphics. It compiles and runs on a wide variety of UNIX platforms, Windows and MacOS. Why R? It’s free, open source, powerful and highly extensible. “You have a lot of prepackaged stuff that’s already available, so you’re standing on the shoulders of giants,” Google’s chief economist told The New York Times back in 2009. There can be little doubt that interest in the R statistics language, especially for data analysis, is soaring.

 

Downloading R

The primary R system is available from the Comprehensive R Archive Network, also known as CRAN. CRAN also hosts many add-on packages that can be used to extend the functionality of R. The “base” R system that you download from CRAN: Linux, Windows, Mac, Source Code

Website to download:  https://cran.r-project.org/mirrors.html

 

The R Foundation for Statistical Computing

The R Foundation is a not-for-profit organization working in the public interest. It was founded by the members of the R Development Core Team in order to:

·        Provide support for the R project and other innovations in statistical computing. We believe that R has become a mature and valuable tool and we would like to ensure its continued development and the development of future innovations in software for statistical and computational research.

·        Provide a reference point for individuals, institutions or commercial enterprises that want to support or interact with the R development community.

·        Hold and administer the copyright of R software and documentation.

 

R functionality is divided into a number of packages:

·        The “base” R system contains, among other things, the base package which is required to run R and contains the most fundamental functions.

·        The other packages contained in the “base” system include utils, stats, datasets, graphics, grDevices, grid, methods, tools, parallel, compiler, splines, tcltk, stats4.

·        There are also “Recommended” packages: boot, class, cluster, codetools, foreign, KernSmooth, lattice, mgcv, nlme, rpart, survival, MASS, spatial, nnet, Matrix.

When you download a fresh installation of R from CRAN, you get all of the above, which represents a substantial amount of functionality. However, there are many other packages available:

·        There are over 4000 packages on CRAN that have been developed by users and programmers around the world.

·        People often make packages available on their personal websites; there is no reliable way to keep track of how many packages are available in this fashion.

·        There are a number of packages being developed on repositories like GitHub and BitBucket but there is no reliable listing of all these packages.

 

 

More details can be found at the R foundation website: https://www.r-project.org/

 

Let’s create our first R Program

Launch R. In Windows you can launch R software using the option shown below under Program Files.

Figure 1: Launch R Programming Window

 

After launching R interpreter, you will get a prompt > where you can start typing your

Program. Let’s try our first program:

 

In the Hello World code below, vString is a variable which stores the String value “Hello World” and in the next line we print the value of the vString variable. Please note that R commands are case-sensitive. print is the valid command to print the value on the screen.

Figure 2: Hello World

 

# is the syntax used to write comments in the program

Figure 3: R Programming

 

R Basic Syntax

Download and Install R software

When R is run, this will launch R interpreter. You will get a prompt where you can start typing your programs as follows:

Here first statement defines a string variable myString, where we assign a string “Hello, World!” and then next statement print() is being used to print the value stored in variable myString.

 

R Script File

Usually, you will do your programming by writing your programs in script files and then you execute those scripts at your command prompt with the help of R interpreter called Rscript. So let’s start with writing following code in a text file called test.R as under:

Save the above code in a file test.R and execute it at Linux command prompt as given below. Even if you are using Windows or other system, syntax will remain same.

For windows, go to command prompt and browse to the directory where R.exe/Rscript.exe is installed.

Run-> Rscript filename.R     (filename.R is the name of the file which has R program along with the path name.)

 

We will use RStudio for rest of our course example. Download and install R Studio.

 

 

Generally, while doing programming in any programming language, you need to use various variables to store information. Variables are nothing but reserved memory locations to store values. This means that, when you create a variable you reserve some space in memory. In contrast to other programming languages like C and Java, in R the variables are not declared as some data type. The variables are assigned with R-Objects and the data type of the R-object becomes the data type of the variable.

 

R has five basic or “atomic” classes of objects:

·        character

·        numeric (real numbers)

·        integer

·        complex

·        logical (True/False)

 

The frequently used ones are:

Vectors

Lists

Matrices

Arrays

Factors

Data Frames

 

The simplest of these objects is the vector object and there are six data types of these atomic vectors, also termed as six classes of vectors. The other R-Objects are built upon the atomic vectors.

Figure 4: Data Types in R

 

 

Creating Vectors

The c() function can be used to create vectors of objects by concatenating things together.  When you want to create vector with more than one element, you should use c() function which means to combine the elements into a vector. You can also use the vector() function to initialize vectors.

Figure 5: Vector example

 

Lists, Matrices, Arrays

A list is an R-object which can contain many different types of elements inside it like vectors, functions and even another list inside it.

 

A matrix is a two-dimensional rectangular data set. It can be created using a vector input to the matrix function.

 

While matrices are confined to two dimensions, arrays can be of any number of dimensions. The array function takes a dim attribute which creates the required number of dimension. In the below example we create an array with two elements which are 3×3 matrices each.

 

Factors

Factors are used to represent categorical data and can be unordered or ordered. One can think of a factor as an integer vector where each integer has a label. Factors are important in statistical modeling and are treated specially by modelling functions like lm() and glm(). Using factors with labels is better than using integers because factors are self-describing. Having a variable that has values “Male” and “Female” is better than a variable that has values 1 and 2. Factor objects can be created with the factor() function.

Figure 6: List, Matrix and Array example

 

Figure 7: Factors example

 

Data Frames

Data frames are tabular data objects. Unlike a matrix in data frame each column can contain different modes of data. The first column can be numeric while the second column can be character and third column can be logical. It is a list of vectors of equal length. Data Frames are created using the data.frame() function.

Figure 8: Data frames example

 

Mixing Objects

There are occasions when different classes of R objects get mixed together. Sometimes this happens by accident but it can also happen on purpose. In implicit coercion, what R tries to do is find a way to represent all of the objects in the vector in a reasonable fashion. Sometimes this does exactly what you want and sometimes not. For example, combining a numeric object with a character object will create a character vector, because numbers can usually be easily represented as strings.

Figure 9: Mixing and Missing Objects examples

We have the following types of operators in R programming:

·        Arithmetic Operators

·        Relational Operators

·        Logical Operators

·        Assignment Operators

·        Miscellaneous Operators

 

Arithmetic Operators

 

Figure 10: Assignment Operators

 

Relational Operators

Operators

Meaning

> 

Checks if each element of the first vector is greater than the corresponding element of the second vector.

< 

Checks if each element of the first vector is less than the corresponding element of the second vector.

==

Checks if each element of the first vector is equal to the corresponding element of the second vector.

<=

Checks if each element of the first vector is less than or equal to the corresponding element of the second vector.

>=

Checks if each element of the first vector is greater than or equal to the corresponding element of the second vector.

!=

Checks if each element of the first vector is unequal to the corresponding element of the second vector.

 

Logical Operators

Operators

Meaning

&

It is called Element-wise Logical AND operator. It combines each element of the first vector with the corresponding element of the second vector and gives a output TRUE if both the elements are TRUE.

|

It is called Element-wise Logical OR operator. It combines each element of the first vector with the corresponding element of the second vector and gives an output TRUE if at least one of the elements is TRUE.

!

It is called Logical NOT operator. Takes each element of the vector and gives the opposite logical value.

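The logical operators && (logical AND) and || (logical OR) work on single values rather than element by element and give a single TRUE or FALSE as output. Older versions of R silently used only the first element of each vector; R 4.3.0 and later signal an error when either operand has more than one element.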

 

Readers are encouraged to practice all the operators and see the output.

 

 

 

Assignment Operators

A variable in R can store an atomic vector, group of atomic vectors or a combination of many R objects. The variables can be assigned values using leftward, rightward and equal to operator. The values of the variables can be printed using print() or cat() function. The cat() function combines multiple items into a continuous print output.

In R, a variable itself is not declared of any data type, rather it gets the data type of the R-object assigned to it. So R is called a dynamically typed language, which means that we can change the data type of the same variable again and again when using it in a program.

Figure 11: Variable assignment

 

Figure 12: Listing and deleting variables

 

Miscellaneous Operators

Operators

Meaning

:

Colon operator. It creates the series of numbers in sequence for a vector.

%in%

This operator is used to identify if an element belongs to a vector.

%*%

This operator performs matrix multiplication (for example, multiplying a matrix by its transpose).

 

 

R provides the following types of decision making statements:

Statement

Description

If statement

An if statement consists of a Boolean expression followed by one or more statements.

If else statement

An if statement can be followed by an optional else statement, which executes when the Boolean expression is false.

Switch statement

A switch statement allows a variable to be tested for equality against a list of values.

 

Figure 13: Example of If Statement

 

Figure 14: Example of If Else Statement

 

Multiple if else

An if statement can be followed by an optional else if…else statement, which is very

useful to test various conditions using single if…else if statement.

 

Syntax

 

When using if, else if, else statements there are few points to keep in mind.

·        An if can have zero or one else and it must come after any else if’s.

·        An if can have zero to many else if’s and they must come before the else.

·        Once an else if succeeds, none of the remaining else if’s or else’s will be tested.

 

SWITCH statement

A switch statement allows a variable to be tested for equality against a list of values. Each value is called a case, and the variable being switched on is checked for each case.

Syntax

 

The following rules apply to a switch statement:

·        If the value of expression is not a character string it is coerced to integer.

·        You can have any number of case statements within a switch. Each case is followed by the value to be compared to and a colon.

·        If the value of the integer is between 1 and nargs()-1 (the maximum number of arguments), then the corresponding element of case condition is evaluated and the result returned.

·        If expression evaluates to a character string then that string is matched (exactly) to the names of the elements.

·        If there is more than one match, the first matching element is returned.

·        No Default argument is available.

·        In the case of no match, if there is a unnamed element of … its value is returned. (If there is more than one such argument an error is returned.)

 

 

Loops are used to repeat a block of code. Being able to have your program repeatedly execute a block of code is one of the most basic but useful tasks in programming- a loop lets you write a very simple statement to produce a significantly greater result simply by repetition. R programming language provides the following kinds of loop to handle looping requirements:

Loop Type

Description

REPEAT loop

Executes a sequence of statements again and again until a break statement inside the loop body stops it.

WHILE loop

Repeats a statement or group of statements while a given condition is true. It tests the condition before executing the loop body.

FOR loop

It executes a block of statements once for each element of a vector or list, so the number of repetitions is known in advance.

 

Loop Control Statements

Control Type

Description

BREAK statement

Terminates the loop statement and transfers execution to the statement immediately following the loop.

NEXT statement

The next statement skips the remaining statements of the current iteration and moves on to the next iteration of the loop.

 

REPEAT – loop

The Repeat loop executes the same code again and again until a stop condition is met.

    Syntax:                                                                         Example:

 

 

 

 

 

WHILE – loop

The While loop executes the same code again and again as long as its condition remains true; the condition is checked before each iteration.

    Syntax:                                                                         Example:

FOR – loop

A for loop is a repetition control structure that allows you to efficiently write a loop that needs to execute a specific number of times.

    Syntax:                                                                         Example:

Any value written within a pair of single quote or double quotes in R is treated as a string. Internally R stores every string within double quotes, even when you create them with single quote.

 

Rules Applied in String Construction

·     The quotes at the beginning and end of a string should be both double quotes or both single quote. They can not be mixed.

·     Double quotes can be inserted into a string starting and ending with single quote.

·     Single quote can be inserted into a string starting and ending with double quotes.

·     Double quotes can not be inserted into a string starting and ending with double quotes.

·     Single quote can not be inserted into a string starting and ending with single quote.

 

 

 

 

Examples of Strings in R

Formatting numbers & strings – format() function

Numbers and strings can be formatted to a specific style using format()function.

Syntax – The basic syntax for format function is :

 

Following is the description of the parameters used:

·   x is the vector input.

·   digits is the total number of digits displayed.

·   nsmall is the minimum number of digits to the right of the decimal point.

·   scientific is set to TRUE to display scientific notation.

·   width indicates the minimum width to be displayed by padding blanks in the beginning.

·   justify is the display of the string to left, right or center.

 

Other functions

Functions

Functionality

nchar(x)

This function counts the number of characters including spaces in a string.

toupper(x) / tolower(x)

These functions change the case of characters of a string.

substring(x,first,last)

This function extracts parts of a String.

A function is a set of statements organized together to perform a specific task. R has a large number of in-built functions and the user can create their own functions.

The different parts of a function are:

·   Function Name: This is the actual name of the function. It is stored in R environment as an object with this name.

·   Arguments: An argument is a placeholder. When a function is invoked, you pass a value to the argument. Arguments are optional; that is, a function may contain no arguments. Also arguments can have default values.

·   Function Body: The function body contains a collection of statements that defines what the function does.

·   Return Value: The return value of a function is the last expression in the function body to be evaluated.

 

R has many in-built functions which can be directly called in the program without defining them first. Simple examples of in-built functions are seq(), mean(), max(), sum(x)and paste(…) etc.

 

We can also create and use our own functions referred as user defined functions. An R function is created by using the keyword function. The basic syntax of an R function definition is as follows:

 

Example: Calling a function with argument values (by position and by name)

 

Example: Calling a function with default values

 

Lazy Evaluation of Function: Arguments to functions are evaluated lazily, which means they are evaluated only when needed by the function body.

 

 

Vectors are the most basic R data objects and there are six types of atomic vectors. They are logical, integer, double, complex, character and raw. Even when you write just one value in R, it becomes a vector of length 1 and belongs to one of the above vector types.

# One example per atomic vector type; lines starting with #> show the
# console output of the statement above them.

# Atomic vector of type character.
print("ABC")
#> [1] "ABC"

# Atomic vector of type double.
print(12.5)
#> [1] 12.5

# Atomic vector of type integer.
print(10L)
#> [1] 10

# Atomic vector of type logical.
print(TRUE)
#> [1] TRUE

# Atomic vector of type complex.
print(4+8i)
#> [1] 4+8i

# Atomic vector of type raw (each byte is shown in hexadecimal).
print(charToRaw("hello"))
#> [1] 68 65 6c 6c 6f

 

Multiple Elements Vector

Using colon operator with numeric data

# The colon operator steps by 1 from the start value; lines starting with
# #> show the console output.

# Creating a sequence from 2 to 8.
v <- 2:8
print(v)
#> [1] 2 3 4 5 6 7 8

# Creating a sequence from 6.6 to 12.6.
v <- 6.6:12.6
print(v)
#> [1]  6.6  7.6  8.6  9.6 10.6 11.6 12.6

# If the final element specified does not belong to the sequence then it
# is discarded.
v <- 3.8:11.4
print(v)
#> [1]  3.8  4.8  5.8  6.8  7.8  8.8  9.8 10.8

 

Using the seq() function

Syntax and example of using the seq() function:

# Create a vector with elements from 5 to 9 incrementing by 0.4.
# The line starting with #> shows the console output.
print(seq(5, 9, by = 0.4))
#> [1] 5.0 5.4 5.8 6.2 6.6 7.0 7.4 7.8 8.2 8.6 9.0

 

Using the c () function

The non-character values are coerced to character type if one of the elements is a char.

Syntax and example of using c() function:

# The logical and numeric values are converted to characters, because a
# vector can hold only one type.
x <- c("apple", "red", 5, TRUE)
print(x)
#> [1] "apple" "red"   "5"     "TRUE"

Accessing Vector Elements

Elements of a Vector are accessed using indexing. The [ ] brackets are used for indexing. Indexing starts with position 1. Giving a negative value in the index drops that element from result. TRUE, FALSE or 0 and 1 can also be used for indexing.

Syntax and example:

# Lines starting with #> show the console output.

# Accessing vector elements using position.
t <- c("Sun", "Mon", "Tue", "Wed", "Thurs", "Fri", "Sat")
u <- t[c(2, 3, 6)]
print(u)
#> [1] "Mon" "Tue" "Fri"

# Accessing vector elements using logical indexing
# (TRUE keeps the element at that position, FALSE drops it).
v <- t[c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE)]
print(v)
#> [1] "Sun" "Fri"

# Accessing vector elements using negative indexing
# (the elements at positions 2 and 5 are left out).
x <- t[c(-2, -5)]
print(x)
#> [1] "Sun" "Tue" "Wed" "Fri" "Sat"

# Accessing vector elements using 0/1 indexing.
# These are positions, not a TRUE/FALSE mask: every 0 is ignored and the
# single 1 selects the first element, which is why the result is "Sun".
y <- t[c(0, 0, 0, 0, 0, 0, 1)]
print(y)
#> [1] "Sun"

 

Vector Manipulation

Vector Arithmetic- Two vectors of same length can be added, subtracted, multiplied or divided giving the result as a vector output.

Syntax and example:

# Element-wise arithmetic on two vectors of the same length; lines
# starting with #> show the console output.

# Create two vectors.
v1 <- c(3, 8, 4, 5, 0, 11)
v2 <- c(4, 11, 0, 8, 1, 2)

# Vector addition.
add.result <- v1 + v2
print(add.result)
#> [1]  7 19  4 13  1 13

# Vector subtraction.
sub.result <- v1 - v2
print(sub.result)
#> [1] -1 -3  4 -3 -1  9

# Vector multiplication.
multi.result <- v1 * v2
print(multi.result)
#> [1] 12 88  0 40  0 22

# Vector division (4 / 0 gives Inf rather than an error).
divi.result <- v1 / v2
print(divi.result)
#> [1] 0.7500000 0.7272727       Inf 0.6250000 0.0000000 5.5000000

 

Vector Element Recycling

If we apply arithmetic operations to two vectors of unequal length, then the elements of the shorter vector are recycled to complete the operations.

Syntax and example:

# Recycling: the shorter vector is repeated until it matches the longer
# one. Lines starting with #> show the console output.
v1 <- c(3, 8, 4, 5, 0, 11)
v2 <- c(4, 11)
# v2 becomes c(4, 11, 4, 11, 4, 11)

add.result <- v1 + v2
print(add.result)
#> [1]  7 19  8 16  4 22

sub.result <- v1 - v2
print(sub.result)
#> [1] -1 -3  0 -6 -4  0

 

Vector Element Sorting

Elements in a vector can be sorted using the sort() function.

Syntax and example:

# Lines starting with #> show the console output.
v <- c(3, 8, 4, 5, 0, 11, -9, 304)

# Sort the elements of the vector (ascending by default).
sort.result <- sort(v)
print(sort.result)
#> [1]  -9   0   3   4   5   8  11 304

# Sort the elements in the reverse order.
revsort.result <- sort(v, decreasing = TRUE)
print(revsort.result)
#> [1] 304  11   8   5   4   3   0  -9

# Sorting character vectors.
v <- c("Red", "Blue", "yellow", "violet")
sort.result <- sort(v)
print(sort.result)
#> [1] "Blue"   "Red"    "violet" "yellow"

# Sorting character vectors in reverse order.
revsort.result <- sort(v, decreasing = TRUE)
print(revsort.result)
#> [1] "yellow" "violet" "Red"    "Blue"

 

Lists are the R objects which contain elements of different types like – numbers, strings, vectors and another list inside it. A list can also contain a matrix or a function as its elements. List is created using list() function.

 

Syntax and example:

# Create a list containing strings, numbers, a vector and a logical value.
# Lines starting with #> show the console output: each element is printed
# under its position, [[1]] to [[6]].
list_data <- list("Red", "Green", c(21, 32, 11), TRUE, 51.23, 119.1)
print(list_data)
#> [[1]]
#> [1] "Red"
#>
#> [[2]]
#> [1] "Green"
#>
#> [[3]]
#> [1] 21 32 11
#>
#> [[4]]
#> [1] TRUE
#>
#> [[5]]
#> [1] 51.23
#>
#> [[6]]
#> [1] 119.1

 

Naming List Elements

The list elements can be given names and they can be accessed using these names.

 

Manipulating List Elements

We can add, delete and update list elements as shown below. We can add and delete elements only at the end of a list. But we can update any element.

 

Merging Lists

You can merge many lists into one list by placing all the lists inside one list() function.

Converting Lists to Vector

A list can be converted to a vector so that the elements of the vector can be used for further manipulation. All the arithmetic operations on vectors can be applied after the list is converted into vectors. To do this conversion, we use the unlist() function. It takes the list as input and produces a vector.

 

Matrices are the R objects in which the elements are arranged in a two-dimensional

format. They contain elements of the same atomic types. But we use matrices containing numeric elements to be used in mathematical calculations. A Matrix is created using the matrix() function.

 

Syntax

Parameters used:

·        data is the input vector which becomes the data elements of the matrix.

·        nrow is the number of rows to be created.

·        ncol is the number of columns to be created.

·        byrow is a logical clue. If TRUE then the input vector elements are arranged by row.

·        dimnames is the names assigned to the rows and columns.

# Elements are arranged sequentially by row.
M <- matrix(3:14, nrow = 4, byrow = TRUE)
print(M)

# Elements are arranged sequentially by column.
N <- matrix(3:14, nrow = 4, byrow = FALSE)
print(N)

# Define the column and row names and attach them through dimnames
# (a list holding the row names first, then the column names).
rownames = c("row1", "row2", "row3", "row4")
colnames = c("col1", "col2", "col3")
P <- matrix(3:14, nrow = 4, byrow = TRUE, dimnames = list(rownames, colnames))
print(P)

# Accessing Elements of a Matrix
# The index is written [row, column].

# Access the element at 3rd column and 1st row.
print(N[1, 3])

# Access the element at 2nd column and 4th row.
print(N[4, 2])

# Access only the 2nd row (leaving the column index empty selects all columns).
print(N[2, ])

# Access only the 3rd column (leaving the row index empty selects all rows).
print(N[, 3])

 

Matrix Computations

Various mathematical operations are performed on the matrices using the R operators. The result of the operation is also a matrix. The dimensions (number of rows and columns) should be same for the matrices involved in the operation.

# Create two 2x3 matrices (filled column by column).
matrix1 <- matrix(c(3, 9, -1, 4, 2, 6), nrow = 2)
print(matrix1)
matrix2 <- matrix(c(5, 2, 0, 9, 3, 4), nrow = 2)
print(matrix2)

# Add the matrices (element by element).
result <- matrix1 + matrix2
cat("Result of addition", "\n")
print(result)

# Subtract the matrices (element by element).
result <- matrix1 - matrix2
cat("Result of subtraction", "\n")
print(result)

 

Matrix Multiplication & Division

# Create two 2x3 matrices (filled column by column).
matrix1 <- matrix(c(3, 9, -1, 4, 2, 6), nrow = 2)
print(matrix1)
matrix2 <- matrix(c(5, 2, 0, 9, 3, 4), nrow = 2)
print(matrix2)

# Multiply the matrices.
# `*` multiplies element by element; the algebraic matrix product uses %*%.
result <- matrix1 * matrix2
cat("Result of multiplication", "\n")
print(result)

# Divide the matrices (element by element; -1 / 0 gives -Inf).
result <- matrix1 / matrix2
cat("Result of division", "\n")
print(result)

 

Arrays are the R data objects which can store data in more than two dimensions. For example – If we create an array of dimension (2, 3, 4) then it creates 4 rectangular matrices each with 2 rows and 3 columns. Arrays can store only one data type. An array is created using the array() function. It takes vectors as input and uses the values in the dim parameter to create an array.

 

# Two input vectors of unequal length (3 and 6 values).
vector1 <- c(5, 9, 3)
vector2 <- c(10, 11, 12, 13, 14, 15)

# Build a 3 x 3 x 2 array from the combined vectors. Only 9 values are
# supplied for 18 cells, so they are recycled and both 3 x 3 matrices end
# up with the same content.
result <- array(data = c(vector1, vector2), dim = c(3, 3, 2))
print(result)

 

Naming Columns and Rows: We can give names to the rows, columns and matrices in the array by using the dimnames parameter.

# Create two vectors of different lengths.
vector1 <- c(5, 9, 3)
vector2 <- c(10, 11, 12, 13, 14, 15)
column.names <- c("COL1", "COL2", "COL3")
row.names <- c("ROW1", "ROW2", "ROW3")
matrix.names <- c("Matrix1", "Matrix2")

# Take these vectors as input to the array.
# dimnames must follow the order of dim: row names first, then column
# names, then the names of the matrices.
result <- array(
  c(vector1, vector2),
  dim = c(3, 3, 2),
  dimnames = list(row.names, column.names, matrix.names)
)
print(result)

 

Accessing Array Elements

# Create two vectors of different lengths.
vector1 <- c(5, 9, 3)
vector2 <- c(10, 11, 12, 13, 14, 15)
column.names <- c("COL1", "COL2", "COL3")
row.names <- c("ROW1", "ROW2", "ROW3")
matrix.names <- c("Matrix1", "Matrix2")

# Take these vectors as input to the array.
# dimnames must follow the order of dim: row names first, then column
# names, then the names of the matrices.
result <- array(
  c(vector1, vector2),
  dim = c(3, 3, 2),
  dimnames = list(row.names, column.names, matrix.names)
)

# The index is written [row, column, matrix]; an empty position selects
# everything along that dimension.

# Print the third row of the second matrix of the array.
print(result[3, , 2])

# Print the element in the 1st row and 3rd column of the 1st matrix.
print(result[1, 3, 1])

# Print the 2nd Matrix.
print(result[, , 2])

 

Manipulating Array Elements

As an array is made up of matrices in multiple dimensions, the operations on elements of an array are carried out by accessing elements of the matrices.

# Create two vectors of different lengths.
vector1 <- c(5, 9, 3)
vector2 <- c(10, 11, 12, 13, 14, 15)

# Take these vectors as input to the array.
array1 <- array(c(vector1, vector2), dim = c(3, 3, 2))

# Create two more vectors of different lengths for the second array.
vector3 <- c(9, 1, 0)
vector4 <- c(6, 0, 11, 3, 14, 1, 2, 6, 9)
# array2 is built from vector3 and vector4 so that it differs from array1.
# The 12 supplied values are recycled to fill the 18 cells.
array2 <- array(c(vector3, vector4), dim = c(3, 3, 2))

# Create matrices from these arrays (the second 3 x 3 matrix of each).
matrix1 <- array1[, , 2]
matrix2 <- array2[, , 2]

# Add the matrices.
result <- matrix1 + matrix2
print(result)

 

Calculations Across Array Elements: We can do calculations across the elements in an array using the apply() function.

 

Syntax: apply(x, margin, fun)

 

Parameters used:

·        x is an array.

·        margin is the dimension (or vector of dimensions) over which the function is applied, e.g. 1 for rows and 2 for columns.

·        fun is the function to be applied across the elements of the array.

 

 

We use the apply() function below to calculate the sum of the elements in the rows of an array across all the matrices.

# Two input vectors of different lengths.
vector1 <- c(5, 9, 3)
vector2 <- c(10, 11, 12, 13, 14, 15)

# Shape the combined values into a 3 x 3 x 2 array and show it.
new.array <- array(data = c(vector1, vector2), dim = c(3, 3, 2))
print(new.array)

# Sum over margin 1 (rows): each result element is the total of that row
# taken across every column of every matrix.
result <- apply(new.array, MARGIN = 1, FUN = sum)
print(result)

 

Array indexing. Subsections of an array

Individual elements of an array may be referenced by giving the name of the array followed by

the subscripts in square brackets, separated by commas.

More generally, subsections of an array may be specified by giving a sequence of index vectors

in place of subscripts; however if any index position is given an empty index vector, then the full

range of that subscript is taken.

Continuing the previous example, a[2,,] is a 4 × 2 array with dimension vector c(4,2) and

data vector containing the values

c(a[2,1,1], a[2,2,1], a[2,3,1], a[2,4,1],

a[2,1,2], a[2,2,2], a[2,3,2], a[2,4,2])

in that order. a[,,] stands for the entire array, which is the same as omitting the subscripts

entirely and using a alone.

For any array, say Z, the dimension vector may be referenced explicitly as dim(Z) (on either

side of an assignment).

Also, if an array name is given with just one subscript or index vector, then the corresponding

values of the data vector only are used; in this case the dimension vector is ignored. This is not

the case, however, if the single index is not a vector but itself an array, as we next discuss.

 

Factors are the data objects which are used to categorize the data and store it as levels. They can store both strings and integers. They are useful in the columns which have a limited number of unique values, like “Male”, “Female” and True, False etc. They are useful in data analysis for statistical modeling.

A factor is a vector object used to specify a discrete classification (grouping) of the components

of other vectors of the same length. R provides both ordered and unordered factors. While the

“real” application of factors is with model formulae (see Section 11.1.1 [Contrasts], page 53), we

here look at a specific example.

4.1 A specific example

Suppose, for example, we have a sample of 30 tax accountants from all the states and territories

of Australia and their individual state of origin is specified by a character vector of state

mnemonics as

> state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa",

"qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas",

"sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa",

"sa", "act", "nsw", "vic", "vic", "act")

Notice that in the case of a character vector, “sorted” means sorted in alphabetical order.

A factor is similarly created using the factor() function:

> statef <- factor(state)

The print() function handles factors slightly differently from other objects:

> statef

[1] tas sa qld nsw nsw nt wa wa qld vic nsw vic qld qld sa

[16] tas sa nt wa vic qld nsw nsw wa sa act nsw vic vic act

Levels: act nsw nt qld sa tas vic wa

To find out the levels of a factor the function levels() can be used.

> levels(statef)

[1] "act" "nsw" "nt" "qld" "sa" "tas" "vic" "wa"

4.2 The function tapply() and ragged arrays

To continue the previous example, suppose we have the incomes of the same tax accountants in

another vector (in suitably large units of money)

> incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,

61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,

59, 46, 58, 43)

To calculate the sample mean income for each state we can now use the special function

tapply():

> incmeans <- tapply(incomes, statef, mean)

giving a means vector with the components labelled by the levels

act nsw nt qld sa tas vic wa

44.500 57.333 55.500 53.600 55.000 60.500 56.000 52.250

The function tapply() is used to apply a function, here mean(), to each group of components

of the first argument, here incomes, defined by the levels of the second component, here statef, as if they were separate vector structures. The result is a structure of the same length as the

levels attribute of the factor containing the results. The reader should consult the help document

for more details.

Suppose further we needed to calculate the standard errors of the state income means. To do

this we need to write an R function to calculate the standard error for any given vector. Since

there is a built-in function var() to calculate the sample variance, such a function is a very

simple one liner, specified by the assignment:

> stdError <- function(x) sqrt(var(x)/length(x))

(Writing functions will be considered later in Chapter 10 [Writing your own functions], page 42.

Note that R’s built-in function sd() is something different.) After this assignment, the standard

errors are calculated by

> incster <- tapply(incomes, statef, stdError)

and the values calculated are then

> incster

act nsw nt qld sa tas vic wa

1.5 4.3102 4.5 4.1061 2.7386 0.5 5.244 2.6575

As an exercise you may care to find the usual 95% confidence limits for the state mean

incomes. To do this you could use tapply() once more with the length() function to find

the sample sizes, and the qt() function to find the percentage points of the appropriate t-

distributions. (You could also investigate R’s facilities for t-tests.)

The function tapply() can also be used to handle more complicated indexing of a vector

by multiple categories. For example, we might wish to split the tax accountants by both state

and sex. However in this simple instance (just one factor) what happens can be thought of as

follows. The values in the vector are collected into groups corresponding to the distinct entries

in the factor. The function is then applied to each of these groups individually. The value is a

vector of function results, labelled by the levels attribute of the factor.

The combination of a vector and a labelling factor is an example of what is sometimes called

a ragged array, since the subclass sizes are possibly irregular. When the subclass sizes are all

the same the indexing may be done implicitly and much more efficiently, as we see in the next

section.

4.3 Ordered factors

The levels of factors are stored in alphabetical order, or in the order they were specified to

factor if they were specified explicitly.

Sometimes the levels will have a natural ordering that we want to record and want our

statistical analysis to make use of. The ordered() function creates such ordered factors but

is otherwise identical to factor. For most purposes the only difference between ordered and

unordered factors is that the former are printed showing the ordering of the levels, but the

contrasts generated for them in fitting linear models are different.

 

Factors are created using the factor () function by taking a vector as input.

Factors are categorical variables that are super useful in summary statistics, plots, and regressions. They basically act like dummy variables that R codes for you.  So, let’s start off with some data:

and let’s check out what kinds of variables we have:

 

so we see that Race is a factor variable with three levels.  I can see all the levels this way:

So what this means is that R groups statistics by these levels.  Internally, R stores the integer values 1, 2, and 3, and maps the character strings (in alphabetical order, unless I reorder) to these values, i.e. 1=Black, 2=Hispanic, and 3=White.  Now if I were to do a summary of this variable, it shows me the counts for each category, as below.  R won’t let me do a mean or any other statistic of a factor variable other than a count, so keep that in mind. But you can always change your factor to be numeric.

If I do a plot of age on race, I get a boxplot from the normal plot command since that is what makes sense for a categorical variable:

 

# Boxplots of Age for each level of Race, using the formula interface of
# plot(). Straight double quotes are required around the label strings.
# NOTE(review): mydata is not defined in this part of the notes; this assumes
# a data frame with a numeric Age column and a factor Race column - confirm.
plot(
  mydata$Age ~ mydata$Race,
  xlab = "Race",
  ylab = "Age",
  main = "Boxplots of Age by Race"
)

# Create a character vector as input.
data <- c(
  "East", "West", "East", "North", "North", "East",
  "West", "West", "West", "East", "North"
)
print(data)

# A plain character vector is not a factor.
print(is.factor(data))

# Apply the factor function; the distinct values become the levels.
factor_data <- factor(data)
print(factor_data)
print(is.factor(factor_data))

 

Factors in Data Frame

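On creating a data frame with a column of text data, R can treat the text column as categorical data and create a factor from it. This was the default behaviour before R 4.0; from R 4.0 onwards it happens only when stringsAsFactors = TRUE is passed to data.frame().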

# Create the vectors for the data frame.
height <- c(132, 151, 162, 139, 166, 147, 122)
weight <- c(48, 49, 66, 53, 67, 52, 40)
gender <- c("male", "male", "female", "female", "male", "female", "male")

# Create the data frame.
# stringsAsFactors = TRUE is needed since R 4.0, where character columns are
# no longer converted to factors by default.
input_data <- data.frame(height, weight, gender, stringsAsFactors = TRUE)
print(input_data)

# Test if the gender column is a factor.
print(is.factor(input_data$gender))

# Print the gender column to see the levels.
print(input_data$gender)

 

Changing the Order of Levels: The order of the levels in a factor can be changed by applying the factor function again with new order of the levels.

# Input character vector.
data <- c(
  "East", "West", "East", "North", "North", "East",
  "West", "West", "West", "East", "North"
)

# Create the factor; the levels are sorted alphabetically by default.
factor_data <- factor(data)
print(factor_data)

# Apply the factor function again with the required order of the levels.
new_order_data <- factor(factor_data, levels = c("East", "West", "North"))
print(new_order_data)

 

Generating Factor Levels: We can generate factor levels by using the gl() function. It takes two integers as input, which indicate how many levels there are and how many times each level is repeated.

Syntax: gl(n, k, labels)

 

Following is the description of the parameters used:

·        n is an integer giving the number of levels.

·        k is an integer giving the number of replications.

·        labels is a vector of labels for the resulting factor levels.

# Generate a factor with 3 levels, each repeated 4 times in a row, and
# label the levels with city names.
v <- gl(3, 4, labels = c("Tampa", "Seattle", "Boston"))
print(v)

 

 

A data frame is a table or a two-dimensional array-like structure in which each column contains values of one variable and each row contains one set of values from each column. Following are the characteristics of a data frame:

·        The column names should be non-empty.

·        The row names should be unique.

·        The data stored in a data frame can be of numeric, factor or character type.

·        Each column should contain same number of data items.

 

# Create the data frame.
# stringsAsFactors = FALSE keeps emp_name as plain character data (already the
# default from R 4.0 onwards, spelled out here for older versions).
emp.data <- data.frame(
  emp_id = 1:5,
  emp_name = c("Rick", "Dan", "Michelle", "Ryan", "Gary"),
  salary = c(623.3, 515.2, 611.0, 729.0, 843.25),
  # Each date must be one unbroken "YYYY-MM-DD" string for as.Date().
  start_date = as.Date(c(
    "2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11", "2015-03-27"
  )),
  stringsAsFactors = FALSE
)

# Print the data frame.
print(emp.data)

 

Get the Structure of the Data Frame: The structure of the data frame can be seen by using str() function.

# Create the data frame.
# stringsAsFactors = FALSE keeps emp_name as plain character data (already the
# default from R 4.0 onwards, spelled out here for older versions).
emp.data <- data.frame(
  emp_id = 1:5,
  emp_name = c("Rick", "Dan", "Michelle", "Ryan", "Gary"),
  salary = c(623.3, 515.2, 611.0, 729.0, 843.25),
  # Each date must be one unbroken "YYYY-MM-DD" string for as.Date().
  start_date = as.Date(c(
    "2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11", "2015-03-27"
  )),
  stringsAsFactors = FALSE
)

# Get the structure of the data frame: one line per column with its type
# and first values.
str(emp.data)

 

Summary of Data in Data Frame

The statistical summary and nature of the data can be obtained by applying summary() function.

# Create the data frame.
# stringsAsFactors = FALSE keeps emp_name as plain character data (already the
# default from R 4.0 onwards, spelled out here for older versions).
emp.data <- data.frame(
  emp_id = 1:5,
  emp_name = c("Rick", "Dan", "Michelle", "Ryan", "Gary"),
  salary = c(623.3, 515.2, 611.0, 729.0, 843.25),
  # Each date must be one unbroken "YYYY-MM-DD" string for as.Date().
  start_date = as.Date(c(
    "2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11", "2015-03-27"
  )),
  stringsAsFactors = FALSE
)

# Print the per-column statistical summary.
print(summary(emp.data))

 

Extract Data from Data Frame

Extract specific column from a data frame using column name.

# Create the data frame.
# stringsAsFactors = FALSE keeps emp_name as plain character data (already the
# default from R 4.0 onwards, spelled out here for older versions).
emp.data <- data.frame(
  emp_id = 1:5,
  emp_name = c("Rick", "Dan", "Michelle", "Ryan", "Gary"),
  salary = c(623.3, 515.2, 611.0, 729.0, 843.25),
  # Each date must be one unbroken "YYYY-MM-DD" string for as.Date().
  start_date = as.Date(c(
    "2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11", "2015-03-27"
  )),
  stringsAsFactors = FALSE
)

# Extract specific columns by name.
result <- data.frame(emp.data$emp_name, emp.data$salary)
print(result)

# Extract 3rd and 5th row with 2nd and 4th column.
result <- emp.data[c(3, 5), c(2, 4)]
print(result)

# Extract first two rows (an empty column index keeps every column).
result <- emp.data[1:2, ]
print(result)

# Expand Data Frame - A data frame can be expanded by adding columns and rows.
# Add the "dept" column.
emp.data$dept <- c("IT", "Operations", "IT", "HR", "Finance")
v <- emp.data
print(v)

 

 

Add Row

To add more rows permanently to an existing data frame, we need to bring in the new rows in the same structure as the existing data frame and use the rbind() function. In the example below we create a data frame with new rows and merge it with the existing data frame to create the final data frame.

# Create the first data frame.
emp.data <- data.frame(
  emp_id = 1:5,
  emp_name = c("Rick", "Dan", "Michelle", "Ryan", "Gary"),
  salary = c(623.3, 515.2, 611.0, 729.0, 843.25),
  # Each date must be one unbroken "YYYY-MM-DD" string for as.Date().
  start_date = as.Date(c(
    "2012-01-01", "2013-09-23", "2014-11-15", "2014-05-11", "2015-03-27"
  )),
  dept = c("IT", "Operations", "IT", "HR", "Finance"),
  stringsAsFactors = FALSE
)

# Create the second data frame with the same columns, in the same order and
# of the same types, so that the rows can be appended.
emp.newdata <- data.frame(
  emp_id = 6:8,
  emp_name = c("Rasmi", "Pranab", "Tusar"),
  salary = c(578.0, 722.5, 632.8),
  start_date = as.Date(c("2013-05-21", "2013-07-30", "2014-06-17")),
  # "Finance" is spelled as in the first data frame so that both refer to
  # the same department.
  dept = c("IT", "Operations", "Finance"),
  stringsAsFactors = FALSE
)

# Bind the two data frames row-wise.
emp.finaldata <- rbind(emp.data, emp.newdata)
print(emp.finaldata)

 

Unit 4: Simple manipulations; numbers and vectors

Vectors and assignment

R operates on named data structures. The simplest such structure is the numeric vector, which is a single entity consisting of an ordered collection of numbers. To set up a vector named x, say, consisting of five numbers, namely 10.4, 5.6, 3.1, 6.4 and 21.7, use the R command

> x <- c(10.4, 5.6, 3.1, 6.4, 21.7)

 

This is an assignment statement using the function c() which in this context can take an arbitrary number of vector arguments and whose value is a vector got by concatenating its

arguments end to end. A number occurring by itself in an expression is taken as a vector of length one. Notice that the assignment operator (‘<-’) consists of the two characters ‘<’ (“less than”) and ‘-’ (“minus”) occurring strictly side-by-side, and that it ‘points’ to the object receiving the value of the expression. In most contexts the ‘=’ operator can be used as an alternative. Assignment can also be made using the function assign(). An equivalent way of making the same assignment as above is with:

> assign("x", c(10.4, 5.6, 3.1, 6.4, 21.7))

The usual operator, <-, can be thought of as a syntactic short-cut to this.

Assignments can also be made in the other direction, using the obvious change in the assignment operator. So the same assignment could be made using

> c(10.4, 5.6, 3.1, 6.4, 21.7) -> x

If an expression is used as a complete command, the value is printed and lost. So now if we

were to use the command

> 1/x

the reciprocals of the five values would be printed at the terminal (and the value of x, of course, unchanged).

The further assignment

> y <- c(x, 0, x)

would create a vector y with 11 entries consisting of two copies of x with a zero in the middle

place.

 

Vector arithmetic

Vectors can be used in arithmetic expressions, in which case the operations are performed element by element. Vectors occurring in the same expression need not all be of the same length. If they are not, the value of the expression is a vector with the same length as the longest vector which occurs in the expression. Shorter vectors in the expression are recycled as often as need be (perhaps fractionally) until they match the length of the longest vector. In particular a constant is simply repeated. So with the above assignments the command

> v <- 2*x + y + 1

generates a new vector v of length 11 constructed by adding together, element by element, 2*x repeated 2.2 times, y repeated just once, and 1 repeated 11 times.

 

The elementary arithmetic operators are the usual +, -, *, / and ^ for raising to a power. In

addition all of the common arithmetic functions are available. log, exp, sin, cos, tan, sqrt,

and so on, all have their usual meaning. max and min select the largest and smallest elements of a vector respectively. range is a function whose value is a vector of length two, namely c(min(x), max(x)). length(x) is the number of elements in x, sum(x) gives the total of the elements in x, and prod(x) their product.

Two statistical functions are mean(x) which calculates the sample mean, which is the same

as sum(x)/length(x), and var(x) which gives sum((x-mean(x))^2)/(length(x)-1)

 

or sample variance. If the argument to var() is an n-by-p matrix the value is a p-by-p sample

covariance matrix got by regarding the rows as independent p-variate sample vectors.

sort(x) returns a vector of the same size as x with the elements arranged in increasing order;

however there are other more flexible sorting facilities available (see order() or sort.list()

which produce a permutation to do the sorting).

Note that max and min select the largest and smallest values in their arguments, even if they

are given several vectors. The parallel maximum and minimum functions pmax and pmin return a vector (of length equal to their longest argument) that contains in each element the largest (smallest) element in that position in any of the input vectors.

For most purposes the user will not be concerned if the “numbers” in a numeric vector

are integers, reals or even complex. Internally calculations are done as double precision real

numbers, or double precision complex numbers if the input data are complex.

 

To work with complex numbers, supply an explicit complex part. Thus

sqrt(-17)    :    will give NaN and a warning, but

sqrt(-17+0i)     :    will do the computations as complex numbers.

 

Generating regular sequences

R has a number of facilities for generating commonly used sequences of numbers. For example

1:30 is the vector c(1, 2, …, 29, 30). The colon operator has high priority within an expression,

so, for example 2*1:15 is the vector c(2, 4, …, 28, 30). Put n <- 10 and compare

the sequences 1:n-1 and 1:(n-1).

The construction 30:1 may be used to generate a sequence backwards.

The function seq() is a more general facility for generating sequences. It has five arguments,

only some of which may be specified in any one call. The first two arguments, if given, specify

the beginning and end of the sequence, and if these are the only two arguments given the result is the same as the colon operator. That is seq(2,10) is the same vector as 2:10.

Arguments to seq(), and to many other R functions, can also be given in named form, in

which case the order in which they appear is irrelevant. The first two arguments may be named from=value and to=value; thus seq(1,30), seq(from=1, to=30) and seq(to=30, from=1)

are all the same as 1:30. The next two arguments to seq() may be named by=value and

length=value, which specify a step size and a length for the sequence respectively. If neither

of these is given, the default by=1 is assumed.

For example

> seq(-5, 5, by=.2) -> s3

generates in s3 the vector c(-5.0, -4.8, -4.6, …, 4.6, 4.8, 5.0). Similarly

> s4 <- seq(length=51, from=-5, by=.2)

generates the same vector in s4.

The fifth argument may be named along=vector, which is normally used as the only argument

to create the sequence 1, 2, …, length(vector), or the empty sequence if the vector

is empty (as it can be).

A related function is rep() which can be used for replicating an object in various complicated

ways. The simplest form is

> s5 <- rep(x, times=5)

which will put five copies of x end-to-end in s5. Another useful version is

> s6 <- rep(x, each=5)

which repeats each element of x five times before moving on to the next.

 

Logical vectors

As well as numerical vectors, R allows manipulation of logical quantities. The elements of a

logical vector can have the values TRUE, FALSE, and NA (for “not available”). The

first two are often abbreviated as T and F, respectively. Note however that T and F are just

variables which are set to TRUE and FALSE by default, but are not reserved words and hence can be overwritten by the user. Hence, you should always use TRUE and FALSE.

Logical vectors are generated by conditions. For example

> temp <- x > 13

sets temp as a vector of the same length as x with values FALSE corresponding to elements of x where the condition is not met and TRUE where it is.

The logical operators are <, <=, >, >=, == for exact equality and != for inequality. In addition

if c1 and c2 are logical expressions, then c1 & c2 is their intersection (“and”), c1 | c2 is their

union (“or”), and !c1 is the negation of c1.

Logical vectors may be used in ordinary arithmetic, in which case they are coerced into

numeric vectors, FALSE becoming 0 and TRUE becoming 1. However there are situations where logical vectors and their coerced numeric counterparts are not equivalent, for example see the next subsection.

 

Missing values

In some cases the components of a vector may not be completely known. When an element

or value is “not available” or a “missing value” in the statistical sense, a place within a vector

may be reserved for it by assigning it the special value NA. In general, any operation on an NA

becomes an NA. The motivation for this rule is simply that if the specification of an operation

is incomplete, the result cannot be known and hence is not available.

The function is.na(x) gives a logical vector of the same size as x with value TRUE if and

only if the corresponding element in x is NA.

> z <- c(1:3,NA); ind <- is.na(z)

Notice that the logical expression x == NA is quite different from is.na(x) since NA is not

really a value but a marker for a quantity that is not available. Thus x == NA is a vector of the

same length as x all of whose values are NA as the logical expression itself is incomplete and

hence undecidable.

Note that there is a second kind of “missing” values which are produced by numerical computation, the so-called Not a Number, NaN, values. Examples are

> 0/0

or

> Inf - Inf

which both give NaN since the result cannot be defined sensibly.

In summary, is.na(xx) is TRUE both for NA and NaN values. To differentiate these,

is.nan(xx) is only TRUE for NaNs.

Missing values are sometimes printed as <NA> when character vectors are printed without

quotes.

2.6 Character vectors

Character quantities and character vectors are used frequently in R, for example as plot labels.

Where needed they are denoted by a sequence of characters delimited by the double quote

character, e.g., “x-values”, “New iteration results”.

Character strings are entered using either matching double (") or single (') quotes, but are

printed using double quotes (or sometimes without quotes). They use C-style escape sequences,

using \ as the escape character, so \\ is entered and printed as \\, and inside double quotes "

is entered as \". Other useful escape sequences are \n, newline, \t, tab and \b, backspace—see

?Quotes for a full list.

Character vectors may be concatenated into a vector by the c() function; examples of their

use will emerge frequently.

The paste() function takes an arbitrary number of arguments and concatenates them one by

one into character strings. Any numbers given among the arguments are coerced into character

strings in the evident way, that is, in the same way they would be if they were printed. The

arguments are by default separated in the result by a single blank character, but this can be

changed by the named argument, sep=string, which changes it to string, possibly empty.

For example

> labs <- paste(c("X","Y"), 1:10, sep="")

makes labs into the character vector

c("X1", "Y2", "X3", "Y4", "X5", "Y6", "X7", "Y8", "X9", "Y10")

Note particularly that recycling of short lists takes place here too; thus c("X", "Y") is

repeated 5 times to match the sequence 1:10.

2.7 Index vectors; selecting and modifying subsets of a data set

Subsets of the elements of a vector may be selected by appending to the name of the vector an

index vector in square brackets. More generally any expression that evaluates to a vector may

have subsets of its elements similarly selected by appending an index vector in square brackets

immediately after the expression.

Such index vectors can be any of four distinct types.

1. A logical vector. In this case the index vector is recycled to the same length as the vector

from which elements are to be selected. Values corresponding to TRUE in the index vector

are selected and those corresponding to FALSE are omitted. For example

> y <- x[!is.na(x)]

creates (or re-creates) an object y which will contain the non-missing values of x, in the

same order. Note that if x has missing values, y will be shorter than x. Also

> (x+1)[(!is.na(x)) & x>0] -> z

creates an object z and places in it the values of the vector x+1 for which the corresponding

value in x was both non-missing and positive.

 

2. A vector of positive integral quantities. In this case the values in the index vector must lie

in the set {1, 2, …, length(x)}. The corresponding elements of the vector are selected and

concatenated, in that order, in the result. The index vector can be of any length and the

result is of the same length as the index vector. For example x[6] is the sixth component

of x and

> x[1:10]

selects the first 10 elements of x (assuming length(x) is not less than 10). Also

> c("x","y")[rep(c(1,2,2,1), times=4)]

(an admittedly unlikely thing to do) produces a character vector of length 16 consisting of

“x”, “y”, “y”, “x” repeated four times.

3. A vector of negative integral quantities. Such an index vector specifies the values to be

excluded rather than included. Thus

> y <- x[-(1:5)]

gives y all but the first five elements of x.

4. A vector of character strings. This possibility only applies where an object has a names

attribute to identify its components. In this case a sub-vector of the names vector may be

used in the same way as the positive integral labels in item 2 further above.

> fruit <- c(5, 10, 1, 20)

> names(fruit) <- c("orange", "banana", "apple", "peach")

> lunch <- fruit[c("apple","orange")]

The advantage is that alphanumeric names are often easier to remember than numeric

indices. This option is particularly useful in connection with data frames, as we shall see

later.

An indexed expression can also appear on the receiving end of an assignment, in which case

the assignment operation is performed only on those elements of the vector. The expression

must be of the form vector[index_vector] as having an arbitrary expression in place of the

vector name does not make much sense here.

For example

> x[is.na(x)] <- 0

replaces any missing values in x by zeros and

> y[y < 0] <- -y[y < 0]

has the same effect as

> y <- abs(y)

2.8 Other types of objects

Vectors are the most important type of object in R, but there are several others which we will

meet more formally in later sections.

matrices or more generally arrays are multi-dimensional generalizations of vectors. In fact,

they are vectors that can be indexed by two or more indices and will be printed in special

ways. See Chapter 5 [Arrays and matrices], page 18.

factors provide compact ways to handle categorical data. See Chapter 4 [Factors], page 16.

lists are a general form of vector in which the various elements need not be of the same

type, and are often themselves vectors or lists. Lists provide a convenient way to return the

results of a statistical computation. See Section 6.1 [Lists], page 26.

data frames are matrix-like structures, in which the columns can be of different types. Think

of data frames as ‘data matrices’ with one row per observational unit but with (possibly) both numerical and categorical variables. Many experiments are best described by data

frames: the treatments are categorical but the response is numeric. See Section 6.3 [Data

frames], page 27.

functions are themselves objects in R which can be stored in the project’s workspace. This

provides a simple and convenient way to extend R. See Chapter 10 [Writing your own

functions], page 42.

Objects, their modes and attributes

 

Changing the length of an object

An “empty” object may still have a mode. For example

> e <- numeric()

makes e an empty vector structure of mode numeric. Similarly character() is an empty character

vector, and so on. Once an object of any size has been created, new components may be added

to it simply by giving it an index value outside its previous range. Thus

> e[3] <- 17

now makes e a vector of length 3, (the first two components of which are at this point both NA).

This applies to any structure at all, provided the mode of the additional component(s) agrees

with the mode of the object in the first place.

This automatic adjustment of lengths of an object is used often, for example in the scan()

function for input. (see Section 7.2 [The scan() function], page 31.)

Conversely to truncate the size of an object requires only an assignment to do so. Hence if

alpha is an object of length 10, then

> alpha <- alpha[2 * 1:5]

makes it an object of length 5 consisting of just the former components with even index. (The

old indices are not retained, of course.) We can then retain just the first three values by

> length(alpha) <- 3

and vectors can be extended (by missing values) in the same way.

3.3 Getting and setting attributes

The function attributes(object) returns a list of all the non-intrinsic attributes currently

defined for that object. The function attr(object, name) can be used to select a specific

attribute. These functions are rarely used, except in rather special circumstances when some

new attribute is being created for some particular purpose, for example to associate a creation

date or an operator with an R object. The concept, however, is very important.

Some care should be exercised when assigning or deleting attributes since they are an integral

part of the object system used in R.

When it is used on the left hand side of an assignment it can be used either to associate a

new attribute with object or to change an existing one. For example

> attr(z, "dim") <- c(10,10)

allows R to treat z as if it were a 10-by-10 matrix.

3.4 The class of an object

All objects in R have a class, reported by the function class. For simple vectors this is just the

mode, for example “numeric”, “logical”, “character” or “list”, but “matrix”, “array”,

“factor” and “data.frame” are other possible values.

A special attribute known as the class of the object is used to allow for an object-oriented

style of programming in R. For example if an object has class “data.frame”, it will be printed

in a certain way, the plot() function will display it graphically in a certain way, and other

so-called generic functions such as summary() will react to it as an argument in a way sensitive

to its class.

To remove temporarily the effects of class, use the function unclass(). For example if winter

has the class “data.frame” then

> winter

 

will print it in data frame form, which is rather like a matrix, whereas

> unclass(winter)

will print it as an ordinary list. Only in rather special situations do you need to use this facility,

but one is when you are learning to come to terms with the idea of class and generic functions.

Generic functions and classes will be discussed further in Section 10.9 [Object orientation],

page 48, but only briefly.

 

 

 

 

Importing and manipulating your data are important steps in the data science workflow. R allows for the import of different data formats using specific packages that can make your job easier:

·        readr for importing flat files

·        The readxl package for getting excel files into R

·        The haven package lets you import SAS, STATA and SPSS data files into R.

·        Databases: connect via packages like RMySQL and RPostgreSQL, and access and manipulate via DBI

·        rvest for webscraping

 

Once your data is available in your working environment you are ready to start manipulating it using these packages:

·        The tidyr package for tidying your data.

·        The stringr package for string manipulation.

·        For data frame like objects learn the ins and outs of the dplyr package

·        Need to perform heavy data wrangling tasks? Check out the data.table package

·        Performing time series analysis? Try out packages like zoo, xts and quantmod.

 

Let’s practice

 

# Get and print the current working directory. Relative paths such as
# "input.csv" below are resolved against it.
print(getwd())

# Reading a CSV file: read.csv() returns the contents as a data frame.
data <- read.csv("input.csv")
print(data)

# Analyzing the CSV file (the data frame read above is reused).
print(is.data.frame(data)) # TRUE when read.csv() produced a data frame
print(ncol(data))          # number of columns
print(nrow(data))          # number of rows

 

# Get the maximum salary.
# Create a data frame.
data <- read.csv("input.csv")

# Get the max salary from the data frame.
sal <- max(data$salary)
print(sal)

# Get the details of the person(s) having the max salary.
# Inside subset() the column name `salary` is evaluated within `data`.
retval <- subset(data, salary == max(salary))
print(retval)

 

# Get the persons in the IT department whose salary is greater than 600.
# `&` is the element-wise AND, so both conditions are tested row by row.
info <- subset(data, salary > 600 & dept == "IT")
print(info)

 

# Get the people who joined on or after 2014.
# `>=` keeps rows dated exactly 2014-01-01; a strict `>` would drop them.
retval <- subset(data, as.Date(start_date) >= as.Date("2014-01-01"))
print(retval)

 

Writing into a CSV File

R can create a CSV file from an existing data frame. The write.csv() function is used to create the CSV file. This file gets created in the working directory.

 

# Create a data frame.
data <- read.csv("input.csv")

# Keep the people who joined on or after 2014-01-01.
retval <- subset(data, as.Date(start_date) >= as.Date("2014-01-01"))

# Write filtered data into a new file. By default write.csv() also writes
# the row names, which show up as an extra first column when read back.
write.csv(retval, "output.csv")
newdata <- read.csv("output.csv")
print(newdata)

# Write the same data again, this time without the row names.
write.csv(retval, "output.csv", row.names = FALSE)
newdata <- read.csv("output.csv")
print(newdata)

 

 

# Verify the package is installed.
# installed.packages() returns a matrix whose row names are the package
# names; matching the exact name avoids false positives from other packages
# whose name or description merely contains "xlsx".
"xlsx" %in% rownames(installed.packages())

# Load the library into R workspace.
library("xlsx")

 

Input as XLSX file

Open Microsoft excel. Copy and paste the following data in the work sheet named as sheet1.

Also copy and paste the following data to another worksheet and rename this worksheet to “city”.

 

Save the Excel file as “input.xlsx”. You should save it in the current working directory of the R workspace.

 

Reading the Excel File

The input.xlsx is read by using the read.xlsx() function as shown below. The result is stored as a data frame in the R environment.

# Read the first worksheet in the file input.xlsx.
# sheetIndex selects the worksheet by position.
data <- read.xlsx("input.xlsx", sheetIndex = 1)
print(data)

 

Note: These examples are for 32 bit Windows

 

First, load the RODBC package (you’ll also have to install it if you don’t have it already).

 

# Load RODBC package

 library(RODBC)

 

Next, connect to the Access database. This code creates an object called “channel” that tells R where the Access database is.

 

If you paste the path from windows be sure to change every backslash to a forward slash.

Do not include the file extension (.accdb or .mdb) on the end of the name of the database.

 

# Connect to Access db
# The path uses forward slashes; "channel" is the connection handle that the
# later RODBC calls take as their first argument.
channel <- odbcConnectAccess("C:/Documents/Name_Of_My_Access_Database")

 

Finally, run a SQL query to return the data.

# Get data
# The SQL is a single string literal, so no paste() is needed.
data <- sqlQuery(channel, "select *
 from Name_of_table_in_my_database")

 

Return All Data from One Table

This example shows how to connect to a database in R, query the database DATABASE and return all of the data (specified using the * in SQL) from the table DATATABLE. The table is preceded by the database schema SCHEMA and separated from it by a period. Each of the words in all caps within the query needs to be replaced so that the query applies to your database.

# Load RODBC package
library(RODBC)

# Create a connection to the database called "channel"
# If you are using operating system authentication (the computer already
# knows who you are because you are logged into it) you can leave out the
# uid = "USERNAME" part.
channel <- odbcConnect("DATABASE", uid = "USERNAME", pwd = "PASSWORD",
                       believeNRows = FALSE)

# Check that connection is working (Optional)
odbcGetInfo(channel)

# Find out what tables are available (Optional)
Tables <- sqlTables(channel, schema = "SCHEMA")

# Query the database and put the results into the data frame "dataframe"
dataframe <- sqlQuery(channel, "
 SELECT *
 FROM
 SCHEMA.DATATABLE")

 

Return Only Specific Fields

Example shows how to connect to database in R and query the database DATABASE and pull only the specified fields from the table DATATABLE. Note that loading the RODBC package and creating a connection does not have to be repeated if they were done in the first example.

 

# Load RODBC package
library(RODBC)

# Create a connection to the database called "channel"
channel <- odbcConnect("DATABASE", uid = "USERNAME", pwd = "PASSWORD",
                       believeNRows = FALSE)

# Find out what fields are available in the table (Optional)
# as.data.frame coerces the column names into a data frame for easy viewing.
# NOTE(review): sqlFetch() retrieves the table just to read its column
# names; for large tables consider limiting the fetch - confirm against the
# RODBC documentation.
Columns <- as.data.frame(colnames(sqlFetch(channel, "SCHEMA.DATATABLE")))

# Query the database and put the results into the data frame "dataframe"
dataframe <- sqlQuery(channel, "
 SELECT SCHOOL,
 STUDENT_NAME
 FROM
 SCHEMA.DATATABLE")

 

Joining Two Tables and Returning Only Specific Fields and Records

 

# Load RODBC package
library(RODBC)

# Create a connection to the database called "channel"
channel <- odbcConnect("DATABASE", uid = "USERNAME", pwd = "PASSWORD",
                       believeNRows = FALSE)

# Query the database and put the results into the data frame "dataframe"
# The two tables are inner-joined on school year and school number/code;
# SQL string literals use straight single quotes inside the R string.
dataframe <- sqlQuery(channel, "
 SELECT
 DT.SCHOOL_YEAR,
 DTTWO.DISTRICT_NAME AS DISTRICT,
 DTTWO.SCHOOL_NAME AS SCHOOL,
 DT.GRADE_LEVEL AS GRADE,
 DT.ACTL_ATT_DAYS AS ACTUAL_DAYS,
 DT.POSS_ATT_DAYS AS POSSIBLE_DAYS
 FROM
 (SCHEMA.DATATABLE DT INNER JOIN SCHEMA.DATATABLE_TWO DTTWO
 ON (DT.SCHOOL_YEAR = DTTWO.SCHOOL_YEAR AND
 DT.SCHOOL_NUMBER = DTTWO.SCHOOL_CODE))
 WHERE
 DT.SCHOOL_YEAR = '2011-12' AND
 DTTWO.SCHOOL_NAME = 'Pine Tree Elementary School'")

 

Using a Parameter from R to Return Only Specific Records

 

# Load RODBC package
library(RODBC)

# Create a connection to the database called "channel"
channel <- odbcConnect("DATABASE", uid = "USERNAME", pwd = "PASSWORD",
                       believeNRows = FALSE)

# Parameter
YEARS <- c("2012", "2013", "2014")

# Build the quoted, comma-separated list for the IN clause:
# '2012', '2013', '2014'
# NOTE: the values are pasted straight into the SQL text, which is only safe
# because YEARS is a constant defined here, never user input.
year_list <- paste0("'", paste(YEARS, collapse = "', '"), "'")

# Query the database and put the results into the data frame "dataframe"
dataframe <- sqlQuery(channel, paste0("SELECT
 YEAR,
 SCHOOL_YEAR,
 DISTRICT_CODE,
 GRADE_LEVEL
 FROM SCHEMA.DATATABLE
 WHERE SCHEMA.DATATABLE.SCHOOL_YEAR IN (", year_list, ")
 "))

 

 

 

 

The basis of any analysis is to understand, evaluate and interpret complex results. Thus, it is imperative for an analyst to have a comprehensive understanding of the data under scrutiny and of the relationships among its variables. The simplest yet most powerful approach to gaining a better understanding of the data is to use graphical techniques. For example, if you are looking at an Excel spreadsheet of daily revenue data for a firm over a year, it is practically impossible to tell whether there is a particular trend or seasonality. But by just plotting the data as a line chart, you can easily see seasonality, trend, and average behavior in one shot. Let’s take the example of a scatter plot. A simple scatter plot not only shows the correlation between two variables but also shows linearity, non-linearity, and homogeneity in the data. More importantly, data visualization also helps in presenting results to higher management in a very simple manner. In this section we will explore various data visualization techniques using R.

 

For most of the plots in the next sub sections, we have used a dataset consisting of following metrics for Year 2010-2017 for a website.

·        Date     

·        Visits     

·        Page views

·        Unique Visitors

·        Bounce rate

 

Basic Visualization Techniques

1.      Histogram: Histogram is used to plot continuous variable. It breaks the data into bins (or breaks) and shows frequency distribution of these bins. Histograms are appropriate to understand underlying distribution.

R Code:

# Histogram of the Visits column. hist() returns the binning it used, which
# is kept in `h` so the counts can be written above the bars.
h <- hist(Data$Visits, # Vector of data to be plotted
          main = "Total Visits of a Web Site Per Year", # Title of the plot
          xlab = " Visits", # Title of the x-axis
          # xlim = c(15, 40), # limit on the x axis
          col = "palevioletred1", # Color of the bar to be filled
          border = "brown", # Color of the border around the bin
          freq = TRUE) # Plot frequencies (counts) rather than densities

# Put the count on each bar: h$mids are the bin mid-points and h$counts the
# bar heights; adj shifts each label so it sits just above its bar.
text(h$mids, h$counts, labels = h$counts, adj = c(0.5, -0.5))

Figure:

In a histogram, the area of the bar indicates the frequency of occurrences for each value. From the figure, it is found that visits in the range 1000000-1200000 occur three times, and the spread is greatest between 1000000 and 1200000. From the figure we can also say that there are no outliers in the data. The histogram shows that the data follows an irregular clustered distribution.

2.      Bar/Line chart:

Line: Line Charts are chosen to examine a trend spread over a period. Additionally, line plot is used to compare relative changes in quantities across some variable (like time). Line charts are typically used to analyze trend in a data. It can also be used to understand outliers and to check normality assumptions.

R Code:

# Line chart of Visits over Date with the lowest point marked as "Valley".
p <- plot_ly(Data, # Data frame
             x = ~Date, # x-axis data
             y = ~Visits) %>% # y-axis data
  add_lines() %>% # Add a line trace to the plotly visualization
  filter(Visits == min(Visits)) # Keep only the row(s) with the minimum

plotly_data(p) # Obtain the (filtered) data associated with the plotly graph
add_markers(p) # Add a marker trace for the filtered point(s)

# The annotation must use the same x column as the plot (Date).
layout(p, annotations = list(x = ~Date, y = ~Visits, text = "Valley")) %>%
  layout(title = "Total Visits of a Web Site per year",
         xaxis = list(title = "Date", showgrid = FALSE),
         yaxis = list(title = "Visits"),
         showlegend = FALSE)

 

 

Figure:

 

The above line chart shows the yearly visitors for a website from 2010 to 2016. It gives a fairly good idea that the number of visitors grew continuously up to 2015. In 2015 the total number of visitors was at its highest, and in 2016 it decreased by around 15%. The website’s visitor data follows a left-skewed distribution.
Bar: Bar Plots are used to compare cumulative totals across several groups.

R Code:

# Bar chart of Visits per Date.
plot_ly(Data, # Data frame
        type = "bar", # Type of chart
        x = ~Date, # x-axis data
        y = ~Visits, # y-axis data
        visible = TRUE, # Visibility of the plot
        showlegend = TRUE) %>% # Legend status
  layout(title = "Total Visits of a Web Site Per Year", # Title of the chart
         # list of x-axis properties
         xaxis = list(title = "Year", showgrid = TRUE, color = "red"),
         # list of y-axis properties
         yaxis = list(title = "Visits", showgrid = TRUE, color = "green"))

Figure:

 

The bar chart indicates the number of visitors for a website between the years 2010-2016. It can be seen that the number of visitors is increasing linearly up to 2015; however, it decreases in the year 2016.

3.      Box plot: A box plot is used for visualizing the spread of the data, deriving inferences accordingly, and identifying outliers.

R Code:

# Box plots of columns 2 to 4 of Data, one box per column.
boxplot(Data[, 2:4], # Specifying data
        # las sets the axis-label orientation (0-3): 1 = always horizontal,
        # 2 = always perpendicular to the axis.
        las = 1,
        col = c("sienna", "green"), # Box colors (recycled across boxes)
        main = "Total Visits and Pageviews of a Web Site Per Year") # Title

Figure:

The chart gives information about the spread of the data for Visitors, Page.views, and Unique visitors. The interquartile ranges for visitors, page views, and unique visitors are around 300000, 1100000 and 150000 respectively. That means the values for unique visitors are tightly bound. For visitors and unique visitors, the median lies very close to the upper quartile.

4.      Scatter plot: Scatter plot used to visualize data easily and for simple data inspection.

R Code:

# Scatter plot of Visits against Date.
plot_ly(Data, # Data frame
        type = "scatter", # Type of chart
        mode = "markers", # Draw points; set explicitly so plotly need not guess
        x = ~Date, # x-axis data
        y = ~Visits, # y-axis data
        visible = TRUE, # Visibility of the plot
        showlegend = TRUE) %>% # Legend status
  layout(title = "Total Visits of a Web Site Per Year",
         xaxis = list(title = "Date", showgrid = TRUE, color = "red"),
         yaxis = list(title = "Visits", showgrid = TRUE, color = "green"))

Figure:

The graph above shows the relationship between visitors, page views, unique visitors and bounce rate during 2010 to 2016. It is observed that, higher number of visitors to a website leads to lower bounce rate. However visitors, page views and unique visitors interrelated to each other.

 

Advanced Visualization Techniques

 

1.      Heat map- Heat maps used to do empirical data analysis with two dimensions as the axis and the third dimension shown by intensity of color.

R Code:

# Heat map of columns 18 to 21 of Data, converted to a matrix first;
# las = 2 is passed on to heatmap().
# NOTE(review): the dataset described above lists only five metrics - confirm
# that columns 18:21 exist in the Data object used here.
heatmap (as.matrix (Data[, 18:21]), las=2)

 

R Code:

# Clustered heat map of the whole data set using heatmap.2().
# NOTE(review): the first argument must be a numeric matrix; if Data still
# holds a non-numeric Date column, drop it before as.matrix() - confirm.
heatmap.2(as.matrix(Data), # numeric matrix of the values
          dendrogram = "row") # row dendrogram plotted and rows reordered

Figure:

 

The heat map gives the hierarchical clustering of visitors, unique visitors, page views and bounce rate. Initially, visitors and unique visitors together form a cluster because of their much similarity in their values. Then, bounce rate is clustered with the existing one, and finally, they clustered with page views.

2.      Mosaic plot- A mosaic plot can be used for plotting categorical data very effectively with the area of the data showing the relative proportions.

R Code:

# Mosaic plot of Visits against Page.views taken from Data.
mosaicplot(~ Visits + Page.views, # formula
           data = Data, # Data frame
           main = "Total Visits and Page views of a website per Year", # Title
           color = TRUE, # Color shading
           dir = "h", # Vector of split directions
           las = 2) # The style of axis labels

Figure:

In the mosaic plot, the data is split into different bars and shown the relationship between visitors, page views, unique visitors, and bounce rate. The mosaic plot is divided first into horizontal bars whose widths are proportional to the probabilities associated with the year. Then each bar is split vertically into bars that are proportional to the conditional probabilities of visitors, page views, unique visitors, and bounce rate. The colors represent the level of the residual/probability for that cell/combination of levels. 

3.      Map visualization-

a.     World map

R Code:

# World map restricted to a longitude/latitude window.
newmap <- getMap(resolution = "high") # Map stored in the package, high resolution

plot(newmap, # Map source
     xlim = c(10, 50), # Coordinates in x-direction
     ylim = c(0, 81), # Coordinates in y-direction
     asp = 1) # Aspect ratio

Figure:

b.     Plotting a location based on longitudes and latitudes

R Code:

# Interactive map with a single marker placed by longitude and latitude.
m <- leaflet() %>%
  addTiles() %>% # Add default Open Street Map tiles
  # Longitude and latitude of IIT Kharagpur
  addMarkers(lng = 87.3091, lat = 22.3145,
             popup = "The Indian institute of Technology Kharagpur")

m # Print the map

Figure:

 

4.      3D graphs- 

a.     Scatter plot

R Code:

# Static 3D scatter plot of Date (x), Visits (y) and Page.views (z).
# The residuals, axis.scales and ellipsoid arguments of the earlier version
# are not scatterplot3d() parameters and were dropped; the y and z axis
# titles now match the variables actually plotted on those axes.
scatterplot3d(x = Data$Date, # the x coordinates of points
              y = Data$Visits, # the y coordinates of points
              z = Data$Page.views, # the z coordinates of points
              bg = "black", # Background color
              grid = TRUE, # grid should be drawn on the plot or not
              main = "Total Visits of a Web Site Per Year", # Title of plot
              xlab = "Year", # Title of x-axis
              ylab = "Visits", # Title of y-axis
              zlab = "Page.Views") # Title of z-axis

 

 

 

Figure:

b.     Surface plot

R Code:

# Surface plot. The plot is piped into layout(); without the pipe, layout()
# would be called with no plot to modify.
# NOTE(review): z is the built-in `volcano` matrix while x and y come from
# Data - confirm the dimensions are meant to line up. For 3D traces plotly
# normally expects axis settings under `scene` - verify the titles render.
plot_ly(Data, # Data frame
        x = ~Date, # The x coordinates of points
        y = ~Visits, # The y coordinates of points
        z = volcano, # The z values (matrix of surface heights)
        type = "surface") %>% # Surface plot
  layout(title = "Total Visits of a Web Site Per Year", # Title of the plot
         # x-axis title and other properties
         xaxis = list(title = "Year", showgrid = TRUE, color = "red"),
         # y-axis title and other properties
         yaxis = list(title = "Visits", showgrid = TRUE, color = "green"))

Figure:

c.      Spinning scatter plot

R Code:

# Spinning 3D scatter plot: Year (coerced to numeric) on x, Visits on y and
# Page.views on z.
scatter3d(
  x = as.numeric(Data$Year), # The x coordinates of points
  y = Data$Visits, # The y coordinates of points
  z = Data$Page.views # The z coordinates of points
)

 

Figure:

 

 

5.      Correlogram – Correlogram used to visualize the data in correlation matrices.

R Code:

# Correlogram of the variables in Data.
corrgram(Data, # Data frame
         order = NULL, # Variables are not re-ordered
         panel = panel.shade, # To plot content of each panel
         text.panel = panel.txt, # Panel function used for the text panels
         main = "Correlogram between website Visits and Page views") # Title

 

Figure:

From the figure, we observed that there is a positive correlation between visitors, page views, and unique visitors. However, Bounce rate has a negative correlation with other three values.

 

 

 

 

To install a package, in the console, type: install.packages(“RGoogleAnalytics”) and hit enter.

# Install the package from the configured repository (run once per library).
install.packages("RGoogleAnalytics")

 

magrittr

 

A Forward-Pipe Operator for R: Provides a mechanism for chaining commands with a new forward-pipe operator, %>%. This operator will forward a value, or the result of an expression, into the next function call/expression. The magrittr is a package developed to give two main benefits: 1) to decrease development time, and 2) to improve readability and maintainability of code.

 

Below codes are based on the mtcars dataset provided in R.

Compare the codes with and without %>%.

library(magrittr)

# With the pipe: start from mtcars, keep the rows with hp > 100, print them.
# print() returns its argument, so the filtered rows also land in car_data.
car_data <- mtcars %>%
  subset(hp > 100) %>%
  print()

 

# Without the pipe: the same steps as the %>% version written as nested and
# sequential calls. The subset() step is required for the two versions to
# be equivalent.
car_data <- subset(mtcars, hp > 100)
print(car_data)

 

%>% changes the semantics of the code and makes it more intuitive to both read and write.

rvest

rvest is a package that makes it easy to scrape (or harvest) data from html web pages, inspired by libraries like beautiful soup. It is designed to work with magrittr so that you can express complex operations as elegant pipelines composed of simple, easily understood pieces. Install it with:

 

Test the rvest library: the code below gets the rating of the Titanic movie from IMDB.com (http://www.imdb.com/title/tt0120338/). Use SelectorGadget (refer to an online tutorial to learn about this plugin) to figure out which CSS selector matches the data we want. strong span is the CSS selector used to extract the rating.

library(rvest)

# Download and parse the page. read_html() replaces the html() function of
# early rvest versions, which is no longer available.
movie_link <- read_html("http://www.imdb.com/title/tt0120338/")

# Select the first node matching the CSS selector, take its text and convert
# it to a number.
movie_link %>%
  html_node("strong span") %>%
  html_text() %>%
  as.numeric()

 

RCurl

A wrapper for ‘libcurl’ <http://curl.haxx.se/libcurl/> Provides functions to allow one to compose general HTTP requests and provides convenient functions to fetch URIs, get & post forms, etc. and process the results returned by the Web server. This provides a great deal of control over the HTTP/FTP/… connection and the form of the request while providing a higher-level interface than is available just using R socket connections. Additionally, the underlying implementation is robust and extensive, supporting FTP/FTPS/TFTP (uploads and downloads), SSL/HTTPS, telnet, dict, ldap, and also supports cookies, redirects, authentication, etc.

 

library(RCurl)

# Amazon search: The Best American Short Stories of the Century
URL <- "https://www.amazon.com/Best-American-Short-Stories-2016/dp/0544582896/ref=sr_1_1?ie=UTF8&qid=1493919877&sr=8-1&keywords=The+Best+American+Short+Stories"

# Fetch the content behind the URL and print it.
html <- getURLContent(URL)
print(html)

 

gridExtra

Provides a number of user-level functions to work with “grid” graphics, notably to arrange multiple grid-based plots on a page, and draw tables.

 

Below is a sample example where we have mixed a few grobs and plots

 

library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)

# Two plots (one ggplot2, one lattice) and two grobs ...
p <- qplot(1, 1)
p2 <- xyplot(1 ~ 1)
r <- rectGrob(gp = gpar(fill = "grey90"))
t <- textGrob("text")

# ... arranged together on one page in two columns.
grid.arrange(t, p, p2, r, ncol = 2)

 

Other R Libraries Required in Data Visualization

 

These libraries are used in the examples shown under Data Visualization section

·        library (plotly): Plotly’s R graphing library makes interactive, publication-quality graphs online. Examples of how to make line plots, scatter plots, area charts, bar charts, error bars, box plots, histograms, heatmaps, subplots, multiple-axes, and 3D (WebGL based) charts.

·        library (ggplot2): A system for ‘declaratively’ creating graphics, based on “The Grammar of Graphics”. You provide the data, tell ‘ggplot2’ how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details.

·        library (RColorBrewer): Provides color schemes for maps (and other graphics) designed by Cynthia Brewer.

·        library (gplots): Various R programming tools for plotting data, including: – calculating and plotting locally smoothed summary function as (‘bandplot’, ‘wapply’), – and more. Refer the documentation.

·        library (vcd): Visualization techniques, data sets, summary and inference procedures aimed particularly at categorical data. Special emphasis is given to highly extensible grid graphics.

·        require (stats): This package contains functions for statistical calculations and random number generation.

·        library (maps): Package to display maps. Projection code and larger maps are in separate packages (‘mapproj’ and ‘mapdata’).

·        library (leaflet): Leaflet is one of the most popular open-source JavaScript libraries for interactive maps. This R package makes it easy to integrate and control Leaflet maps in R.

·        library (maptools): Tools for Reading and Handling Spatial Objects

·        library (rworldmap): Enables mapping of country level and gridded user datasets.

·        library (Rcmdr): A platform-independent basic-statistics GUI (graphical user interface) for R, based on the tcltk package.

·        library (rgl) – 3D Visualization Using OpenGL: Provides medium to high level functions for 3D interactive graphics, including functions modelled on base graphics (plot3d(), etc.) as well as functions for constructing representations of geometric objects (cube3d(), etc.). Output may be on screen using OpenGL, or to various standard 3D file formats including WebGL, PLY, OBJ, STL as well as 2D image formats, including PNG, Postscript, SVG, PGF.

·        library (scatterplot3d): Plots 3D Scatter Plot

·        library (corrgram): Calculates correlation of variables and displays the results graphically. Included panel functions can display points, shading, ellipses, and correlation values with confidence intervals.

·        library(markdown): ‘Markdown’ is a plain-text formatting syntax that can be converted to ‘XHTML’ or other formats.

·        library(shiny): Makes it incredibly easy to build interactive web applications with R. Automatic “reactive” binding between inputs and outputs and extensive prebuilt widgets make it possible to build beautiful, responsive, and powerful applications with minimal effort.

·        library (htmltools): Tools for HTML generation and output.

 

 

 

 

 

 

 

 

# R Program to Add Two Vectors
# Element-wise addition of numeric vectors, including recycling of the
# shorter operand. The expected console output follows each call.
x <- c(3, 6, 8)
print(x)
# [1] 3 6 8

y <- c(2, 9, 0)
print(y)
# [1] 2 9 0

print(x + y)
# [1]  5 15  8

print(x + 1) # 1 is recycled to (1, 1, 1)
# [1] 4 7 9

print(x + c(1, 4)) # (1, 4) is recycled to (1, 4, 1) but a warning is issued
# [1]  4 10  9
# Warning message:
# In x + c(1, 4) :
#   longer object length is not a multiple of shorter object length

 

 

# Find Sum, Mean and Product of Vector in R Programming
# The expected console output follows each call.
print(sum(2, 7, 5))
# [1] 14

x <- c(2, NA, 3, 1, 4)
print(x)
# [1]  2 NA  3  1  4

print(sum(x)) # if any element is NA or NaN, result is NA or NaN
# [1] NA

print(sum(x, na.rm = TRUE)) # this way we can ignore NA and NaN values
# [1] 10

print(mean(x, na.rm = TRUE))
# [1] 2.5

print(prod(x, na.rm = TRUE))
# [1] 24

 

 

# R Program to Take Input From User
# readline() always returns the typed text as a character string.
my.name <- readline(prompt = "Enter name: ")
my.age <- readline(prompt = "Enter age: ")

# convert character into integer (text that is not a number becomes NA)
my.age <- as.integer(my.age)

print(paste("Hi,", my.name, "next year you will be", my.age + 1, "years old."))

 

 

# R Program to Generate Random Number from Standard Distributions
# runif() draws from a uniform distribution. The values shown are one
# possible output; every run produces different numbers.
print(runif(1)) # generates 1 random number
# [1] 0.3984754

print(runif(3)) # generates 3 random numbers
# [1] 0.8090284 0.1797232 0.6803607

print(runif(3, min = 5, max = 10)) # define the range between 5 and 10
# [1] 7.099781 8.355461 5.173133

 

 

# R Program to Sample from a Population
x <- c(1, 3, 5, 7, 9, 11, 13, 15, 17)
print(x)
# [1]  1  3  5  7  9 11 13 15 17

# sample 2 items from x (drawn without replacement; the result varies)
print(sample(x, 2))
# [1] 13  9

 

 

# R Program to Find Minimum and Maximum
x <- c(5, 8, 3, 9, 2, 7, 4, 6, 10)
print(x)
# [1]  5  8  3  9  2  7  4  6 10

# find the minimum
print(min(x))
# [1] 2

# find the maximum
print(max(x))
# [1] 10

# find the range (minimum and maximum together)
print(range(x))
# [1]  2 10

 

 

#Find factors of a number

# Print every positive divisor of x, one per line.
# NOTE(review): x is not assigned in this snippet; it is assumed to already
# hold a single non-negative whole number -- confirm before running.
print(paste("The factors of", x, "are:"))

# seq_len() gives an empty sequence when x is 0, so the loop never evaluates
# x %% 0 (which is NaN and would make the if() fail)
for (i in seq_len(x)) {
  if ((x %% i) == 0) {
    print(i)
  }
}

 

 

# Program to check whether the input number is prime.
# The check is trial division: num is tested against every i in 2..(num - 1)
# and is reported as prime only if none of them divides it evenly.

# take input from the user
num <- as.integer(readline(prompt = "Enter a number: "))

# flag is 1 while num is still considered prime, 0 otherwise
flag <- 0

# prime numbers are greater than 1
if (num > 1) {
    # assume prime until a factor is found
    flag <- 1
    for (i in 2:(num - 1)) {
        if ((num %% i) == 0) {
            flag <- 0
            break
        }
    }
}

# For num == 2 the range 2:(num - 1) is 2:1, which counts down (2, 1), so the
# loop tests 2 %% 2 and clears the flag; restore it here
if (num == 2) flag <- 1

if (flag == 1) {
    print(paste(num, "is a prime number"))
} else {
    print(paste(num, "is not a prime number"))
}

 

 

 

# Odd/even check.
# Reads a whole number from the user and reports its parity: a remainder of 0
# after dividing by 2 means even, anything else means odd.

num <- as.integer(readline(prompt = "Enter a number: "))
print(paste(num, if ((num %% 2) == 0) "is Even" else "is Odd"))

 

 

 

# Sign classifier.
# Reads a number from the user and prints whether it is positive, zero or
# negative.

num <- as.double(readline(prompt = "Enter a number: "))

if (num > 0) {
    print("Positive number")
} else if (num == 0) {
    print("Zero")
} else {
    # neither above nor equal to zero, so it must be below zero
    print("Negative number")
}

 

 

 

# Factorial of a whole number read from the user.
num <- as.integer(readline(prompt = "Enter a number: "))

# running product; starts at 1 so the first multiplication is by 1
factorial <- 1

# negative input has no factorial, zero is a fixed special case,
# everything else is multiplied up from 1 to num
if (num < 0) {
    print("Sorry, factorial does not exist for negative numbers")
} else if (num == 0) {
    print("The factorial of 0 is 1")
} else {
    for (k in seq_len(num)) {
        factorial <- factorial * k
    }
    print(paste("The factorial of", num, "is", factorial))
}

 

 

 

# Multiplication table.
# Reads a whole number from the user and prints its products with 1..10,
# one line per multiplier.

num <- as.integer(readline(prompt = "Enter a number: "))

# seq_len(10) keeps the multiplier an integer, exactly like 1:10
for (multiplier in seq_len(10)) {
    table_row <- paste(num, "x", multiplier, "=", num * multiplier)
    print(table_row)
}

 

 

# Print the first `nterms` terms of the Fibonacci sequence, starting 0, 1.

# take input from the user
nterms <- as.integer(readline(prompt = "How many terms? "))

# first two terms
n1 <- 0
n2 <- 1
# number of terms accounted for once n1 and n2 have been printed
count <- 2

# check if the number of terms is valid; as.integer() yields NA for
# non-numeric input, which would otherwise make the comparison fail
if (is.na(nterms) || nterms <= 0) {
    print("Please enter a positive integer")
} else {
    print("Fibonacci sequence:")
    print(n1)
    if (nterms > 1) {
        print(n2)
        while (count < nterms) {
            nth <- n1 + n2
            print(nth)
            # slide the window: the last two terms become the new pair
            n1 <- n2
            n2 <- nth
            count <- count + 1
        }
    }
}

 

 

# Program to make a simple calculator
# that can add, subtract, multiply
# and divide using functions
 
# Sum of x and y.
add <- function(x, y) {
    x + y
}
 
# Difference x minus y.
subtract <- function(x, y) {
    x - y
}
 
# Product of x and y.
multiply <- function(x, y) {
    x * y
}
 
# Quotient x divided by y.
divide <- function(x, y) {
    x / y
}
 
# Show the operation menu, read the user's choice and two operands, then
# print the selected calculation.
print("Select operation.")
print("1.Add")
print("2.Subtract")
print("3.Multiply")
print("4.Divide")

choice <- as.integer(readline(prompt = "Enter choice[1/2/3/4]: "))

num1 <- as.integer(readline(prompt = "Enter first number: "))
num2 <- as.integer(readline(prompt = "Enter second number: "))

# switch() returns NULL for a choice outside 1..4 (or NA), and paste() drops
# NULL arguments, so an unchecked bad choice would print a malformed line
# such as "3 4 ="; reject it explicitly instead
if (is.na(choice) || choice < 1 || choice > 4) {
    print("Invalid choice: enter 1, 2, 3 or 4")
} else {
    # only the branch matching `choice` is evaluated
    operator <- switch(choice, "+", "-", "*", "/")
    result <- switch(choice,
        add(num1, num2),
        subtract(num1, num2),
        multiply(num1, num2),
        divide(num1, num2)
    )

    print(paste(num1, operator, num2, "=", result))
}
# Classify a single number by sign.
# Returns "Positive" when x is above zero, "Negative" when it is below zero,
# and "Zero" otherwise.
check <- function(x) {
   if (x > 0) {
       return("Positive")
   }
   if (x < 0) {
       return("Negative")
   }
   "Zero"
}

 

 

# Sum of the natural numbers from 1 up to a whole number read from the user,
# accumulated by counting the number down to zero.
num <- as.integer(readline(prompt = "Enter a number: "))

if (num < 0) {
    print("Enter a positive number")
} else {
    sum <- 0
    # add the current value, step down by one, stop once nothing is left
    repeat {
        if (num <= 0) {
            break
        }
        sum <- sum + num
        num <- num - 1
    }
    print(paste("The sum is", sum))
}

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 https://learn.swapnil.pw/