DATA ANALYSIS WITH R
DAY 1: 10 SEP 2022
# Day 1 walkthrough: printing values, the basic (atomic) data types, and the
# core data structures (vector, list, matrix, array, factor, data frame).
# All string literals use plain ASCII double quotes; typographic ("curly")
# quotes are not valid R syntax.

# Compiler
# Interpreter
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print(5 + 3)    # an expression: the sum is evaluated, then printed
print("5 + 3")  # a string: printed exactly as written
hello <- 5
print(hello)
print(4) # comment

# Data types - what is the data
# Basic data type: single value

# logical: TRUE / FALSE
# The next three lines deliberately show the three assignment spellings
# (`=`, `<-`, `->`); each one stores TRUE in var1.
var1 = TRUE # FALSE
var1 <- TRUE
TRUE -> var1
print(class(var1))

# integer: positive or negative numbers without decimal part
# (the L suffix is what makes the literal an integer)
var1 <- 3L
print(class(var1))

# numeric: can take decimal values
var1 <- 3.5
print(class(var1))

# character
var1 <- "HEllo"
print(class(var1))

# complex: square root of -1
var1 <- 5i # complex numbers are written with the imaginary unit i (iota)
print(var1 * var1)
print(class(var1))

# raw
print(charToRaw("l"))

#### Data structures

# vector: same type of values
hello <- 68
# "hello" below is a quoted string, not the variable assigned above. Mixing
# it with numbers makes every element of the vector a character string.
var1 <- c(34, 45, 67, "hello")
print(var1)
print(class(var1))

# lists: elements may have different types
var1 <- list(3, 5, "Hello", TRUE, c(2, 4, 8, 2, 4, 6, 8, 2, 4, 6, 8))
print(var1)

# cat() accepts several items and writes them on one line
cat("Hello", "there")
# print("Hello", "there")

# matrices: byrow controls whether the values fill row by row or column by column
mat1 <- matrix(c(1, 3, 5, 7, 9, 11, 13, 15, 18), nrow = 3, ncol = 3, byrow = TRUE)
print(mat1)
mat1 <- matrix(c(1, 3, 5, 7, 9, 11, 13, 15, 18), nrow = 3, ncol = 3, byrow = FALSE)
print(mat1)

# arrays: more than two dimensions
# NOTE: dim = c(2, 2, 2, 2) holds 2 * 2 * 2 * 2 = 16 values, so only the first
# 16 of the 18 supplied values end up in the array.
var1 <- array(
  c(1, 3, 5, 7, 9, 11, 13, 15, 18, 9, 11, 13, 15, 18, 21, 22, 25, 28),
  dim = c(2, 2, 2, 2)
)
print(var1)

# factor: categorical values stored with a fixed set of levels
color <- c(
  "Red", "Green", "Blue", "Green", "Blue",
  "Green", "Blue", "Green", "Blue", "Red"
)
color_f <- factor(color)
print(color_f)

# data frames: tabular data, one vector per column
employee <- data.frame(
  Name = c("Sachin", "Virat", "Rohit"),
  City = c("Mumbai", "Delhi", "Chennai"),
  Avg = c(113, 24, 85)
)
print(employee)
DAY 2: 11 SEP 2022
# Day 2 walkthrough: arithmetic, relational, logical and assignment
# operators, followed by if / else if / else and switch().
# All string literals use plain ASCII double quotes and the subtraction uses
# the ASCII minus sign; the typographic characters are not valid R syntax.

# Arithmetic operators (applied element by element to the two vectors)
v1 <- c(1, 3, 5, 7)
v2 <- c(2, 4, 6, 8)
print(v1 + v2)
print(v1 - v2)
print(v1 * v2)
print(v1 / v2)

# %% is for remainder
num <- 15
rem <- num %% 2
print(rem)

# integer division or quotient: %/%
qt <- 15 %/% 4
print(qt)

# 5 ^ 3 : cube power of 5
print(5^3)

# Relational operators: bigger / smaller relation - output is logical
var1 <- 55
var2 <- 66
print(var1 > var2) # is var1 greater than var2?
print(var1 < var2)
print(var1 >= var2)
print(var1 <= var2)
print(var1 == var2)
print(var1 != var2)

# Logical operators: input is logical and output is also logical
# prediction: Sachin and Laxman will open the batting
# actual: Sachin and Rahul opened the batting
# prediction: Sachin or Laxman will open the batting
# actual: Sachin and Rahul opened the batting
# & for and, | for or
# NOTE: the variable named c does not stop later c(...) calls from working,
# because R looks for a function when a name is used in call position.
a <- 5
b <- 6
c <- 7
print(a > b | b < c) # for OR - even 1 TRUE will make it TRUE
# T & T = T   F & F = F   T & F = F   F & T = F  (multiplication)
# T | T = T   F | F = F   T | F = T   F | T = T  (addition)
print(!TRUE)

# Assignment operators: every spelling is shown on purpose.
a = 5
a <- 5
# The double-arrow forms assign in an enclosing environment rather than the
# current one. Run at top level (as here) they create or update the same
# variable as <- would. Avoid them outside of demonstrations like this.
a <<- 5 # left assignment
# right assignment:
100 -> b
200 ->> b
c <- 6
b -> c
print(b)
print(c)

####################################################
## CONDITIONS
# if avg >= 90 I want to print Congratulations
avg <- 90
if (avg >= 90) {
  print("Congratulations")
}

avg <- 40
if (avg >= 50) {
  print("You have passed")
} else {
  print("Sorry, You have failed")
}

# if - else if - else
# avg >= 90: Grade A, >= 80: Grade B, >= 70: C, >= 60: D, >= 50: E, below 50: F
# val records which branch was taken (1 for A through 6 for F); it is used
# as the selector for switch() below.
avg <- 90
if (avg >= 90) {
  print("Grade A")
  val <- 1
} else if (avg >= 80) {
  print("Grade B")
  val <- 2
} else if (avg >= 70) {
  print("Grade C")
  val <- 3
} else if (avg >= 60) {
  print("Grade D")
  val <- 4
} else if (avg >= 50) {
  print("Grade E")
  val <- 5
} else {
  print("Grade F")
  val <- 6
}

## switch
# switch(expression, case1, case2, ...)
# With a numeric selector, switch() returns the alternative at that position.
# Only four alternatives are listed, so val of 5 or 6 yields NULL.
result <- switch(
  val,
  "Hello",
  "How are you?",
  "Where are you?",
  "Hows going?"
)
print(result)

# loops - repeat a block
## repeat: exit check
## while : entry check
## for   : when we know how many times to repeat
TABLE OF CONTENTS
Unit 1: Getting Started with R.. 2
Unit 3: VECTORS, LISTS, ARRAYS & MATRICES. 19
Unit 4: Working with Files. 45
Unit 5: Working with MSAccess Database. 48
Unit 6: Working with Graphs. 51
Unit 7: Overview of R Packages. 64
Unit 8: Programming Examples. 68
Unit 1: Getting
Started with R
R is a free
software environment for statistical computing and graphics. It compiles and
runs on a wide variety of UNIX platforms, Windows and MacOS. Why
R? It’s free, open source, powerful and highly extensible. “You
have a lot of prepackaged stuff that’s already available, so you’re standing on
the shoulders of giants,” Google’s chief economist told The New York Times
back in 2009. There can be little doubt that interest in the R statistics
language, especially for data analysis, is soaring.
Downloading R
The primary R
system is available from the Comprehensive R Archive Network, also known as
CRAN. CRAN also hosts many add-on packages that can be used to extend the
functionality of R. The “base” R system that you download from CRAN: Linux,
Windows, Mac, Source Code
Website to
download: https://cran.r-project.org/mirrors.html
The R Foundation for Statistical Computing
The R
Foundation is a not-for-profit organization working in the public interest. It
was founded by the members of the R Development Core Team in order to:
·
Provide
support for the R project and other innovations in statistical computing. We
believe that R has become a mature and valuable tool and we would like to
ensure its continued development and the development of future innovations in
software for statistical and computational research.
·
Provide a
reference point for individuals, institutions or commercial enterprises that
want to support or interact with the R development community.
·
Hold and
administer the copyright of R software and documentation.
R functionality is divided into a number of
packages:
·
The “base” R system contains,
among other things, the base package which is required to run R and contains
the most fundamental functions.
·
The other packages contained in
the “base” system include utils, stats, datasets,
graphics, grDevices, grid, methods, tools, parallel, compiler, splines, tcltk,
stats4.
·
There are also “Recommended”
packages: boot, class, cluster, codetools, foreign,
KernSmooth, lattice, mgcv, nlme, rpart, survival, MASS, spatial, nnet, Matrix.
When you download a fresh installation of R
from CRAN, you get all of the above, which represents a substantial amount of
functionality. However, there are many other packages available:
·
There are over 4000 packages on
CRAN that have been developed by users and programmers around the world.
·
People often make packages
available on their personal websites; there is no reliable way to keep track of
how many packages are available in this fashion.
·
There are a number of packages
being developed on repositories like GitHub and BitBucket but there is no
reliable listing of all these packages.
More details
can be found at the R foundation website: https://www.r-project.org/
Let’s create our first R Program
Launch R. In
Windows you can launch R software using the option shown below under Program
Files.
Figure 1: Launch R Programming Window
After launching R interpreter, you will get
a prompt > where you can start typing your
Program. Let’s try our first program:
In the Hello World code below, vString is a variable which stores the String value “Hello World” and in
the next line we print the value of the vString variable. Please note that R commands are case sensitive: print (all lowercase) is the valid command to print the value on the screen.
Figure 2: Hello World
# is the syntax used to write comments in
the program
Figure 3: R Programming
R Basic
Syntax
Download
and Install R software
When R is run, this will launch R
interpreter. You will get a prompt where you can start typing your programs as
follows:
Here first statement defines a string
variable myString, where we assign a string “Hello, World!” and then
next statement print() is being used to print the value stored in variable
myString.
R Script File
Usually, you will do your programming by
writing your programs in script files and then you execute those scripts at
your command prompt with the help of R interpreter called Rscript. So let’s
start with writing following code in a text file called test.R as under:
Save the above code in a file test.R and
execute it at Linux command prompt as given below. Even if you are using
Windows or other system, syntax will remain same.
For windows, go to command prompt and
browse to the directory where R.exe/Rscript.exe is installed.
Run-> Rscript filename.R (filename.R is the name of the file which
has R program along with the path name.)
We
will use RStudio for rest of our course example. Download and install R Studio.
Generally, while doing programming in any
programming language, you need to use various variables to store information.
Variables are nothing but reserved memory locations to store values. This means
that, when you create a variable you reserve some space in memory. In contrast
to other programming languages like C and Java, in R the variables are not declared
as some data type. The variables are assigned with R-Objects and the data type
of the R-object becomes the data type of the variable.
R has five basic or “atomic” classes of
objects:
·
character
·
numeric (real numbers)
·
integer
·
complex
·
logical (True/False)
The frequently used ones are:
Vectors |
Lists |
Matrices |
Arrays |
Factors |
Data Frames |
The simplest of these objects is the vector
object and there are six data types of these atomic vectors, also termed as six
classes of vectors. The other R-Objects are built upon the atomic vectors.
Figure 4: Data Types in R
Creating
Vectors
The c() function can be used to create vectors of objects by concatenating
things together. When you want to create
vector with more than one element, you should use c() function which means to combine the elements into a vector. You can
also use the vector() function to
initialize vectors.
Figure 5: Vector example
Lists,
Matrices, Arrays
A list is an R-object which can
contain many different types of elements inside it like vectors, functions and
even another list inside it.
A matrix is a two-dimensional
rectangular data set. It can be created using a vector input to the matrix
function.
While matrices are confined to two
dimensions, arrays can be of any number of dimensions. The array function
takes a dim attribute which creates the required number of dimension. In the
below example we create an array with two elements which are 3×3 matrices each.
Factors
Factors are used to represent categorical
data and can be unordered or ordered. One can think of a factor as an integer
vector where each integer has a label. Factors are important in statistical
modeling and are treated specially by modelling functions like lm() and glm().
Using factors with labels is better than using integers because factors are
self-describing. Having a variable that has values “Male” and “Female” is
better than a variable that has values 1 and 2. Factor objects can be created
with the factor() function.
Figure 6: List, Matrix and Array example
Figure 7: Factors example
Data
Frames
Data frames are tabular data objects.
Unlike a matrix in data frame each column can contain different modes of data.
The first column can be numeric while the second column can be character and
third column can be logical. It is a list of vectors of equal length. Data
Frames are created using the data.frame() function.
Figure 8: Data frames example
Mixing
Objects
There are occasions when different classes
of R objects get mixed together. Sometimes this happens by accident but it can
also happen on purpose. In implicit coercion, what R tries to do is find a way
to represent all of the objects in the vector in a reasonable fashion.
Sometimes this does exactly what you want and sometimes not. For example,
combining a numeric object with a character object will create a character
vector, because numbers can usually be easily represented as strings.
Figure 9: Mixing and Missing Objects examples
We have the following types of operators in
R programming:
·
Arithmetic Operators
·
Relational Operators
·
Logical Operators
·
Assignment Operators
·
Miscellaneous Operators
Arithmetic
Operators
Figure 10: Assignment Operators
Relational
Operators
Operators |
Meaning |
> |
Checks
if each element of the first vector is greater than the corresponding element
of the second vector. |
< |
Checks
if each element of the first vector is less than the corresponding element of
the second vector. |
== |
Checks
if each element of the first vector is equal to the corresponding element of
the second vector. |
<= |
Checks
if each element of the first vector is less than or equal to the
corresponding element of the second vector. |
>= |
Checks
if each element of the first vector is greater than or equal to the corresponding
element of the second vector. |
!= |
Checks
if each element of the first vector is unequal to the corresponding element
of the second vector. |
Logical
Operators
Operators |
Meaning |
& |
It
is called Element-wise Logical AND operator. It combines each element of the
first vector with the corresponding element of the second vector and gives an
output TRUE if both the elements are TRUE. |
| |
It
is called Element-wise Logical OR operator. It combines each element of the
first vector with the corresponding element of the second vector and gives an
output TRUE if one of the elements is TRUE. |
! |
It
is called Logical NOT operator. Takes each element of the vector and gives
the opposite logical value. |
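The
logical operators && (logical AND) and || (logical OR) work on single values
and give a vector of single element as output. Older versions of R silently
used only the first element of longer vectors; since R 4.3.0, passing a
vector of length greater than one is an error. |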
Readers are encouraged to practice all the
operators and see the output.
Assignment
Operators
A variable in R can store an atomic
vector, group of atomic vectors or a combination of many R objects. The
variables can be assigned values using leftward, rightward and equal to
operator. The values of the variables can be printed using print() or cat() function. The cat() function combines multiple items into a continuous print output.
In R, a variable itself is not declared of
any data type, rather it gets the data type of the R -object assigned to it. So
R is called a dynamically typed language, which means that we can change a
variable’s data type of the same variable again and again when using it in a
program.
Figure 11: Variable assignment
Figure 12: Listing and deleting variables
Miscellaneous
Operators
Operators |
Meaning |
: |
Colon
operator. It creates the series of numbers in sequence for a vector. |
%in% |
This
operator is used to identify if an element belongs to a vector. |
%*% |
This
operator performs matrix multiplication, for example multiplying a matrix by its transpose. |
R provides the following types of decision
making statements:
Statement |
Description |
If
statement |
An
if statement consists of a Boolean expression followed by one or more
statements. |
If
else statement |
An
if statement can be followed by an optional else statement, which executes
when the Boolean expression is false. |
Switch
statement |
A
switch statement allows a variable to be tested for equality against a list
of values. |
Figure 13: Example of If Statement
Figure 14: Example of If Else Statement
Multiple if
else
An if statement can be followed by an
optional else if…else statement, which is very
useful to test various conditions using
single if…else if statement.
Syntax
When using if, else if, else statements
there are few points to keep in mind.
·
An if can have zero or one else
and it must come after any else if’s.
·
An if can have zero to many
else if’s and they must come before the else.
·
Once an else if succeeds, none
of the remaining else if’s or else’s will be tested.
SWITCH
statement
A switch statement allows a variable to be
tested for equality against a list of values. Each value is called a case, and
the variable being switched on is checked for each case.
Syntax
The following rules apply to a switch
statement:
·
If the value of expression is
not a character string it is coerced to integer.
·
You can have any number of case
statements within a switch. Each case is followed by the value to be compared
to and a colon.
·
If the value of the integer is
between 1 and nargs()-1 (The max number of arguments) then the corresponding
element of case condition is evaluated and the
result returned.
·
If expression evaluates to a
character string then that string is matched (exactly) to the names of the
elements.
·
If there is more than one
match, the first matching element is returned.
·
No Default argument is
available.
·
In the case of no match, if
there is an unnamed element of … its value is returned. (If there is more than
one such argument an error is returned.)
Loops are used to repeat a block of code.
Being able to have your program repeatedly execute a block of code is one of
the most basic but useful tasks in programming- a loop lets you write a very
simple statement to produce a significantly greater result simply by
repetition. R programming language provides the following kinds of loop to
handle looping requirements:
Loop Type |
Description |
REPEAT
loop |
Executes
a sequence of statements again and again until a break statement stops
the loop. |
WHILE
loop |
Repeats
a statement or group of statements while a given condition is true. It tests
the condition before executing the loop body. |
FOR
loop |
It
executes a block of statements once for each element of a vector or
sequence. |
Loop
Control Statements
Control Type |
Description |
BREAK
statement |
Terminates
the loop statement and transfers execution to the statement immediately
following the loop. |
NEXT
statement |
The
next statement skips the rest of the current iteration and continues with
the next iteration of the loop. |
REPEAT – loop
The Repeat loop executes the same code
again and again until a stop condition is met.
Syntax: Example:
WHILE – loop
The While loop executes the same code again
and again until a stop condition is met.
Syntax: Example:
FOR – loop
A for loop is a repetition control
structure that allows you to efficiently write a loop that needs to execute a
specific number of times.
Syntax: Example:
Any value written within a pair of single
quote or double quotes in R is treated as a string. Internally R stores every
string within double quotes, even when you create them with single quote.
Rules Applied in String Construction
·
The quotes at the beginning and
end of a string should be both double quotes or both single quote. They can not
be mixed.
·
Double quotes can be inserted
into a string starting and ending with single quote.
·
Single quote can be inserted
into a string starting and ending with double quotes.
·
Double quotes can not be
inserted into a string starting and ending with double quotes.
·
Single quote can not be
inserted into a string starting and ending with single quote.
Examples of Strings in R
Formatting numbers & strings – format() function
Numbers and strings can be formatted to a
specific style using format()function.
Syntax – The basic syntax for format
function is :
Following is the description of the
parameters used:
·
x is the vector input.
·
digits is the total number of
digits displayed.
·
nsmall is the minimum number of
digits to the right of the decimal point.
·
scientific is set to TRUE to
display scientific notation.
·
width indicates the minimum
width to be displayed by padding blanks in the beginning.
·
justify is the display of the
string to left, right or center.
Other functions
Functions |
Functionality |
nchar(x) |
This
function counts the number of characters including spaces in a string. |
toupper(x)
/ tolower(x) |
These
functions change the case of characters of a string. |
substring(x,first,last) |
This
function extracts parts of a String. |
A function is
a set of statements organized together to perform a specific task. R has a
large number of in-built functions and the user can create their own functions.
The different
parts of a function are:
·
Function Name: This is the
actual name of the function. It is stored in R environment as an object with
this name.
·
Arguments: An argument is a
placeholder. When a function is invoked, you pass a value to the argument.
Arguments are optional; that is, a function may contain no arguments. Also
arguments can have default values.
·
Function Body: The function
body contains a collection of statements that defines what the function does.
·
Return Value: The return value
of a function is the last expression in the function body to be evaluated.
R has many in-built functions which can be
directly called in the program without defining them first. Simple examples of
in-built functions are seq(), mean(), max(), sum(x)and paste(…) etc.
We can also create and use our own
functions referred as user defined functions. An R function is created by using
the keyword function. The basic syntax of an R function definition is as
follows:
Example: Calling a function with argument
values (by position and by name)
Example: Calling a function with default
values
Lazy Evaluation of Function: Arguments to functions are evaluated lazily, which means they
are evaluated only when needed by the function body.
Vectors are the most basic R data objects
and there are six types of atomic vectors. They are logical, integer, double,
complex, character and raw. Even when you write just one value in R, it becomes
a vector of length 1 and belongs to one of the above vector types.
# Atomic vector of type character. print(“ABC”); [1]
“ABC” |
# Atomic vector of type double. print(12.5) [1] 12.5 |
# Atomic vector of type integer. print(10L) [1] 10 |
# Atomic vector of type logical. print(TRUE) [1] TRUE |
# Atomic vector of type complex. print(4+8i) [1] 4+8i |
# Atomic vector of type raw. print(charToRaw(‘hello’)) [1] 68 65 6c
6c 6f |
Multiple Elements Vector
Using colon operator with numeric data
# Creating a sequence from 2 to 8. v <- 2:8 print(v) [1] 2 3 4 5 6 7
8 |
# Creating a sequence from 6.6 to 12.6. v <- 6.6:12.6 print(v) [1] 6.6 7.6
8.6 9.6 10.6 11.6 12.6 |
# If the final element specified does not belong to the
sequence then it is discarded. v <- 3.8:11.4 print(v) [1] 3.8 4.8
5.8 6.8 7.8 8.8 9.8 10.8 |
Using sequence
(Seq.) operator
Syntax and example of using Seq. operator:
# # Create vector with elements from 5 to 9 incrementing
by 0.4. print (seq(5, 9, by=0.4)) [1] 5.0 5.4
5.8 6.2 6.6 7.0 7.4 7.8 8.2 8.6 9.0 |
Using the c () function
The non-character values are coerced to character type if one of the
elements is a char.
Syntax and example of using c() function:
## The logical and
numeric values are converted to characters. x <- c(‘apple’, ‘red’, 5, TRUE) print(x) [1]
“apple” “red” “5” “TRUE” |
Accessing Vector
Elements
Elements of a Vector are accessed using indexing. The [ ] brackets
are used for indexing. Indexing starts with position 1. Giving a negative value
in the index drops that element from result. TRUE, FALSE or 0 and 1 can also be
used for indexing.
Syntax and example:
# Accessing vector elements using position. t <-
c(“Sun”,”Mon”,”Tue”,”Wed”,”Thurs”,”Fri”,”Sat”) u <- t[c(2,3,6)] print(u) [1]
“Mon” “Tue” “Fri”
# Accessing vector elements using logical indexing. v <- t[c(TRUE,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE)] print(v) [1]
“Sun” “Fri”
# Accessing vector elements using negative indexing. x <- t[c(-2,-5)] print(x) [1]
“Sun” “Tue” “Wed” “Fri”
“Sat”
# Accessing vector elements using 0/1 indexing. y <- t[c(0,0,0,0,0,0,1)] print(y) [1]
“Sun” |
Vector Manipulation
Vector Arithmetic- Two vectors of same length can be added,
subtracted, multiplied or divided giving the result as a vector output.
Syntax and example:
# Create two vectors. v1 <- c(3,8,4,5,0,11) v2 <- c(4,11,0,8,1,2)
# Vector addition. add.result <- v1+v2 print(add.result) [1] 7 19 4 13
1 13
# Vector subtraction. sub.result <- v1-v2 print(sub.result) [1] -1 -3 4 -3
-1 9
# Vector multiplication. multi.result <- v1*v2 print(multi.result) [1] 12 88 0 40
0 22
# Vector division. divi.result <- v1/v2 print(divi.result) [1] 0.7500000
0.7272727 Inf 0.6250000 0.0000000 5.5000000 |
Vector Element
Recycling
If we apply arithmetic operations to two vectors of unequal length,
then the elements of the shorter vector are recycled to complete the operations.
Syntax and example:
v1 <- c(3,8,4,5,0,11) v2 <- c(4,11) # V2 becomes c(4,11,4,11,4,11) add.result <- v1+v2 print(add.result) [1] 7 19 8 16
4 22
sub.result <- v1-v2 print(sub.result) [1] -1 -3 0 -6
-4 0 |
Vector Element
Sorting
Elements in a vector can be sorted using the sort() function.
Syntax and example:
v <- c(3,8,4,5,0,11, -9, 304) # Sort the elements of the vector. sort.result <- sort(v) print(sort.result) [1] -9 0 3 4 5
8 11 304
# Sort the elements in the reverse order. revsort.result <- sort(v, decreasing = TRUE) print(revsort.result) [1] 304 11 8 5
4 3 0 -9
# Sorting character vectors. v <-
c(“Red”,”Blue”,”yellow”,”violet”) sort.result <- sort(v) print(sort.result) [1]
“Blue” “Red” “violet” “yellow”
# Sorting character vectors in reverse order. revsort.result <- sort(v, decreasing = TRUE) print(revsort.result) [1]
“yellow” “violet” “Red” “Blue” |
Lists are the R objects which contain
elements of different types like – numbers, strings, vectors and another list
inside it. A list can also contain a matrix or a function as its elements. List
is created using list() function.
Syntax and example:
## Create a list containing strings, numbers, vectors and
a logical values. list_data <- list(“Red”, “Green”,
c(21,32,11), TRUE, 51.23, 119.1) print(list_data)
[[1]] [1]
“Red” [[2]] [1]
“Green” [[3]] [1] 21 32 11 [[4]] [1] TRUE [[5]] [1] 51.23 [[6]] [1] 119.1 |
Naming List Elements
The list elements can be given names and
they can be accessed using these names.
Manipulating List Elements
We can add, delete and update list elements
as shown below. We can add and delete elements only at the end of a list. But
we can update any element.
Merging Lists
You can merge many lists into one list by
placing all the lists inside one list() function.
Converting Lists to Vector
A list can be converted to a vector so that
the elements of the vector can be used for further manipulation. All the
arithmetic operations on vectors can be applied after the list is converted
into vectors. To do this conversion, we use the unlist() function. It takes the list as input and produces a
vector.
|
Matrices are the R objects in which the
elements are arranged in a two-dimensional
format. They contain elements of the same
atomic types. But we use matrices containing numeric elements to be used in
mathematical calculations. A Matrix is created using the matrix() function.
Syntax
Parameters used:
·
data is the input vector which
becomes the data elements of the matrix.
·
nrow is the number of rows to
be created.
·
ncol is the number of columns
to be created.
·
byrow is a logical clue. If
TRUE then the input vector elements are arranged by row.
·
dimnames is the names assigned
to the rows and columns.
# Elements are
arranged sequentially by row. M <-
matrix(c(3:14), nrow=4, byrow=TRUE) print(M) # Elements are
arranged sequentially by column. N <-
matrix(c(3:14), nrow=4, byrow=FALSE) print(N) # Define the
column and row names. rownames =
c(“row1”, “row2”, “row3”, “row4”) colnames =
c(“col1”, “col2”, “col3”)
# Accessing
Elements of a Matrix # Access the
element at 3rd column and 1st row. print(N[1,3]) # Access the
element at 2nd column and 4th row. print(N[4,2])
# Access only
the 2nd row. print(N[2,]) # Access only
the 3rd column. print(N[,3]) |
Matrix
Computations
Various mathematical operations are
performed on the matrices using the R operators. The result of the operation is
also a matrix. The dimensions (number of rows and columns) should be same for
the matrices involved in the operation.
# Create two
2×3 matrices. matrix1 <-
matrix(c(3, 9, -1, 4, 2, 6), nrow=2) print(matrix1) matrix2 <-
matrix(c(5, 2, 0, 9, 3, 4), nrow=2) print(matrix2) # Add the
matrices. result <-
matrix1 + matrix2 cat(“Result
of addition”,”\n”) print(result) # Subtract
the matrices result <-
matrix1 - matrix2 cat(“Result of
subtraction”,”\n”) print(result) |
Matrix
Multiplication & Division
# Create two
2×3 matrices. matrix1 <-
matrix(c(3, 9, -1, 4, 2, 6), nrow=2) print(matrix1) matrix2 <-
matrix(c(5, 2, 0, 9, 3, 4), nrow=2) print(matrix2) # Multiply the
matrices. result <-
matrix1 * matrix2 cat(“Result
of multiplication”,”\n”) print(result) # Divide the
matrices result <-
matrix1 / matrix2 cat(“Result
of division”,”\n”) print(result) |
Arrays are the R data objects which can
store data in more than two dimensions. For example – If we create an array of
dimension (2, 3, 4) then it creates 4 rectangular matrices each with 2 rows and
3 columns. Arrays can store only one data type. An array is created using the
array() function. It takes vectors as input and uses the values in the dim
parameter to create an array.
# Create two
vectors of different lengths. vector1 <-
c(5,9,3) vector2 <-
c(10,11,12,13,14,15) # Take these
vectors as input to the array. result <-
array(c(vector1,vector2),dim=c(3,3,2)) print(result) |
Naming Columns and Rows: We can give names
to the rows, columns and matrices in the array by using the dimnames parameter.
# Create two vectors of
different lengths. vector1 <- c(5,9,3) vector2 <-
c(10,11,12,13,14,15) column.names <-
c("COL1","COL2","COL3") row.names <-
c("ROW1","ROW2","ROW3") matrix.names <-
c("Matrix1","Matrix2") # Take these vectors as input to
the array. result <-
array(c(vector1,vector2),dim=c(3,3,2),dimnames =
list(row.names,column.names,matrix.names)) print(result) |
Accessing Array Elements
# Create two
vectors of different lengths. vector1 <-
c(5,9,3) vector2 <-
c(10,11,12,13,14,15) column.names
<- c("COL1","COL2","COL3") row.names <-
c("ROW1","ROW2","ROW3") matrix.names
<- c("Matrix1","Matrix2") # Take these
vectors as input to the array. result <-
array(c(vector1,vector2),dim=c(3,3,2),dimnames =
list(row.names,column.names,matrix.names)) # Print the
third row of the second matrix of the array. print(result[3,,2]) # Print the
element in the 1st row and 3rd column of the 1st matrix. print(result[1,3,1]) # Print the
2nd Matrix. print(result[,,2]) |
Manipulating Array Elements
As array is made
up of matrices in multiple dimensions, the operations on elements of array are
carried out by accessing elements of the matrices.
# Create two
vectors of different lengths. vector1 <-
c(5,9,3) vector2 <-
c(10,11,12,13,14,15) # Take these
vectors as input to the array. array1 <-
array(c(vector1,vector2),dim=c(3,3,2)) # Create two
vectors of different lengths. vector3 <-
c(9,1,0) vector4 <-
c(6,0,11,3,14,1,2,6,9) array2 <-
array(c(vector3,vector4),dim=c(3,3,2)) # create
matrices from these arrays. matrix1 <-
array1[,,2] matrix2 <-
array2[,,2] # Add the
matrices. result <-
matrix1+matrix2 print(result) |
Calculations
Across Array Elements: We can do calculations
across the elements in an array using the apply() function.
Syntax
Parameters used:
·
x is an array.
·
margin is a vector giving the
dimensions (subscripts) over which the function is applied: 1 for rows, 2 for columns.
·
fun is the function to be
applied across the elements of the array.
We use the apply() function below to
calculate the sum of the elements in the rows of an array across all the
matrices.
# Create two
vectors of different lengths. vector1 <-
c(5,9,3) vector2 <-
c(10,11,12,13,14,15) # Take these
vectors as input to the array. new.array
<- array(c(vector1,vector2),dim=c(3,3,2)) print(new.array) # Use apply to
calculate the sum of the rows across all the matrices. result <-
apply(new.array, c(1), sum) print(result) |
Array indexing. Subsections of an array
Individual elements of an array may be
referenced by giving the name of the array followed by
the subscripts in square brackets,
separated by commas.
More generally, subsections of an array may
be specified by giving a sequence of index vectors
in place of subscripts; however if any
index position is given an empty index vector, then the full
range of that subscript is taken.
Continuing the previous example, a[2,,] is
a 4 × 2 array with dimension vector c(4,2) and
data vector containing the values
c(a[2,1,1], a[2,2,1], a[2,3,1], a[2,4,1],
a[2,1,2], a[2,2,2], a[2,3,2], a[2,4,2])
in that order. a[,,] stands for the entire
array, which is the same as omitting the subscripts
entirely and using a alone.
For any array, say Z, the dimension vector
may be referenced explicitly as dim(Z) (on either
side of an assignment).
Also, if an array name is given with just
one subscript or index vector, then the corresponding
values of the data vector only are used; in
this case the dimension vector is ignored. This is not
the case, however, if the single index is
not a vector but itself an array, as we next discuss.
Factors are the data objects which are used
to categorize the data and store it as levels. They can store both strings and
integers. They are useful in the columns which have a limited number of unique
values, such as “Male”, “Female” and True, False etc. They are
useful in data analysis for statistical modeling.
A factor is a vector object used to specify
a discrete classification (grouping) of the components
of other vectors of the same length. R
provides both ordered and unordered factors. While the
“real” application of factors is with model
formulae (see Section 11.1.1 [Contrasts], page 53), we
here look at a specific example.
4.1 A specific example
Suppose, for example, we have a sample of
30 tax accountants from all the states and territories
of Australia and their individual state of
origin is specified by a character vector of state
mnemonics as
> state <- c(“tas”,
“sa”, “qld”, “nsw”, “nsw”,
“nt”, “wa”, “wa”,
“qld”, “vic”,
“nsw”, “vic”, “qld”, “qld”,
“sa”, “tas”,
“sa”, “nt”,
“wa”, “vic”, “qld”, “nsw”,
“nsw”, “wa”,
“sa”, “act”,
“nsw”, “vic”, “vic”, “act”)
Notice that in the case of a character
vector, “sorted” means sorted in alphabetical order.
A factor is similarly created using the
factor() function:
> statef <- factor(state)
The print() function handles factors
slightly differently from other objects:
> statef
[1] tas sa qld nsw nsw nt wa wa qld vic nsw
vic qld qld sa
[16] tas sa nt wa vic qld nsw nsw wa sa act
nsw vic vic act
Levels: act nsw nt qld sa tas vic wa
To find out the levels of a factor the
function levels() can be used.
> levels(statef)
[1] “act” “nsw”
“nt” “qld” “sa” “tas” “vic”
“wa”
4.2 The function tapply() and ragged arrays
To continue the previous example, suppose
we have the incomes of the same tax accountants in
another vector (in suitably large units of
money)
> incomes <- c(60, 49, 40, 61, 64,
60, 59, 54, 62, 69, 70, 42, 56,
61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48,
52, 46,
59, 46, 58, 43)
To calculate the sample mean income for
each state we can now use the special function
tapply():
> incmeans <- tapply(incomes, statef,
mean)
giving a means vector with the components
labelled by the levels
act nsw nt qld sa tas vic wa
44.500 57.333 55.500 53.600 55.000 60.500
56.000 52.250
The function tapply() is used to apply a
function, here mean(), to each group of components
of the first argument, here incomes,
defined by the levels of the second component, here statef, as if they were
separate vector structures. The result is a structure of the same length as the
levels attribute of the factor containing
the results. The reader should consult the help document
for more details.
Suppose further we needed to calculate the
standard errors of the state income means. To do
this we need to write an R function to
calculate the standard error for any given vector. Since
there is a builtin function var() to
calculate the sample variance, such a function is a very
simple one liner, specified by the
assignment:
> stdError <- function(x)
sqrt(var(x)/length(x))
(Writing functions will be considered later
in Chapter 10 [Writing your own functions], page 42.
Note that R’s builtin function sd() is
something different.) After this assignment, the standard
errors are calculated by
> incster <- tapply(incomes, statef,
stdError)
and the values calculated are then
> incster
act nsw nt qld sa tas vic wa
1.5 4.3102 4.5 4.1061 2.7386 0.5 5.244 2.6575
As an exercise you may care to find the
usual 95% confidence limits for the state mean
incomes. To do this you could use tapply()
once more with the length() function to find
the sample sizes, and the qt() function to
find the percentage points of the appropriate t-
distributions. (You could also investigate
R’s facilities for t-tests.)
The function tapply() can also be used to
handle more complicated indexing of a vector
by multiple categories. For example, we
might wish to split the tax accountants by both state
and sex. However in this simple instance
(just one factor) what happens can be thought of as
follows. The values in the vector are
collected into groups corresponding to the distinct entries
in the factor. The function is then applied
to each of these groups individually. The value is a
vector of function results, labelled by the
levels attribute of the factor.
The combination of a vector and a labelling
factor is an example of what is sometimes called
a ragged array, since the subclass sizes are
possibly irregular. When the subclass sizes are all
the same the indexing may be done
implicitly and much more efficiently, as we see in the next
section.
4.3 Ordered factors
The levels of factors are stored in
alphabetical order, or in the order they were specified to
factor if they were specified explicitly.
Sometimes the levels will have a natural
ordering that we want to record and want our
statistical analysis to make use of. The
ordered() function creates such ordered factors but
is otherwise identical to factor. For most
purposes the only difference between ordered and
unordered factors is that the former are
printed showing the ordering of the levels, but the
contrasts generated for them in fitting
linear models are different.
Factors are created using the factor()
function by taking a vector as input.
Factors are categorical variables that are
super useful in summary statistics, plots, and regressions. They basically act
like dummy variables that R codes for you.
So, let’s start off with some data:
and let’s check out what kinds of variables
we have:
so we see that Race is a factor variable
with three levels. I can see all the
levels this way:
So what this means is that R groups statistics
by these levels. Internally, R stores
the integer values 1, 2, and 3, and maps the character strings (in alphabetical
order, unless I reorder) to these values, i.e. 1=Black, 2=Hispanic, and
3=White. Now if I were to do a summary
of this variable, it shows me the counts for each category, as below. R won’t let me do a mean or any other
statistic of a factor variable other than a count, so keep that in mind. But
you can always change your factor to be numeric.
If I do a plot of age on race, I get a
boxplot from the normal plot command since that is what makes sense for a
categorical variable:
plot(mydata$Age~mydata$Race,
xlab=”Race”, ylab=”Age”, main=”Boxplots of Age by Race”)
# Create a vector as input.
data <- c("East","West","East","North","North","East","West","West","West","East","North")
print(data)
print(is.factor(data))
# Apply the factor function.
factor_data <- factor(data)
print(factor_data)
print(is.factor(factor_data))
Factors
in Data Frame
On creating a data frame with a column of
text data, R versions before 4.0.0 treated the text column as categorical data
and created factors on it by default. From R 4.0.0 onward the default is
stringsAsFactors = FALSE, so pass stringsAsFactors = TRUE to data.frame() to
get a factor column.
# Create the vectors for data frame.
height <- c(132,151,162,139,166,147,122)
weight <- c(48,49,66,53,67,52,40)
gender <- c("male","male","female","female","male","female","male")
# Create the data frame (stringsAsFactors = TRUE turns text columns into factors).
input_data <- data.frame(height, weight, gender, stringsAsFactors = TRUE)
print(input_data)
# Test if the gender column is a factor.
print(is.factor(input_data$gender))
# Print the gender column to see the levels.
print(input_data$gender)
Changing the Order of Levels: The order of
the levels in a factor can be changed by applying the factor function again
with new order of the levels.
data <- c("East","West","East","North","North","East","West","West","West","East","North")
# Create the factors
factor_data <- factor(data)
print(factor_data)
# Apply the factor function with required order of the level.
new_order_data <- factor(factor_data, levels = c("East","West","North"))
print(new_order_data)
Generating Factor Levels: We can generate
factor levels by using the gl() function. It takes two integers as input, which
indicate how many levels there are and how many times each level is repeated.
Syntax:
gl(n, k, labels)
Following is the description of the
parameters used:
·
n is an integer giving the
number of levels.
·
k is an integer giving the
number of replications.
·
labels is a vector of labels
for the resulting factor levels.
v <- gl(3, 4, labels = c("Tampa", "Seattle", "Boston"))
print(v)
A data frame is a table or a
two-dimensional array-like structure in which each column contains values of
one variable and each row contains one set of values from each column.
Following are the characteristics of a data frame:
·
The column names should be
non-empty.
·
The row names should be unique.
·
The data stored in a data frame
can be of numeric, factor or character type.
·
Each column should contain same
number of data items.
# Create the data frame.
emp.data <- data.frame(
  emp_id = c(1:5),
  emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
  salary = c(623.3,515.2,611.0,729.0,843.25),
  start_date = as.Date(c("2012-01-01","2013-09-23","2014-11-15","2014-05-11","2015-03-27")),
  stringsAsFactors = FALSE
)
# Print the data frame.
print(emp.data)
Get the Structure of the Data Frame: The
structure of the data frame can be seen by using str() function.
# Create the data frame.
emp.data <- data.frame(
  emp_id = c(1:5),
  emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
  salary = c(623.3,515.2,611.0,729.0,843.25),
  start_date = as.Date(c("2012-01-01","2013-09-23","2014-11-15","2014-05-11","2015-03-27")),
  stringsAsFactors = FALSE
)
# Get the structure of the data frame.
str(emp.data)
Summary
of Data in Data Frame
The statistical summary and nature of the
data can be obtained by applying summary() function.
# Create the data frame.
emp.data <- data.frame(
  emp_id = c(1:5),
  emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
  salary = c(623.3,515.2,611.0,729.0,843.25),
  start_date = as.Date(c("2012-01-01","2013-09-23","2014-11-15","2014-05-11","2015-03-27")),
  stringsAsFactors = FALSE
)
# Print the summary.
print(summary(emp.data))
Extract
Data from Data Frame
Extract specific column from a data frame
using column name.
# Create the data frame.
emp.data <- data.frame(
  emp_id = c(1:5),
  emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
  salary = c(623.3,515.2,611.0,729.0,843.25),
  start_date = as.Date(c("2012-01-01","2013-09-23","2014-11-15","2014-05-11","2015-03-27")),
  stringsAsFactors = FALSE
)
# Extract Specific columns.
result <- data.frame(emp.data$emp_name, emp.data$salary)
print(result)
# Extract 3rd and 5th row with 2nd and 4th column.
result <- emp.data[c(3,5), c(2,4)]
print(result)
# Extract first two rows.
result <- emp.data[1:2,]
print(result)
# Expand Data Frame – A data frame can be expanded by adding columns and rows.
# Add the "dept" column.
emp.data$dept <- c("IT","Operations","IT","HR","Finance")
v <- emp.data
print(v)
Add
Row
To add more rows permanently to an existing
data frame, we need to bring in the new rows in the same structure as the
existing data frame and use the rbind() function. In the example below we
create a data frame with new rows and merge it with the existing data frame to
create the final data frame.
# Create the first data frame.
emp.data <- data.frame(
  emp_id = c(1:5),
  emp_name = c("Rick","Dan","Michelle","Ryan","Gary"),
  salary = c(623.3,515.2,611.0,729.0,843.25),
  start_date = as.Date(c("2012-01-01","2013-09-23","2014-11-15","2014-05-11","2015-03-27")),
  dept = c("IT","Operations","IT","HR","Finance"),
  stringsAsFactors = FALSE
)
# Create the second data frame
emp.newdata <- data.frame(
  emp_id = c(6:8),
  emp_name = c("Rasmi","Pranab","Tusar"),
  salary = c(578.0,722.5,632.8),
  start_date = as.Date(c("2013-05-21","2013-07-30","2014-06-17")),
  dept = c("IT","Operations","Finance"),
  stringsAsFactors = FALSE
)
# Bind the two data frames.
emp.finaldata <- rbind(emp.data, emp.newdata)
print(emp.finaldata)
Unit 4: Simple manipulations; numbers and vectors
Vectors and assignment
R operates on named data structures. The
simplest such structure is the numeric vector, which is a single entity
consisting of an ordered collection of numbers. To set up a vector named x, say,
consisting of five numbers, namely 10.4, 5.6, 3.1, 6.4 and 21.7, use the R
command
> x <- c(10.4, 5.6, 3.1, 6.4, 21.7)
This is an assignment statement using the
function c() which in this context can take an arbitrary number of vector
arguments and whose value is a vector got by concatenating its
arguments end to end. A number occurring by
itself in an expression is taken as a vector of length one. Notice that the
assignment operator (‘<-’), which consists of the two characters ‘<’
(“less than”) and ‘-’ (“minus”) occurring strictly side-by-side and it ‘points’
to the object receiving the value of the expression. In most contexts the ‘=’
operator can be used as an alternative. Assignment can also be made using the
function assign(). An equivalent way of making the same assignment as above is
with:
> assign(“x”, c(10.4, 5.6, 3.1, 6.4, 21.7))
The usual operator, <-, can be thought
of as a syntactic short-cut to this.
Assignments can also be made in the other
direction, using the obvious change in the assignment operator. So the same
assignment could be made using
> c(10.4, 5.6, 3.1, 6.4, 21.7) -> x
If an expression is used as a complete
command, the value is printed and lost. So now if we
were to use the command
> 1/x
the reciprocals of the five values would be
printed at the terminal (and the value of x, of course, unchanged).
The further assignment
> y <- c(x, 0, x)
would create a vector y with 11 entries
consisting of two copies of x with a zero in the middle
place.
Vector
arithmetic
Vectors can be used in arithmetic
expressions, in which case the operations are performed element by element.
Vectors occurring in the same expression need not all be of the same length. If
they are not, the value of the expression is a vector with the same length as
the longest vector which occurs in the expression. Shorter vectors in the
expression are recycled as often as need be (perhaps fractionally) until they
match the length of the longest vector. In particular a constant is simply
repeated. So with the above assignments the command
> v <- 2*x + y + 1
generates a new vector v of length 11
constructed by adding together, element by element, 2*x repeated 2.2 times, y
repeated just once, and 1 repeated 11 times.
The elementary arithmetic operators are the
usual +, -, *, / and ^ for raising to a power. In
addition all of the common arithmetic
functions are available. log, exp, sin, cos, tan, sqrt,
and so on, all have their usual meaning.
max and min select the largest and smallest elements of a vector respectively.
range is a function whose value is a vector of length two, namely c(min(x),
max(x)). length(x) is the number of elements in x, sum(x) gives the total of
the elements in x, and prod(x) their product.
Two statistical functions are mean(x) which
calculates the sample mean, which is the same
as sum(x)/length(x),
and var(x) which gives sum((x-mean(x))^2)/(length(x)-1)
or sample variance. If the argument to
var() is an n-by-p matrix the value is a p-by-p sample
covariance matrix got by regarding the rows
as independent p-variate sample vectors.
sort(x) returns a vector of the same size
as x with the elements arranged in increasing order;
however there are other more flexible
sorting facilities available (see order() or sort.list()
which produce a permutation to do the
sorting).
Note that max and min select the largest
and smallest values in their arguments, even if they
are given several vectors. The parallel
maximum and minimum functions pmax and pmin return a vector (of length equal to
their longest argument) that contains in each element the largest (smallest)
element in that position in any of the input vectors.
For most purposes the user will not be
concerned if the “numbers” in a numeric vector
are integers, reals or even complex.
Internally calculations are done as double precision real
numbers, or double precision complex
numbers if the input data are complex.
To work with complex numbers, supply an
explicit complex part. Thus
sqrt(-17)
: will give NaN and a warning,
but
sqrt(-17+0i) :
will do the computations as complex numbers.
Generating
regular sequences
R has a number of facilities for generating
commonly used sequences of numbers. For example
1:30 is the vector c(1, 2, …, 29, 30).
The colon operator has high priority within an expression,
so, for example 2*1:15 is the vector c(2,
4, …, 28, 30). Put n <- 10 and compare
the sequences 1:n-1 and 1:(n-1).
The construction 30:1 may be used to
generate a sequence backwards.
The function seq() is a more general
facility for generating sequences. It has five arguments,
only some of which may be specified in any
one call. The first two arguments, if given, specify
the beginning and end of the sequence, and
if these are the only two arguments given the result is the same as the colon
operator. That is seq(2,10) is the same vector as 2:10.
Arguments to seq(), and to many other R
functions, can also be given in named form, in
which case the order in which they appear
is irrelevant. The first two arguments may be named from=value and to=value;
thus seq(1,30), seq(from=1, to=30) and seq(to=30, from=1)
are all the same as 1:30. The next two
arguments to seq() may be named by=value and
length=value, which specify a step size and
a length for the sequence respectively. If neither
of these is given, the default by=1 is
assumed.
For example
> seq(-5, 5, by=.2) -> s3
generates in s3 the vector c(-5.0, -4.8,
-4.6, …, 4.6, 4.8, 5.0). Similarly
> s4 <- seq(length=51, from=-5, by=.2)
generates the same vector in s4.
The fifth argument may be named
along=vector, which is normally used as the only argument
to create the sequence 1, 2, …,
length(vector), or the empty sequence if the vector
is empty (as it can be).
A related function is rep() which can be
used for replicating an object in various complicated
ways. The simplest form is
> s5 <- rep(x, times=5)
which will put five copies of x end-to-end
in s5. Another useful version is
> s6 <- rep(x, each=5)
which repeats each element of x five times
before moving on to the next.
Logical
vectors
As well as numerical vectors, R allows
manipulation of logical quantities. The elements of a
logical vector can have the values TRUE,
FALSE, and NA (for “not available”). The
first two are often abbreviated as T and F,
respectively. Note however that T and F are just
variables which are set to TRUE and FALSE
by default, but are not reserved words and hence can be overwritten by the
user. Hence, you should always use TRUE and FALSE.
Logical vectors are generated by
conditions. For example
> temp <- x > 13
sets temp as a vector of the same length as
x with values FALSE corresponding to elements of x where the condition is not
met and TRUE where it is.
The logical operators are <, <=,
>, >=, == for exact equality and != for inequality. In addition
if c1 and c2 are logical expressions, then
c1 & c2 is their intersection (“and”), c1 | c2 is their
union (“or”), and !c1 is the negation of
c1.
Logical vectors may be used in ordinary
arithmetic, in which case they are coerced into
numeric vectors, FALSE becoming 0 and TRUE
becoming 1. However there are situations where logical vectors and their
coerced numeric counterparts are not equivalent, for example see the next
subsection.
Missing
values
In some cases the components of a vector
may not be completely known. When an element
or value is “not available” or a “missing
value” in the statistical sense, a place within a vector
may be reserved for it by assigning it the
special value NA. In general, any operation on an NA
becomes an NA. The motivation for this rule
is simply that if the specification of an operation
is incomplete, the result cannot be known
and hence is not available.
The function is.na(x) gives a logical
vector of the same size as x with value TRUE if and
only if the corresponding element in x is
NA.
> z <- c(1:3,NA); ind <- is.na(z)
Notice that the logical expression x == NA
is quite different from is.na(x) since NA is not
really a value but a marker for a quantity
that is not available. Thus x == NA is a vector of the
same length as x all of whose values are NA
as the logical expression itself is incomplete and
hence undecidable.
Note that there is a second kind of
“missing” values which are produced by numerical computation, the so-called Not
a Number, NaN, values. Examples are
> 0/0
or
> Inf - Inf
which both give NaN since the result cannot
be defined sensibly.
In summary, is.na(xx) is TRUE both for NA
and NaN values. To differentiate these,
is.nan(xx) is only TRUE for NaNs.
Missing values are sometimes printed as
<NA> when character vectors are printed without
quotes.
2.6 Character vectors
Character quantities and character vectors
are used frequently in R, for example as plot labels.
Where needed they are denoted by a sequence
of characters delimited by the double quote
character, e.g., “x-values”,
“New iteration results”.
Character strings are entered using either
matching double (") or single (') quotes, but are
printed using double quotes (or sometimes
without quotes). They use C-style escape sequences,
using \ as the escape character, so \\ is
entered and printed as \\, and inside double quotes "
is entered as \". Other useful escape
sequences are \n, newline, \t, tab and \b, backspace—see
?Quotes for a full list.
Character vectors may be concatenated into
a vector by the c() function; examples of their
use will emerge frequently.
The paste() function takes an arbitrary
number of arguments and concatenates them one by
one into character strings. Any numbers
given among the arguments are coerced into character
strings in the evident way, that is, in the
same way they would be if they were printed. The
arguments are by default separated in the
result by a single blank character, but this can be
changed by the named argument, sep=string,
which changes it to string, possibly empty.
For example
> labs <-
paste(c(“X”,”Y”), 1:10, sep=””)
makes labs into the character vector
c(“X1”, “Y2”,
“X3”, “Y4”, “X5”, “Y6”, “X7”,
“Y8”, “X9”, “Y10”)
Note particularly that recycling of short
lists takes place here too; thus c(“X”, “Y”) is
repeated 5 times to match the sequence
1:10.
2.7 Index vectors; selecting and modifying
subsets of a data set
Subsets of the elements of a vector may be
selected by appending to the name of the vector an
index vector in square brackets. More
generally any expression that evaluates to a vector may
have subsets of its elements similarly
selected by appending an index vector in square brackets
immediately after the expression.
Such index vectors can be any of four
distinct types.
1. A logical vector. In this case the index
vector is recycled to the same length as the vector
from which elements are to be selected.
Values corresponding to TRUE in the index vector
are selected and those corresponding to
FALSE are omitted. For example
> y <- x[!is.na(x)]
creates (or re-creates) an object y which
will contain the non-missing values of x, in the
same order. Note that if x has missing
values, y will be shorter than x. Also
> (x+1)[(!is.na(x)) & x>0] ->
z
creates an object z and places in it the
values of the vector x+1 for which the corresponding
value in x was both non-missing and
positive.
2. A vector of positive integral
quantities. In this case the values in the index vector must lie
in the set {1, 2, …, length(x)}. The
corresponding elements of the vector are selected and
concatenated, in that order, in the result.
The index vector can be of any length and the
result is of the same length as the index
vector. For example x[6] is the sixth component
of x and
> x[1:10]
selects the first 10 elements of x
(assuming length(x) is not less than 10). Also
>
c(“x”,”y”)[rep(c(1,2,2,1), times=4)]
(an admittedly unlikely thing to do)
produces a character vector of length 16 consisting of
“x”, “y”,
“y”, “x” repeated four times.
3. A vector of negative integral
quantities. Such an index vector specifies the values to be
excluded rather than included. Thus
> y <- x[-(1:5)]
gives y all but the first five elements of
x.
4. A vector of character strings. This
possibility only applies where an object has a names
attribute to identify its components. In
this case a sub-vector of the names vector may be
used in the same way as the positive
integral labels in item 2 further above.
> fruit <- c(5, 10, 1, 20)
> names(fruit) <-
c(“orange”, “banana”, “apple”, “peach”)
> lunch <- fruit[c(“apple”,”orange”)]
The advantage is that alphanumeric names
are often easier to remember than numeric
indices. This option is particularly useful
in connection with data frames, as we shall see
later.
An indexed expression can also appear on
the receiving end of an assignment, in which case
the assignment operation is performed only
on those elements of the vector. The expression
must be of the form vector[index_vector] as
having an arbitrary expression in place of the
vector name does not make much sense here.
For example
> x[is.na(x)] <- 0
replaces any missing values in x by zeros
and
> y[y < 0] <- -y[y < 0]
has the same effect as
> y <- abs(y)
2.8 Other types of objects
Vectors are the most important type of
object in R, but there are several others which we will
meet more formally in later sections.
matrices or more generally arrays are
multi-dimensional generalizations of vectors. In fact,
they are vectors that can be indexed by two
or more indices and will be printed in special
ways. See Chapter 5 [Arrays and matrices],
page 18.
factors provide compact ways to handle
categorical data. See Chapter 4 [Factors], page 16.
lists are a general form of vector in which
the various elements need not be of the same
type, and are often themselves vectors or
lists. Lists provide a convenient way to return the
results of a statistical computation. See
Section 6.1 [Lists], page 26.
data frames are matrix-like structures, in
which the columns can be of different types. Think
of data frames as ‘data matrices’ with one
row per observational unit but with (possibly) both numerical and categorical
variables. Many experiments are best described by data
frames: the treatments are categorical but
the response is numeric. See Section 6.3 [Data
frames], page 27.
functions are themselves objects in R which
can be stored in the project’s workspace. This
provides a simple and convenient way to
extend R. See Chapter 10 [Writing your own
functions], page 42.
Objects,
their modes and attributes
Changing the length of an object
An “empty” object may still have a mode.
For example
> e <- numeric()
makes e an empty vector structure of mode
numeric. Similarly character() is an empty character
vector, and so on. Once an object of any
size has been created, new components may be added
to it simply by giving it an index value
outside its previous range. Thus
> e[3] <- 17
now makes e a vector of length 3, (the
first two components of which are at this point both NA).
This applies to any structure at all,
provided the mode of the additional component(s) agrees
with the mode of the object in the first
place.
This automatic adjustment of lengths of an
object is used often, for example in the scan()
function for input. (see Section 7.2 [The
scan() function], page 31.)
Conversely to truncate the size of an
object requires only an assignment to do so. Hence if
alpha is an object of length 10, then
> alpha <- alpha[2 * 1:5]
makes it an object of length 5 consisting
of just the former components with even index. (The
old indices are not retained, of course.)
We can then retain just the first three values by
> length(alpha) <- 3
and vectors can be extended (by missing
values) in the same way.
3.3 Getting and setting attributes
The function attributes(object) returns a
list of all the non-intrinsic attributes currently
defined for that object. The function
attr(object, name) can be used to select a specific
attribute. These functions are rarely used,
except in rather special circumstances when some
new attribute is being created for some
particular purpose, for example to associate a creation
date or an operator with an R object. The
concept, however, is very important.
Some care should be exercised when
assigning or deleting attributes since they are an integral
part of the object system used in R.
When attr() is used on the left hand side of an
assignment it can be used either to associate a
new attribute with object or to change an
existing one. For example
> attr(z, “dim”) <-
c(10,10)
allows R to treat z as if it were a
10-by-10 matrix.
3.4 The class of an object
All objects in R have a class, reported by
the function class. For simple vectors this is just the
mode, for example “numeric”,
“logical”, “character” or “list”, but
“matrix”, “array”,
“factor” and
“data.frame” are other possible values.
A special attribute known as the class of
the object is used to allow for an object-oriented
style of programming in R. For example if
an object has class “data.frame”, it will be printed
in a certain way, the plot() function will
display it graphically in a certain way, and other
so-called generic functions such as
summary() will react to it as an argument in a way sensitive
to its class.
To remove temporarily the effects of class,
use the function unclass(). For example if winter
has the class “data.frame” then
> winter
will print it in data frame form, which is
rather like a matrix, whereas
> unclass(winter)
will print it as an ordinary list. Only in
rather special situations do you need to use this facility,
but one is when you are learning to come to
terms with the idea of class and generic functions.
Generic functions and classes will be
discussed further in Section 10.9 [Object orientation],
page 48, but only briefly.
Importing and
manipulating your data are important steps in the data science workflow. R
allows for the import of different data formats using specific packages that
can make your job easier:
·
readr for importing flat files
·
The readxl package for getting
excel files into R
·
The haven package lets you
import SAS, STATA and SPSS data files into R.
·
Databases:
connect via packages like RMySQL and RPostgreSQL, and access and manipulate via DBI
·
rvest for webscraping
Once your
data is available in your working environment you are ready to start
manipulating it using these packages:
·
The tidyr package for tidying
your data.
·
The stringr package for
string manipulation.
·
For data
frame like objects learn the ins and outs of the dplyr package
·
Need to
perform heavy data wrangling tasks? Check out the data.table package
·
Performing
time series analysis? Try out packages like zoo, xts and quantmod.
Let’s
practice
# Get and
print current working directory. print(getwd())
#Reading a
CSV File data <-
read.csv(“input.csv”) print(data)
# Analyzing
the CSV File data <-
read.csv(“input.csv”) print(is.data.frame(data)) print(ncol(data)) print(nrow(data))
#Get the
maximum salary: # Create a
data frame. data <-
read.csv(“input.csv”) # Get the
max salary from data frame. sal <-
max(data$salary) print(sal)
# Get the max
salary from data frame. sal <-
max(data$salary) # Get the
person detail having max salary. retval <-
subset(data, salary == max(salary)) print(retval)
#Get the
persons in IT department whose salary is greater than 600 info <-
subset(data, salary > 600 & dept == “IT”) print(info)
# Get the people who joined after 2014-01-01
retval <- subset(data, as.Date(start_date) > as.Date("2014-01-01"))
print(retval)
Writing into a CSV File
R can create
a csv file from an existing data frame. The write.csv() function is used to create
the csv file. This file gets created in the working directory.
# Create a
data frame. data <-
read.csv(“input.csv”) retval <-
subset(data, as.Date(start_date) > as.Date(“2014-01-01”)) # Write
filtered data into a new file. write.csv(retval,”output.csv”) newdata <-
read.csv(“output.csv”) print(newdata)
retval <-
subset(data, as.Date(start_date) > as.Date(“2014-01-01”)) # Write
filtered data into a new file. write.csv(retval,”output.csv”,
row.names=FALSE) newdata <-
read.csv(“output.csv”) print(newdata)
|
# Verify the
package is installed. any(grepl(“xlsx”,installed.packages())) # Load the
library into R workspace. library(“xlsx”) |
Input as XLSX file
Open Microsoft
Excel. Copy and paste the following data in the worksheet named sheet1.
Also copy and
paste the following data to another worksheet and rename this worksheet to
“city”.
Save the
Excel file as “input.xlsx”. You should save it in the current working
directory of the R workspace.
Reading the Excel File
The
input.xlsx is read by using the read.xlsx() function as shown below. The result
is stored as a data frame in the R environment.
# Read the
first worksheet in the file input.xlsx. data <-
read.xlsx(“input.xlsx”, sheetIndex = 1) print(data) |
Note: These
examples are for 32 bit Windows
First, load
the RODBC package (you’ll also have to install it if you don’t have it
already).
# Load
RODBC package library(RODBC) |
Next, connect
to the Access database. This code creates an object called “channel” that tells
R where the Access database is.
If you paste
the path from windows be sure to change every backslash to a forward slash.
Do not
include the file extension (.accdb or .mdb) on the end of the name of the
database.
#
Connect to Access db channel <-
odbcConnectAccess(“C:/Documents/Name_Of_My_Access_Database”) |
Finally, run
a SQL query to return the data.
# Get
data data
<- sqlQuery( channel , paste (“select * from Name_of_table_in_my_database”)) |
Return All Data from One Table
This example shows
how to connect to a database in R, query the database DATABASE and return
all of the data (this is specified using the * in SQL) from the table
DATATABLE. The table is preceded by the database schema SCHEMA and separated by
a period. Each of the words in all caps within the query needs to be
replaced so that the query applies to your database.
# Load
RODBC package library(RODBC)
# Create
a connection to the database called “channel” # If you
are using operating system authentication (the computer already knows who you # are
because you are logged into it) you can leave out the
uid=”USERNAME”, part. channel
<- odbcConnect(“DATABASE”, uid=”USERNAME”,
pwd=”PASSWORD”, believeNRows=FALSE)
# Check
that connection is working (Optional) odbcGetInfo(channel)
# Find
out what tables are available (Optional) Tables
<- sqlTables(channel, schema=”SCHEMA”)
# Query
the database and put the results into the data frame “dataframe” dataframe <- sqlQuery(channel, “ SELECT * FROM SCHEMA.DATATABLE”) |
Return Only Specific Fields
Example shows
how to connect to database in R and query the database DATABASE and pull only
the specified fields from the table DATATABLE. Note that loading the RODBC
package and creating a connection does not have to be repeated if they were
done in the first example.
# Load RODBC package
library(RODBC)

# Create a connection to the database called "channel"
channel <- odbcConnect("DATABASE", uid = "USERNAME",
                       pwd = "PASSWORD", believeNRows = FALSE)

# Find out what fields are available in the table (Optional)
# as.data.frame coerces the data into a data frame for easy viewing
Columns <- as.data.frame(colnames(sqlFetch(channel, "SCHEMA.DATATABLE")))

# Query the database and put the results into the data frame "dataframe"
dataframe <- sqlQuery(
  channel,
  " SELECT SCHOOL, STUDENT_NAME FROM SCHEMA.DATATABLE"
)
Joining Two
Tables and Returning Only Specific Fields and Records
# Load RODBC package
library(RODBC)

# Create a connection to the database called "channel"
channel <- odbcConnect("DATABASE", uid = "USERNAME",
                       pwd = "PASSWORD", believeNRows = FALSE)

# Query the database and put the results into the data frame "dataframe".
# The SQL is a single string; line breaks inside it fall only between SQL
# tokens, never inside the quoted school name.
dataframe <- sqlQuery(channel, "
  SELECT DT.SCHOOL_YEAR,
         DTTWO.DISTRICT_NAME AS DISTRICT,
         DTTWO.SCHOOL_NAME AS SCHOOL,
         DT.GRADE_LEVEL AS GRADE,
         DT.ACTL_ATT_DAYS AS ACTUAL_DAYS,
         DT.POSS_ATT_DAYS AS POSSIBLE_DAYS
  FROM (SCHEMA.DATATABLE DT INNER JOIN SCHEMA.DATATABLE_TWO DTTWO
        ON (DT.SCHOOL_YEAR = DTTWO.SCHOOL_YEAR AND
            DT.SCHOOL_NUMBER = DTTWO.SCHOOL_CODE))
  WHERE DT.SCHOOL_YEAR = '2011-12' AND
        DTTWO.SCHOOL_NAME = 'Pine Tree Elementary School'")
Using a Parameter
from R to Return Only Specific Records
# Load RODBC package
library(RODBC)

# Create a connection to the database called "channel"
channel <- odbcConnect("DATABASE", uid = "USERNAME",
                       pwd = "PASSWORD", believeNRows = FALSE)

# Parameter
YEARS <- c("2012", "2013", "2014")

# Query the database and put the results into the data frame "dataframe".
# The inner paste() joins the years as '2012', '2013', '2014' so they can be
# dropped into the SQL IN (...) list; paste0() glues the pieces together
# without separators.
dataframe <- sqlQuery(channel, paste0(
  "SELECT YEAR, SCHOOL_YEAR, DISTRICT_CODE, GRADE_LEVEL ",
  "FROM SCHEMA.DATATABLE ",
  "WHERE SCHEMA.DATATABLE.SCHOOL_YEAR IN ('",
  paste(YEARS, collapse = "', '"),
  "') "
))
The basis of any analysis is to understand, evaluate and interpret
complex results. Thus, it is imperative for an analyst to have a
comprehensive understanding of the data under scrutiny and the relationships among
the various variables. The simplest yet very powerful approach to gaining a
better understanding of the data is to use graphical techniques. For example, if you
are looking at an Excel spreadsheet of daily revenue data for a firm over a year,
it is obviously not possible to tell whether there is a particular trend or
seasonality. But, by just plotting the data using a line chart, you can easily
see seasonality, trend, and average behavior in one shot. Let’s take the
example of a scatterplot. A simple scatter plot not only shows the correlation
between two variables but also shows linearity, non-linearity and homogeneity in
the data. More importantly, data visualization also helps in presenting results
to higher management in a very simple manner. In this section we will
explore various data visualization techniques using R.
For most of the plots in the next sub sections, we have used a dataset
consisting of following metrics for Year 2010-2017 for a website.
·
Date
·
Visits
·
Page views
·
Unique Visitors
·
Bounce rate
Basic Visualization
Techniques
1.
Histogram: Histogram is used to plot continuous variable. It breaks
the data into bins (or breaks) and shows frequency distribution of these bins. Histograms
are appropriate to understand underlying distribution.
R Code:
# Histogram of the Visits column. The object returned by hist() is kept in `h`
# so its bin midpoints and counts can be reused to label the bars.
h <- hist(Data$Visits,                                   # Vector of data to be plotted
          main = "Total Visits of a Web Site Per Year",  # Title of the plot
          xlab = " Visits",                              # Title of the x-axis
          # xlim = c(15, 40),                            # limit on the x axis
          col = "palevioletred1",                        # Color of the bar to be filled
          border = "brown",                              # Color of the border around the bin
          freq = TRUE)                                   # Representation of frequencies
# Give number on each bar: counts are drawn just above the bin midpoints
text(h$mids, h$counts, labels = h$counts, adj = c(0.5, -0.5))
Figure:
In a histogram, the area of each bar indicates
the frequency of occurrences for each range of values. From the figure, it is found that
visits in the range of 1000000-1200000 occur three times; the spread is greatest between 1000000-12000000. From the figure we can say that there are no outliers in the data. The histogram shows that the data follows an
irregular clustered distribution.
2. Bar/Line chart:
Line: Line Charts are chosen to examine a
trend spread over a period. Additionally,
line plot is used to compare relative changes in quantities across some
variable (like time). Line charts are typically used to analyze trend in a data. It can also
be used to understand outliers and to check normality assumptions.
R Code:
# Line chart of Visits against Date, narrowed to the row holding the minimum
# number of visits so that it can be marked and annotated as the "Valley".
p <- plot_ly(Data,                  # Data frame
             x = ~Date,             # x-axis data
             y = ~Visits) %>%       # y-axis data
  add_lines() %>%                   # Add traces to a plotly visualization
  filter(Visits == min(Visits))     # Filtering minimum among all values
plotly_data(p)                      # Obtaining data associated with a plotly graph
add_markers(p)                      # Add traces to a plotly visualization
# NOTE(review): the annotation maps x to ~Year while the plot maps x to ~Date;
# confirm that Data really has a Year column.
layout(p, annotations = list(x = ~Year, y = ~Visits, text = "Valley")) %>%
  layout(title = "Total Visits of a Web Site per year",
         xaxis = list(title = "Date", showgrid = FALSE),
         yaxis = list(title = "Visits"),
         showlegend = FALSE)
Figure:
The above line chart shows the yearly visitors for a website from 2010 to 2016. It gives a fairly good idea that the number of visitors
to the website grew continuously up
to 2015. In
the year 2015 the total number of visitors was at its highest, and it decreased by around 15% in the
year 2016. The visitor data of the website follows a left-skewed normal
distribution.
Bar: Bar Plots are used to compare
cumulative totals across several groups.
R Code:
# Bar chart of Visits per Date with titled, coloured axes.
plot_ly(Data,                    # Data frame
        type = "bar",            # Type of chart
        x = ~Date,               # x-axis data
        y = ~Visits,             # y-axis data
        visible = TRUE,          # Visibility of plot
        showlegend = TRUE) %>%   # Legend status
  layout(title = "Total Visits of a Web Site Per Year",  # Title of the chart
         xaxis = list(title = "Year",
                      showgrid = TRUE, color = "red"),    # list of x-axis properties
         yaxis = list(title = "Visits",
                      showgrid = TRUE, color = "green"))  # list of y-axis properties
Figure:
The bar chart indicates the number of visitors for a website between the
years 2010-2016. It can be seen that the
number of visitors is increasing linearly
up to 2015; however, it decreases in the year 2016.
3.
Box plot: Box Plot used for visualizing the spread of the
data and deriving inferences accordingly and also determine outliers.
R Code:
# Box plots for columns 2 to 4 of Data, one box per column.
boxplot(Data[, 2:4],                 # Specifying data
        las = 1,                     # Axis label style: 1 = always horizontal,
                                     # 2 = always perpendicular to the axis
        col = c("sienna", "green"),  # Color of the box
        main = "Total Visits and Pageviews of a Web Site Per Year")  # Title of the plot
Figure:
The chart gives information about the spread of the data for
Visitors, Page.views, and Unique
visitors. The quartile ranges for visitors, page views,
and unique visitors are around 300000, 1100000 and 150000 respectively. That
means the values for unique
visitors are tightly bound. For visitors and unique visitors, the median lies very close to the upper
quartile.
4.
Scatter plot: Scatter plot used to visualize data easily and
for simple data inspection.
R Code:
# Scatter plot of Visits against Date with titled, coloured axes.
plot_ly(Data,                    # Data frame
        type = "scatter",        # Type of chart
        mode = "markers",        # Draw points only; stated explicitly so plotly
                                 # does not have to guess the scatter mode
        x = ~Date,               # x-axis data
        y = ~Visits,             # y-axis data
        visible = TRUE,          # Visibility of plot
        showlegend = TRUE) %>%   # Legend status
  layout(title = "Total Visits of a Web Site Per Year",
         xaxis = list(title = "Date", showgrid = TRUE, color = "red"),
         yaxis = list(title = "Visits", showgrid = TRUE, color = "green"))
Figure:
The graph above shows the relationship between visitors,
page views, unique visitors and bounce rate from 2010 to 2016. It is observed
that a higher number of visitors to a website leads to a lower
bounce rate. However, visitors, page views and unique visitors are interrelated with
each other.
Advanced
Visualization Techniques
1. Heat map- Heat maps used to do empirical data analysis with two dimensions as the
axis and the third dimension shown by intensity of color.
R Code:
# Heat map of columns 18 to 21 of Data, converted to a matrix first.
# NOTE(review): the column range 18:21 assumes Data has at least 21 columns;
# confirm against the dataset actually used.
heatmap (as.matrix
(Data[, 18:21]), las=2)
R Code:
# Heat map with a row dendrogram only.
# NOTE(review): the original comment calls the input a numeric matrix; confirm
# that every column of Data is numeric before passing it to as.matrix().
heatmap.2(as.matrix(Data),      # numeric matrix of the values
          dendrogram = "row")   # row dendrogram plotted and row reordering done
Figure:
The heat map gives the hierarchical clustering of visitors,
unique visitors, page views and bounce rate. Initially,
visitors and unique visitors together form a cluster because their values are very similar. Then, bounce rate
is clustered with the existing cluster, and finally,
they are clustered with page views.
2.
Mosaic plot- A mosaic plot can be used for
plotting categorical data very effectively
with the area of the data showing the relative proportions.
R Code:
# Mosaic plot of Visits against Page.views from Data.
mosaicplot(~ Visits + Page.views,   # formula
           data = Data,             # Data frame
           main = "Total Visits and Page views of a website per Year",  # Title of the plot
           color = TRUE,            # Color shading
           dir = "h",               # Vector of split directions
           las = 2)                 # the style of axis labels
Figure:
In the mosaic plot, the data is split
into different bars that show the relationship between visitors, page views,
unique visitors, and bounce rate. The
mosaic plot is divided first into horizontal bars whose widths are proportional
to the probabilities associated with the year. Then each bar is split
vertically into bars that are proportional
to the conditional probabilities of visitors, page views, unique visitors, and bounce rate. The colors represent the level of the
residual/probability for that cell/combination
of levels.
3. Map visualization-
a. World map
R Code:
# Fetch the world map and draw the window given by xlim/ylim.
newmap <- getMap(resolution = "high")  # Accessing map stored in the package with high resolution
plot(newmap,             # Map source
     xlim = c(10, 50),   # co-ordinates in x-direction
     ylim = c(0, 81),    # co-ordinates in y-direction
     asp = 1)            # Aspect ratio
Figure:
b. Plotting a
location based on longitudes and latitudes
R Code:
# Build a leaflet map with one marker. Each %>% must end its line: a line that
# starts with %>% is parsed as a new, invalid statement.
m <- leaflet() %>%
  addTiles() %>%   # Add default Open Street Map tiles
  addMarkers(lng = 87.3091, lat = 22.3145,
             popup = "The Indian institute of Technology Kharagpur")  # longitude and latitude of IIT Kharagpur
m  # Print the map
Figure:
4. 3D graphs-
a. Scatter plot
R Code:
# 3D scatter plot of Date (x), Visits (y) and Page.views (z).
# The y and z axis titles are matched to the data on those axes; the original
# had them swapped.
# NOTE(review): residuals, axis.scales and ellipsoid are kept from the original
# call; confirm scatterplot3d() accepts them.
scatterplot3d(x = Data$Date,        # the x coordinates of points
              y = Data$Visits,      # the y coordinates of points
              z = Data$Page.views,  # the z coordinates of points
              residuals = TRUE,     # Residuals
              bg = "black",         # Background color
              axis.scales = TRUE,
              grid = TRUE,          # grid should be drawn on the plot or not
              ellipsoid = TRUE,
              main = "Total Visits of a Web Site Per Year",  # Title of plot
              xlab = "Year",        # Title of x-axis
              ylab = "Visits",      # Title of y-axis
              zlab = "Page.Views")  # Title of z-axis
Figure:
b. Surface plot
R Code:
# Surface plot. layout() is piped from plot_ly() so the titles are applied to
# this plot; called on its own it has no plot to modify.
# NOTE(review): z is `volcano`, which this snippet does not derive from Data;
# confirm it is the intended surface.
plot_ly(Data,                    # Data frame
        x = ~Date,               # The x coordinates of points
        y = ~Visits,             # The y coordinates of points
        z = volcano,             # The z values (height of the surface)
        type = "surface") %>%    # Surface plot
  layout(title = "Total Visits of a Web Site Per Year",  # Title of the plot
         xaxis = list(title = "Year",
                      showgrid = TRUE, color = "red"),    # x-axis title and other properties
         yaxis = list(title = "Visits",
                      showgrid = TRUE, color = "green"))  # y-axis title and other properties
Figure:
c. Spinning scatter plot
R Code:
# Spinning 3D scatter plot of Year, Visits and Page.views.
scatter3d(as.numeric(Data$Year),  # The x coordinates of points
          Data$Visits,            # The y coordinates of points
          Data$Page.views)        # The z coordinates of points
Figure:
5. Correlogram – Correlogram used to visualize the data in
correlation matrices.
R Code:
# Correlogram of the variables in Data.
corrgram(Data,                   # Data frame
         order = NULL,           # Variables are not re-ordered
         panel = panel.shade,    # To plot content of each panel
         text.panel = panel.txt,
         main = "Correlogram between website Visits and Page views")  # Title of the plot
Figure:
From the figure, we
observed that there is a positive correlation between visitors, page views, and unique visitors. However, Bounce rate has a negative correlation with other three values.
To install a package, type the following in the console and hit Enter:
install.packages("RGoogleAnalytics")
# Install the RGoogleAnalytics package (straight quotes are required; R does
# not accept typographic quotes as string delimiters)
install.packages("RGoogleAnalytics")
magrittr
A Forward-Pipe Operator for R: Provides a mechanism for chaining
commands with a new forward-pipe operator, %>%. This operator will forward a
value, or the result of an expression, into the next function call/expression. The
magrittr is a package developed to give two main benefits: 1) to decrease
development time, and 2) to improve readability and maintainability of code.
Below codes are based on the mtcars
dataset provided in R.
Compare the codes with and without %>%.
library(magrittr)

# With %>%: start from mtcars, keep the cars with more than 100 hp, then print.
car_data <-
  mtcars %>%
  subset(hp > 100) %>%
  print

# Without %>%: the same two steps as ordinary calls. The subset step is written
# out here so that both versions give the same result.
car_data <- subset(mtcars, hp > 100)
print(car_data)
%>% changes the semantics of the code and makes it more intuitive to
both read and write.
rvest
rvest is a package that makes it
easy to scrape (or harvest) data from html web pages, inspired by libraries
like beautiful soup. It is designed to work with magrittr so that you can
express complex operations as elegant pipelines composed of simple, easily
understood pieces. Install it with:
Test the rvest library: the code below gets the rating of the Titanic movie from
IMDB.com (http://www.imdb.com/title/tt0120338/). Use selectorgadget
(refer to an online tutorial to learn about this plugin) to figure out which CSS
selector matches the data we want. "strong
span" is the CSS selector used to extract the rating.
library(rvest)

# Download and parse the page. read_html() replaces html(), which rvest has
# deprecated.
movie_link <- read_html("http://www.imdb.com/title/tt0120338/")

# Select the node matching the CSS selector, take its text and convert it to a
# number. Each %>% ends its line so the chain parses as one expression.
movie_link %>%
  html_node("strong span") %>%
  html_text() %>%
  as.numeric()
Rcurl
A wrapper for ‘libcurl’ <http://curl.haxx.se/libcurl/> Provides
functions to allow one to compose general HTTP requests and provides convenient
functions to fetch URIs, get & post forms, etc. and process the results
returned by the Web server. This provides a great deal of control over the
HTTP/FTP/… connection and the form of the request while providing a
higher-level interface than is available just using R socket connections.
Additionally, the underlying implementation is robust and extensive, supporting
FTP/FTPS/TFTP (uploads and downloads), SSL/HTTPS, telnet, dict, ldap, and also
supports cookies, redirects, authentication, etc.
library(RCurl)

# Amazon search: The Best American Short Stories of the Century
URL <-
  "https://www.amazon.com/Best-American-Short-Stories-2016/dp/0544582896/ref=sr_1_1?ie=UTF8&qid=1493919877&sr=8-1&keywords=The+Best+American+Short+Stories"

# Fetch the content at URL and print it
html <- getURLContent(URL)
print(html)
gridExtra
Provides a
number of user-level functions to work with “grid” graphics, notably
to arrange multiple grid-based plots on a page, and draw tables.
Below is a
sample example where we have mixed a few grobs and plots
library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)

# Two plots (one ggplot2, one lattice) and two grobs (a grey rectangle and a
# text label), arranged together on one page in two columns.
p <- qplot(1, 1)
p2 <- xyplot(1 ~ 1)
r <- rectGrob(gp = gpar(fill = "grey90"))
t <- textGrob("text")
grid.arrange(t, p, p2, r, ncol = 2)
Other R Libraries Required
in Data Visualization
These libraries are used in the examples shown in the Data Visualization
section
·
library (plotly): Plotly’s R graphing library makes interactive,
publication-quality graphs online. Examples of how to make line plots, scatter
plots, area charts, bar charts, error bars, box plots, histograms, heatmaps,
subplots, multiple-axes, and 3D (WebGL based) charts.
·
library (ggplot2): A system for ‘declaratively’ creating graphics, based
on “The Grammar of Graphics”. You provide the data, tell ‘ggplot2’
how to map variables to aesthetics, what graphical primitives to use, and it
takes care of the details.
·
library (RColorBrewer): Provides color schemes for maps (and other
graphics) designed by Cynthia Brewer.
·
library (gplots): Various R programming tools for plotting data,
including: – calculating and plotting locally smoothed summary function as
(‘bandplot’, ‘wapply’), – and more. Refer the documentation.
·
library (vcd): Visualization techniques, data sets, summary and
inference procedures aimed particularly at categorical data. Special emphasis
is given to highly extensible grid graphics.
·
require (stats): This package contains functions for statistical
calculations and random number generation.
·
library (maps): Package to display maps. Projection code and larger maps
are in separate packages (‘mapproj’ and ‘mapdata’).
·
library (leaflet): Leaflet is one of the most popular open-source
JavaScript libraries for interactive maps. This R package makes it easy to
integrate and control Leaflet maps in R.
·
library (maptools): Tools for Reading and Handling Spatial Objects
·
library (rworldmap): Enables mapping of country level and gridded user
datasets.
·
library (Rcmdr): A platform-independent basic-statistics GUI (graphical
user interface) for R, based on the tcltk package.
·
library (rgl) – 3D Visualization Using OpenGL: Provides medium to high
level functions for 3D interactive graphics, including functions modelled on
base graphics (plot3d(), etc.) as well as functions for constructing
representations of geometric objects (cube3d(), etc.). Output may be on screen
using OpenGL, or to various standard 3D file formats including WebGL, PLY, OBJ,
STL as well as 2D image formats, including PNG, Postscript, SVG, PGF.
·
library (scatterplot3d): Plots 3D Scatter Plot
·
library (corrgram): Calculates correlation of variables and displays the
results graphically. Included panel functions can display points, shading,
ellipses, and correlation values with confidence intervals.
·
library(markdown): ‘Markdown’ is a plain-text formatting syntax that can
be converted to ‘XHTML’ or other formats.
·
library(shiny): Makes it incredibly easy to build interactive web applications
with R. Automatic “reactive” binding between inputs and outputs and
extensive prebuilt widgets make it possible to build beautiful, responsive, and
powerful applications with minimal effort.
·
library (htmltools): Tools for HTML generation and output.
#R Program to Add Two Vectors
> x <- c(3,6,8)
[1] 3 6 8
> y <- c(2,9,0)
[1] 2 9 0
> x + y
[1]  5 15  8
> x + 1    # 1 is recycled to (1,1,1)
[1] 4 7 9
> x + c(1,4)    # (1,4) is recycled to (1,4,1) but warning issued
[1]  4 10  9
Warning message:
In x + c(1, 4) :
  longer object length is not a multiple of shorter object length
#Find Sum, Mean and Product of Vector in R Programming
> sum(2,7,5)
[1] 14
> x
[1]  2 NA  3  1  4
> sum(x)    # if any element is NA or NaN, result is NA or NaN
[1] NA
> sum(x, na.rm=TRUE)    # this way we can ignore NA and NaN values
[1] 10
> mean(x, na.rm=TRUE)
[1] 2.5
> prod(x, na.rm=TRUE)
[1] 24
# R Program to Take Input From User
# readline() returns what the user typed as a character string.
my.name <- readline(prompt = "Enter name: ")
my.age <- readline(prompt = "Enter age: ")

# convert character into integer so that arithmetic can be done on the age
my.age <- as.integer(my.age)

print(paste("Hi,", my.name, "next year you will be", my.age + 1, "years old."))
#R Program to Generate Random Number from Standard Distributions
> runif(1)    # generates 1 random number
[1] 0.3984754
> runif(3)    # generates 3 random numbers
[1] 0.8090284 0.1797232 0.6803607
> runif(3, min=5, max=10)    # define the range between 5 and 10
[1] 7.099781 8.355461 5.173133
#R Program to Sample from a Population
> x
[1]  1  3  5  7  9 11 13 15 17
> # sample 2 items from x
> sample(x, 2)
[1] 13  9
#R Program to Find Minimum and Maximum
> x
[1]  5  8  3  9  2  7  4  6 10
> # find the minimum
> min(x)
[1] 2
> # find the maximum
> max(x)
[1] 10
> # find the range
> range(x)
[1]  2 10
# Find factors of a number: print every i from 1 to x that divides x exactly.
# NOTE(review): `x` is not defined in this snippet; it must hold a positive
# integer before this code runs.
print(paste("The factors of", x, "are:"))
# seq_len(x) is empty when x is 0, whereas 1:x would count down 1, 0
for (i in seq_len(x)) {
  if ((x %% i) == 0) {
    print(i)
  }
}
# Program to check if the input number is prime or not.

# Return TRUE when `n` is a prime number and FALSE otherwise.
# Prime numbers are greater than 1, so NA, 0, 1 and negative values are
# reported as not prime.
is_prime <- function(n) {
  if (is.na(n) || n < 2) {
    return(FALSE)
  }
  if (n < 4) {
    return(TRUE)  # 2 and 3 are prime; this also keeps the loop range ascending
  }
  # check for factors: a composite number always has one no larger than its
  # square root, so the search can stop there
  for (i in 2:floor(sqrt(n))) {
    if ((n %% i) == 0) {
      return(FALSE)
    }
  }
  TRUE
}

# take input from the user
num <- as.integer(readline(prompt = "Enter a number: "))

if (is.na(num)) {
  # as.integer() gives NA for empty or non-numeric input, and if() cannot
  # branch on NA
  print("Please enter a valid integer")
} else if (is_prime(num)) {
  print(paste(num, "is a prime number"))
} else {
  print(paste(num, "is not a prime number"))
}
|
# Program to check if the input number is odd or even.
# A number is even if division by 2 gives a remainder of 0.
# If the remainder is 1, it is odd.
num <- as.integer(readline(prompt = "Enter a number: "))

if (is.na(num)) {
  # as.integer() gives NA for empty or non-numeric input, and if() cannot
  # branch on NA
  print("Please enter a valid integer")
} else if ((num %% 2) == 0) {
  print(paste(num, "is Even"))
} else {
  print(paste(num, "is Odd"))
}
|
# In this program, we input a number, check if the number is positive,
# negative or zero, and display an appropriate message.
num <- as.double(readline(prompt = "Enter a number: "))

if (is.na(num)) {
  # as.double() gives NA for empty or non-numeric input, and if() cannot
  # branch on NA
  print("Please enter a valid number")
} else if (num > 0) {
  print("Positive number")
} else if (num == 0) {
  print("Zero")
} else {
  print("Negative number")
}
|
# Program to find the factorial of a number entered by the user.

# take input from the user
num <- as.integer(readline(prompt = "Enter a number: "))
factorial <- 1

# check if the number is missing, negative, zero or positive
if (is.na(num)) {
  # as.integer() gives NA for empty or non-numeric input, and if() cannot
  # branch on NA
  print("Please enter a valid integer")
} else if (num < 0) {
  print("Sorry, factorial does not exist for negative numbers")
} else if (num == 0) {
  print("The factorial of 0 is 1")
} else {
  # multiply the running product by 1, 2, ..., num
  for (i in seq_len(num)) {
    factorial <- factorial * i
  }
  print(paste("The factorial of", num, "is", factorial))
}
# Program to find the multiplication table (from 1 to 10)
# of a number input by the user.

# take input from the user
num <- as.integer(readline(prompt = "Enter a number: "))

if (is.na(num)) {
  # as.integer() gives NA for empty or non-numeric input; report it instead of
  # printing a table of NA values
  print("Please enter a valid integer")
} else {
  # use for loop to iterate 10 times
  for (i in 1:10) {
    print(paste(num, "x", i, "=", num * i))
  }
}
# Program to print the Fibonacci sequence up to a number of terms chosen by
# the user.

# take input from the user
nterms <- as.integer(readline(prompt = "How many terms? "))

# first two terms
n1 <- 0
n2 <- 1
count <- 2

# check if the number of terms is valid; NA (empty or non-numeric input) is
# rejected here as well because if() cannot branch on NA
if (is.na(nterms) || nterms <= 0) {
  print("Please enter a positive integer")
} else if (nterms == 1) {
  print("Fibonacci sequence:")
  print(n1)
} else {
  print("Fibonacci sequence:")
  print(n1)
  print(n2)
  while (count < nterms) {
    nth <- n1 + n2
    print(nth)
    # update values: shift the pair one step along the sequence
    n1 <- n2
    n2 <- nth
    count <- count + 1
  }
}
# Program to make a simple calculator that can add, subtract, multiply
# and divide using functions.

# Each helper takes two numbers and returns the result of one operation.
add <- function(x, y) {
  x + y
}

subtract <- function(x, y) {
  x - y
}

multiply <- function(x, y) {
  x * y
}

divide <- function(x, y) {
  x / y
}

# take input from the user
print("Select operation.")
print("1.Add")
print("2.Subtract")
print("3.Multiply")
print("4.Divide")

choice <- as.integer(readline(prompt = "Enter choice[1/2/3/4]: "))
num1 <- as.integer(readline(prompt = "Enter first number: "))
num2 <- as.integer(readline(prompt = "Enter second number: "))

if (is.na(choice) || !(choice %in% 1:4) || is.na(num1) || is.na(num2)) {
  # switch() returns NULL for a choice outside 1..4, which would print a line
  # with no operator and no result, so bad input is reported instead
  print("Invalid input")
} else {
  # the numeric choice selects the matching entry by position
  operator <- switch(choice, "+", "-", "*", "/")
  result <- switch(choice,
                   add(num1, num2),
                   subtract(num1, num2),
                   multiply(num1, num2),
                   divide(num1, num2))
  print(paste(num1, operator, num2, "=", result))
}

# Return "Positive", "Negative" or "Zero" depending on the sign of x.
check <- function(x) {
  if (x > 0) {
    result <- "Positive"
  } else if (x < 0) {
    result <- "Negative"
  } else {
    result <- "Zero"
  }
  return(result)
}
# Program to add up the whole numbers from 1 to a number entered by the user.

# take input from the user
num <- as.integer(readline(prompt = "Enter a number: "))

# NA (empty or non-numeric input) is rejected together with negative numbers
# because if() cannot branch on NA
if (is.na(num) || num < 0) {
  print("Enter a positive number")
} else {
  sum <- 0
  # use while loop to iterate until zero
  while (num > 0) {
    sum <- sum + num
    num <- num - 1
  }
  print(paste("The sum is", sum))
}
https://learn.swapnil.pw/