# Learn R Programming

 

# Variables and printing ----
# A variable keeps only its most recent value, so var1 ends up as 50.
var1 <- 5
var1 <- 50
print(var1)  # [1] 50

# var4 must exist before it is used. The original printed var1 + var4 ahead
# of this assignment, which stops with "object 'var4' not found".
var <- 55
var4 <- 55
print(var1 + var4)  # [1] 105

# print() displays a single object; its second positional argument is
# `digits`, so passing two values fails:
# print(var1, var4)
# Error in print.default(var1, var4) : invalid printing digits 55

# cat() accepts several values and writes them separated by a space.
cat(var1, var4)  # 50 55

# Quoted text is printed literally; it is not evaluated.
print("var1 + var4")

# Mix a label with a computed value.
cat("var1 + var4=", var1 + var4)
# var1 + var4= 105

# class(): report the type of the value a variable currently holds ----
# A variable has to be assigned before class() can inspect it. The original
# called class(var2) before var2 existed; a list is assigned here so the
# first example produces the "list" result its comment shows.
var2 <- list(6)
print(class(var2))  # [1] "list"

var2 <- 6
print(class(var2))  # [1] "numeric"

# Writing a decimal point does not change the type.
var2 <- 6.0
print(class(var2))  # [1] "numeric"

# The L suffix creates an integer.
var2 <- 6L
print(class(var2))  # [1] "integer"

# Anything inside quotes is text, even if it looks like a number.
var2 <- "6L"
print(class(var2))  # [1] "character"

var2 <- TRUE
print(class(var2))  # [1] "logical"

# Arithmetic operators ----
# %% is modulo: the remainder left over after division.
var2 <- 15
var1 <- 10
print(var1 %% var2)

var1 <- 100
print(var1 %% var2)

var1 <- 95
print(var1 %% var2)

# ^ raises the left operand to the power of the right operand.
var1 <- 5
print(var1 ^ var2)

# Relational (comparison) operators: the answer is always logical ----
var3 <- 15
var2 <- 20
var1 <- 15

print(var1 > var2)   # is var1 greater than var2? FALSE
print(var1 >= var3)
print(var1 <= var3)
print(var1 == var3)  # a double = asks "are these equal?"
print(var1 != var3)

# Logical operators: both the inputs and the output are logical ----
# AND: "I will do work 1 and work 2 today" - doing only work 1 means No.
# OR:  "I will do work 1 or work 2 today"  - doing only work 1 means Yes.
print(var1 == var3 | var1 != var3)
print(var1 == var3 & var1 != var3)


#CONDITIONAL STATEMENTS

var1 <- 0

# if on its own: the body runs only when the condition is TRUE, so nothing is
# printed for 0.
if (var1 > 0) {
  print("Its positive")
}

# if / else: exactly one of the two branches runs.
if (var1 > 0) {
  print("Its positive")
} else {
  print("Its not positive")
}

# if / else if / else: the first condition that is TRUE wins, and the final
# else catches everything that is left (here: zero).
if (var1 > 0) {
  print("Its positive")
} else if (var1 < 0) {
  print("Its negative")
} else {
  print("Its zero")
}



#Collections: Vectors, Lists, Matrices, Arrays, Factors & DataFrames

#Vectors: will store multiple values of same datatype

# Vector: several values of one data type.
vec1 <- c(45, 56, 36)
print(vec1)

# List: elements may have different data types, including whole vectors.
list1 <- list(45, 56, "Hello", c(2, 4, 6))
print(list1)

# Matrix: two dimensions; byrow = FALSE fills it column by column.
mat1 <- matrix(
  c(2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 11, 11),
  nrow = 3, ncol = 4, byrow = FALSE
)
print(mat1)

# Array: more than two dimensions. The 12 values are recycled to fill the
# 2 x 4 x 2 x 2 = 32 cells.
arr1 <- array(
  c(2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 11, 11),
  dim = c(2, 4, 2, 2)
)
print(arr1)

# Factor: categorical values drawn from a fixed set of levels.
gender <- factor(c("M", "M", "M", "F", "F", "F"))
print(class(gender))
print(nlevels(gender))

# Data frame: a table whose columns are equal-length vectors.
players_stats <- data.frame(
  ID = c(10, 20, 30),
  Name = c("Sachin", "Virat", "Dhoni")
)
print(players_stats)



#membership:  %in% : check if left side value is in right side or not

cities <- c("Delhii", "New York", "London")

# %in% needs an exact match: "Delhi" differs from the stored "Delhii", so
# this prints FALSE.
print("Delhi" %in% cities)


avg <- 98

# Grade bands: avg >= 80: A, 70-80: B, 60-70: C, 50-60: D, 40-50: E,
# below 40: Failed. The bands are tested from the top down, so each branch
# only needs its lower bound.
if (avg >= 80) {
  print("Grade: A")
  # Nested checks hand out extra rewards inside the top band.
  if (avg >= 90) {
    print("You win special certificate!")
    if (avg >= 95) {
      print("You win medal")
    }
  }
} else if (avg >= 70) {
  print("Grade: B")
} else if (avg >= 60) {
  print("Grade: C")
} else if (avg >= 50) {
  print("Grade: D")
} else if (avg >= 40) {
  print("Grade: E")
} else {
  print("Failed")
}

 

# switch() with a number picks the alternative at that position, so 3 selects
# the third label.
result <- 3
val1 <- switch(result,
  "Grade A", "Grade B", "Grade C", "Grade D", "Grade E", "Grade F"
)
cat("Result – ", val1)

 

 

#Loops – to repeat:  

#repeat: keep repeating – break when a condition is met -EXIT Controlled

#while: will check for the condition and then repeat: ENTRY Controlled 

#for (exactly  how many times to run)

 

# repeat: the body always runs first; the loop ends when break is reached.
# Prints 1 to 10.
start <- 1
repeat {
  print(start)
  if (start == 10) {
    break
  }
  start <- start + 1
}

# while: the condition is tested before each pass. Prints 11 to 20.
start <- 11
while (start <= 20) {
  print(start)
  start <- start + 1
}

# for: walk over the elements of a vector, here the first five capital letters.
words <- head(LETTERS, 5)
for (i in words) {
  print(i)
}

# for over a numeric sequence: 1, 4, 7, 10.
numbers <- seq(from = 1, to = 10, by = 3)
for (i in numbers) {
  print(i)
}

 

# Decide whether num is prime by trying every divisor from 2 up to num - 1.
num <- 30
start <- 2
isPrime <- num >= 2  # 0, 1 and negative numbers are never prime

repeat {
  # Stop once every candidate below num has been tried. Checking this first
  # keeps num = 2 from being divided by itself and reported as not prime.
  if (start >= num) {
    break
  }
  if (num %% start == 0) {
    isPrime <- FALSE
    break
  }
  start <- start + 1
}

if (isPrime) {
  print("Number is Prime")
} else {
  print("Number is not Prime")
}

 

 

## Assignment 1: Do the above with WHILE and FOR

## Assignment 2: Extend the same logic (one of the 3) to generate prime numbers

## between 1000 and 1500



# Print every prime number between 10 and 20.
for (num in 10:20) {
  Isprime <- TRUE
  # Candidate divisors run from 2 up to num - 1. The original tested the
  # fixed value 53 instead of num and started at 3, so every number in the
  # range was printed.
  for (a in 2:(num - 1)) {
    if (num %% a == 0) {
      Isprime <- FALSE
      break
    }
  }
  if (Isprime) {
    print(num)
  }
}

########################


#Built-in function

# print() is a built-in function; the value placed between the parentheses is
# its parameter. Called with no argument it stops with an error, so a value
# is supplied here.
print("Hello")  # parameter


# Print whether `num` is a prime number.
#
# Args:
#   num: a single whole number.
# Returns:
#   The printed message, invisibly (it is the value of the final print()).
myfunc.generatePrime <- function(num) {
  # Numbers below 2 are not prime; 2 and 3 are prime and need no loop.
  isPrime <- num >= 2
  if (num > 3) {
    # A composite number always has a divisor no larger than its square root.
    # Guarding with num > 3 also avoids the original's 2:(num - 1), which
    # counts downwards for num <= 2 and so divided 2 by itself.
    for (i in 2:floor(sqrt(num))) {
      if (num %% i == 0) {
        isPrime <- FALSE
        break
      }
    }
  }
  if (isPrime) {
    print("num is prime")
  } else {
    print("num is not Prime")
  }
}


# mean() is a built-in function; here it averages the integers 1 to 100.
val <- mean(1:100)

print(val)


# Run the user-defined prime check on 30; it prints its verdict.
myfunc.generatePrime(30)


# Test whether `num` is a prime number.
#
# Args:
#   num: a single whole number.
# Returns:
#   TRUE when num is prime, FALSE otherwise (including every num below 2).
myfunc.checkPrime2 <- function(num) {
  if (num < 2) {
    return(FALSE)
  }
  # 2 and 3 are prime. Handling them here also avoids the original's
  # 2:(num - 1), which counts downwards for num = 2 and divided 2 by itself.
  if (num < 4) {
    return(TRUE)
  }
  # A composite number always has a divisor no larger than its square root,
  # and the first divisor found settles the answer.
  for (i in 2:floor(sqrt(num))) {
    if (num %% i == 0) {
      return(FALSE)
    }
  }
  TRUE
}


# The function only returns TRUE or FALSE; the caller decides what to print.
output <- myfunc.checkPrime2(53)
if (output) {
  print("num is prime")
} else {
  print("num is not Prime")
}

# Reuse the same check to list every prime between 1000 and 1300.
for (num in 1000:1300) {
  output <- myfunc.checkPrime2(num)
  if (output) {
    print(num)
  }
}

######   #####################  ################

#built in functions

# seq(10, 90) produces the whole numbers from 10 to 90.
print(seq(10,90))

# max() returns the largest value in 10:90.
print(max(10:90))

# mean() returns the average of 10:90.
print(mean(10:90))

 

#user defined functions

# Print four numbers followed by their sum.
#
# Args:
#   num1, num2, num3, num4: the numbers to add; they default to 1, 2, 4 and 6.
# Returns:
#   The sum, invisibly (it is the value of the final print() call).
sum.func <- function(num1 = 1, num2 = 2, num3 = 4, num4 = 6) {
  cat("Number 1 = ", num1)
  cat("\n Number 2 = ", num2, "\n")
  cat("Number 3 = ", num3)
  cat("\n Number 4 = ", num4, "\n")
  # The original computed num1 * num2 and never used num3 or num4, which
  # contradicts the function's name; all four arguments are added instead.
  result <- num1 + num2 + num3 + num4
  print(result)
}

# Positional call: 40 and 30 are matched to num1 and num2 in order, while
# num3 and num4 keep their defaults.

sum.func(40,30)

# Named call: only num2 and num4 are supplied, so num1 and num3 keep their
# defaults.

sum.func(num2=40,num4=30)

 

## Assignments: Logic built using loops- convert them to

## functions

 

# #####################

# Strings ----
a <- "Whats your name"
# Inside double quotes an apostrophe needs no escaping.
b <- "What's your name?"

# paste() joins strings, placing `sep` between them.
print(paste(a, b, sep = ":"))

# substring() extracts characters 2 to 6, counting from 1.
print(substring(a, 2, 6))

print(tolower(a))
print(toupper(a))

# Vector indexing ----
# A vector holds a single type, so TRUE and 5 are converted to text here.
vector1 <- c("Monday", TRUE, 5, "Thursday")
print(vector1)
print(vector1[2])         # the second element
print(vector1[-2])        # everything except the second element
print(vector1[c(2, 4)])   # the second and fourth elements

# A list keeps each element's own type.
list1 <- list("Monday", TRUE, 5, "Thursday")
print(list1)

 

# VIDEO RECORDING OF THE SESSION

library(ggplot2)

# Revenue per city, one row per bar.
dataset2 <- data.frame(
  city = c("City A", "City B", "City C"),
  revenue = c(200, 220, 190)
)

# stat = "identity" draws each bar at the height given by the revenue column
# instead of counting rows.
ggplot(dataset2, aes(x = city, y = revenue)) +
  geom_bar(stat = "identity")

 

##############################

# VECTORS

# Mixing numbers with text converts every element to text.
vec1 <- c(2, 4, "HELLO", 5, 6)
print(vec1)

# Built-in ways to generate a vector ----
# from:to counts up in steps of 1.
vec2 <- 5:50
print(vec2)

# With decimals the steps are still 1, starting at 5.4 and stopping at the
# last value that does not exceed 30.8.
vec2 <- 5.4:30.8
print(vec2)

# seq() takes a start, an end and an increment.
vec3 <- seq(5, 30.2, by = 0.9)
print(vec3)

# Pick several positions at once with a vector of indices.
vec1 <- c(2, 4, "HELLO", 5, 6, 9, 11)
print(vec1[c(2, 3, 6)])

# Arithmetic works element by element.
vec1 <- c(2, 4, 6, 8, 10)
vec2 <- c(1, 2, 1, 2, 0)
print(vec1 + vec2)

# The shorter vector is recycled: 1, 2, 1, 2, 1, 2.
vec1 <- c(2, 4, 6, 8, 10, 12)
vec2 <- c(1, 2)
print(vec1 + vec2)

# sort() is ascending by default; decreasing = TRUE reverses the order.
vec1 <- c(2, 4, 16, 18, 10, 12)
vec3 <- sort(vec1)
print(vec3)
vec3 <- sort(vec1, decreasing = TRUE)
print(vec3)

 

## LIST

# A list can hold values of different types, including a whole vector.
list1 <- list(55, "Hello", c(2, 4, 6), 5.4)
print(list1)

# Single brackets with several indices return a shorter list.
print(list1[c(1, 3)])

list2 <- list(33, 99)

# c() joins two lists end to end.
mergedlist <- c(list1, list2)
print(mergedlist)

 

 

###MATRICES

# Same six values, filled column by column ...
mat1 <- matrix(c(2, 4, 6, 8, 10, 12), nrow = 3, byrow = FALSE)
print(mat1)

# ... and row by row.
mat2 <- matrix(c(2, 4, 6, 8, 10, 12), nrow = 3, byrow = TRUE)
print(mat2)

# +, -, * and / all work element by element on matrices of equal shape.
# (The original subtraction used a typographic dash, which R cannot parse.)
print(mat1 + mat2)
print(mat1 - mat2)
print(mat1 * mat2)
print(mat1 / mat2)

 

## ARRAY

# A 2 x 2 x 2 array has 8 cells, so only the first 8 values of 2:20 are used.
arr1 <- array(2:20, dim = c(2, 2, 2))
print(arr1)

# One cell: row 1, column 2 of the first 2 x 2 slice.
print(arr1[1, 2, 1])

# An empty index selects everything along that dimension: all rows of
# column 2 in the first slice.
print(arr1[, 2, 1])

# c(1,2,1)

 

##  Factors

# factor() stores each distinct label once as a level.
regions <- factor(c("N", "S", "S", "W", "N", "E", "E", "E"))

print(is.factor(regions))

 

 

# Each argument becomes a column; all columns must have the same length.
dataset1 <- data.frame(
  quarter = c("Q1", "Q2", "Q3", "Q4"),
  revenue = c(100, 150, 200, 170),
  fruits = c("Apple", "Banana", "Mango", "Oranges")
)
print(dataset1)

# [rows, columns]: rows 2 to 3 with every column.
shorterrow <- dataset1[2:3, ]
print(shorterrow)

# Every row, but only columns 2 and 3.
print(dataset1[, c(2, 3)])

 

# Step 1: load the raw data ----
# NOTE: the folder is machine-specific; point it at wherever the CSV lives.
setwd("D:\\dataset")
dataset <- read.csv("1_Data_PreProcessing.csv")
print(dataset)

# Step 2: fill missing values with the column mean ----
# ave() without a grouping variable applies FUN to the whole column, so each
# NA is replaced by the mean of the non-missing entries.
dataset$Salesperson <- ifelse(
  is.na(dataset$Salesperson),
  ave(dataset$Salesperson, FUN = function(x) mean(x, na.rm = TRUE)),
  dataset$Salesperson
)
dataset$Quotation <- ifelse(
  is.na(dataset$Quotation),
  ave(dataset$Quotation, FUN = function(x) mean(x, na.rm = TRUE)),
  dataset$Quotation
)

# Connecting to SQL Server instead of a CSV needs: ip address, username,
# password and database name. Install and load the RODBC library, then:
# sql_connection = odbcConnect("SQLSERVERODBC")
# sqlQuery(sql_connection, "Select * from table1")

# Handling the categorical value: store Region as a factor.
dataset$Region <- factor(dataset$Region)

 

#step 3: breaking into training and test set

library(caTools)

# Step 3: break the data into a training set (80%) and a test set (20%).
split <- sample.split(dataset$Win, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Step 4: Feature Scaling - bring the columns into a similar range.
### 1. Divide the column with higher values (in this case quotation) by 1000
### 2. Min-Max Scaling - values range between 0 and 1
### 3. Z-score normalization - preferred
# scale() records the mean and standard deviation it used as attributes.
# The test set is transformed with the training set's values so both sets
# share one scale; the original re-estimated them from the test set alone.
train_scaled <- scale(training_set[, 2:3])
training_set[, 2:3] <- train_scaled
test_set[, 2:3] <- scale(
  test_set[, 2:3],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

# An explicit print() also shows the result when the script is source()d.
print(test_set)

 

# NOTE: the folder is machine-specific; point it at wherever the CSV lives.
setwd("D:\\dataset")
dataset <- read.csv("2_Marks_Data.csv")

scatter.smooth(
  x = dataset$Hours, y = dataset$Marks,
  main = "Hours Studied v Marks Obtained"
)

# Split the dataset into a training set and a test set.
library(caTools)
split <- sample.split(dataset$Marks, SplitRatio = 0.8)
# `split == TRUE` is a row filter. The original wrote `split=TRUE`, which
# subset() receives as an unused named argument, so both sets ended up being
# the complete dataset.
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Create the regression object: Marks explained by Hours.
regressor <- lm(formula = Marks ~ Hours, data = training_set)
summary(regressor)

# y = 20.76 + 7.57x

#

 

# While solving machine learning problem – 

## 1. Is my data in a ready state to run the algorithm

## 2. Run the algorithm and check the values

####  2.1. Is this the best performance of this model (can I improve this model)

####  2.2: Is this the best model

## 3. Evaluate the performance of the algorithm

## RMSE and R-squared (R-squared ranges from 0 to 1) – closer to 1 means better performance

 

## training performance v test performance – over fitting and under fitting

# NOTE: the folder is machine-specific; point it at wherever the CSV lives.
setwd("D:\\dataset")
dataset <- read.csv("2_Marks_Data.csv")
print(dataset)

scatter.smooth(
  x = dataset$Hours, y = dataset$Marks,
  main = "Hours Studied v Marks Obtained"
)

# Split the dataset into a training set and a test set.
library(caTools)
split <- sample.split(dataset$Marks, SplitRatio = 0.75)

# `split` is a logical vector with one entry per row: TRUE rows go to the
# training set and the remaining rows to the test set.
training_set <- dataset[split, ]
print(training_set)
test_set <- dataset[!split, ]
print(test_set)

# Create the regression object: Marks explained by Hours.
regressor <- lm(formula = Marks ~ Hours, data = training_set)
summary(regressor)

# y = 20.76 + 7.57x

#

 

# While solving machine learning problem – 

## 1. Is my data in a ready state to run the algorithm

## 2. Run the algorithm and check the values

####  2.1. Is this the best performance of this model (can I improve this model)

####  2.2: Is this the best model

## 3. Evaluate the performance of the algorithm

## RMSE and R-squared (R-squared ranges from 0 to 1) – closer to 1 means better performance

 

## training performance v test performance – over fitting and under fitting

 

# Test performance: predict marks for rows the model has not seen.
y_predict <- predict(regressor, newdata = test_set)

# Put the predictions next to the actual values.
comparison <- cbind(test_set, y_predict)
print(comparison)

# Mean squared error: the average squared gap between actual and predicted.
# (The original subtraction used a typographic dash, which R cannot parse.)
mse <- mean((comparison$Marks - comparison$y_predict)^2)
print(mse)

# Mean absolute percentage error; MAPE() takes the predictions first.
library(MLmetrics)
mape.value <- MAPE(comparison$y_predict, comparison$Marks)
print(mape.value)

 

 

# Training performance: predict marks for the rows the model was fitted on,
# to compare against the test performance.
y_predict <- predict(regressor, newdata = training_set)

# The predictions must sit next to the rows they were made for. The original
# bound them to test_set, which has a different number of rows.
comparison <- cbind(training_set, y_predict)
print(comparison)

# Mean squared error: the average squared gap between actual and predicted.
# (The original subtraction used a typographic dash, which R cannot parse.)
mse <- mean((comparison$Marks - comparison$y_predict)^2)
print(mse)

# Mean absolute percentage error; MAPE() takes the predictions first.
library(MLmetrics)
mape.value <- MAPE(comparison$y_predict, comparison$Marks)
print(mape.value)