PART 2: DATA SCIENCE NOV 2023


# NUMPY
# pip install numpy
import numpy as np
nums = range(16)
nums = np.reshape(nums,(8,2))
print(nums)
nums = np.reshape(nums,(4,4))
print(nums)
print("Shape: Rows = ", nums.shape[0], "and columns = ", nums.shape[1])
# indexing
print(nums[1,2], nums[-3,-2])
print(nums[1]) # 2nd row
print(nums[:,1]) # : rows from 0th to (n-1)th
print(nums[-1], nums[:,-2], nums[-1,-2])

# to give your own set of values, you need to provide in terms of list
l1 = [[1,5,7],[2,4,9],[1,1,3],[3,3,2]]
# array is a function to convert list into numpy
mat1 = np.array(l1)
print(mat1)

print(np.zeros((3,3)))
print(np.ones((3,3)))
print(np.full((5,7),2.0))
print(np.full((5,7),9))

# eye – identity matrix: square matrix with 1 on its main diagonal
mat1 = np.eye(5)
print(mat1)

# NUMPY
import numpy as np
# to give your own set of values, you need to provide in terms of list
l1 = [[1,5,7],[2,4,9],[1,1,3],[3,3,2]]
# array is a function to convert list into numpy
mat1 = np.array(l1) # 4 * 3 – shape
print(mat1)
l2 = [[2,3,4],[2,1,2],[5,2,3],[3,2,2]]
# array is a function to convert list into numpy
mat2 = np.array(l2)
print(mat2)

# Matrices operations
print(mat1 + mat2)
print(np.add(mat1, mat2))

print(mat1 - mat2)
print(np.subtract(mat1, mat2))

print(mat1 * mat2)
print(np.multiply(mat1, mat2))

print(mat1 / mat2)
print(np.divide(mat1, mat2))

# actual matrix multiplication is done using matmul()
l3 = [[2,3,4],[2,1,2],[5,2,3]]
# array is a function to convert list into numpy
mat3 = np.array(l3)
print(mat3)
print("Matrix Multiplication")
print(np.matmul(mat1, mat3))
print(mat1 @ mat3)
## calculating determinant

l4 = [[1,3,5],[1,3,1],[2,3,4]]
mat5 = np.array(l4)
det_mat5 = np.linalg.det(mat5)
print("Determinant of matrix 5 is", det_mat5)
print("Inverse of matrix 5 is: \n", np.linalg.inv(mat5))

'''
Linear Algebra Equation:
x1 + 5*x2 = 7
-2*x1 - 7*x2 = -5

Solution: x1 = -8, x2 = 3
'''
coeff_mat = np.array([[1,5],[-2,-7]])
#var_mat = np.array([[x1],[x2]])
result_mat = np.array([[7],[-5]])
# equation here is coeff_mat * var_mat = result_mat [eg: 5 * x = 10]
# which is, var_mat = coeff_mat inverse * result_mat
det_coeff_mat = np.linalg.det(coeff_mat)
if det_coeff_mat != 0:
    var_mat = np.linalg.inv(coeff_mat) @ result_mat
    print("X1 = ", var_mat[0,0])
    print("X2 = ", var_mat[1,0])
else:
    print("Solution is not possible")
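# Note: NumPy can also solve this system in one step with np.linalg.solve, which is
# numerically preferable to explicitly inverting the matrix (a small sketch reusing
# the coeff_mat and result_mat defined above):
var_mat2 = np.linalg.solve(coeff_mat, result_mat)
print("Using np.linalg.solve: X1 =", var_mat2[0,0], ", X2 =", var_mat2[1,0])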

# scipy = scientific python
# pip install scipy
'''
# Inequality = OPTIMIZATION or MAXIMIZATION / MINIMIZATION PROBLEM
Computer Parts Assembly:
Laptops & Desktops
profit: 1000, 600
objective: either maximize profit or minimize cost

constraints:
1. Demand: 500, 600
2. Parts: Memory card: 5000 cards available
3. Manpower: 25000 minutes
'''

'''
Optimization using Scipy
let's assume d = desktop, n = notebooks

Constraints:
1. d + n <= 10000
2. 2d + n <= 15000
3. 3d + 4n <= 25000

profit: 1000 d + 750 n => maximize
-1000 d - 750 n => minimize
'''
import numpy as np
from scipy.optimize import minimize, linprog
d = 1
n = 1
profit_d = 1000
profit_n = 750
profit = d * profit_d + n * profit_n
obj = [-profit_d, -profit_n]
lhs_con = [[1,1],[2,1],[3,4]]
rhs_con = [10000, 15000, 25000]

boundary = [(0, float("inf")),   # boundary condition for # of desktops
            (10, 200000)]        # we just added some limit for notebooks
opt = linprog(c=obj, A_ub=lhs_con, b_ub=rhs_con, bounds=boundary, method="revised simplex")
print(opt)
if opt.success:
    print(f"Number of desktops = {opt.x[0]} and number of laptops = {opt.x[1]}")
    print("Maximum profit that can be generated = ", -1 * opt.fun)
else:
    print("Solution can not be generated")

### ### ### PANDAS
# Pandas – dataframe which resembles Table structure
# pip install pandas
import pandas as pd
df1 = pd.DataFrame()
print(df1)
print(type(df1))

# fruit production
data = [["Apple", 15000, 11000, 6000],
        ["Banana", 18000, 22000, 29000],
        ["Mango", 2, 900, 19000],
        ["Guava", 19000, 11000, 25000]]

fruit_production = pd.DataFrame(data)
print(fruit_production)
print("Slicing 1:\n")
print(fruit_production.iloc[1:3,2:]) # based on index
print("Slicing 2:\n")
print(fruit_production.loc[1:3,2:]) # based on labels (names)

fruit_production = pd.DataFrame(data,
                                columns=["Fruits","January","February","March"])
print(fruit_production)

fruit_production = pd.DataFrame(data,
                                columns=["Fruits","January","February","March"],
                                index=["Fruit 1","Fruit 2","Fruit 3","Fruit 4"])
print(fruit_production)

## dataframe.loc[] and dataframe.iloc[]

print("Slicing 1:\n")
print(fruit_production.iloc[1:3,2:]) # based on index
print("Slicing 2:\n")
print(fruit_production.loc[["Fruit 2","Fruit 3"],["February","March"]]) # based on labels (names)

### ###

# pandas
# pip install pandas
import pandas as pd
l1 = [10,20,30,40,50]
l1 = [["Sachin",101,20000,"BATSMAN"],["Kapil",501,12000,"BOWLER"],
      ["Sunil",12,21000,"BATSMAN"],["Zaheer",725,2000,"BOWLER"]]
df1 = pd.DataFrame(l1, columns=["Player","Wickets","Runs","Type"],
                   index=["Player 1","Player 2","Player 3","Player 4"])
print(df1)

d1 = {'Apple':[12000,11000,13000],
      'Banana': [17000,18000,19000],
      'Mango':[11000,13000,15000]}
df2 = pd.DataFrame(d1)
print(df2)

# creating dataframe from list of dictionary
data1 = [{"Guava":9000, "Oranges": 5000},
         {"Guava":8000, "Oranges": 7000},
         {"Guava":10000, "Oranges": 6000}]
df3 = pd.DataFrame(data1)
print(df3)

print(df3.iloc[0,:]) #first row and all column values
print(df3.iloc[:,0])

print(df2.iloc[:,0:2])
print(df2.iloc[[0,2],[0,2]])

#
print(df2.loc[[0,2],["Apple","Mango"]])
print(df1.loc[["Player 1","Player 4"],["Player","Runs"]])

df2.iloc[2,0] = 14000
print(df2)
print("========= DF1 =============")
df1['Avg'] = df1['Runs'] / df1["Wickets"]
print(df1)
print("Reading data from DF1: ")
df4 = df1[df1.Player != 'Sachin'] # filter - where clause
print("\n\n New dataset without Sachin: \n", df4)
df1 = df1.drop("Player", axis=1) # axis default is 0
# unlike pop() and del - drop() returns a new dataframe
print(df1)


print("Average Wickets of all the players = ", df1['Wickets'].mean())
print("Average Wickets of players by type = \n\n", df1.groupby('Type').mean())
# axis = 0 refers to rows
# axis = 1 refers to columns

print("\n\nDropping columns from DF1: ")
del df1['Wickets'] # dropping column Wickets using del
print(df1)

df1.pop('Runs') # dropping column using pop
print(df1)
#

import pandas as pd

ud_df = pd.read_csv("D:/datasets/gitdataset/user_device.csv")
print(ud_df) # 272 rows x 6 columns
print("Rows: ", ud_df.shape[0])
print("Columns: ", ud_df.shape[1])

print(ud_df.tail(1))
print(ud_df.head(1))

use_df = pd.read_csv("D:/datasets/gitdataset/user_usage.csv")
print(use_df) # 240 rows x 4 columns

result_df = pd.merge(use_df[['use_id','monthly_mb','outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id')
print(result_df) # [159 rows x 9 columns] = ud_df: 159 + 113, use_df = 159 + 81

result_df = pd.merge(use_df[['use_id','monthly_mb','outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='outer')
print(result_df)

result_df = pd.merge(use_df[['use_id','monthly_mb','outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='left')
print(result_df)

result_df = pd.merge(use_df[['use_id','monthly_mb','outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='right')
print(result_df)

## Working with Pandas – Example ##
import pandas as pd
import numpy as np
df = pd.read_csv("D:/datasets/gitdataset/hotel_bookings.csv")
print(df.shape)
print(df.dtypes)
'''
numeric - int, float
categorical - 1) Nominal - there is no order 2) Ordinal - here order is important
'''
df_numeric = df.select_dtypes(include=[np.number])
print(df_numeric)

df_object = df.select_dtypes(exclude=[np.number])
print(df_object) # categorical and date columns

print(df.columns)
for col in df.columns:
    missing = np.mean(df[col].isnull())
    if missing > 0:
        print(f"{col}: {missing}")

'''
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing - making data ready for use
   a. Handle missing values
   b. Feature scaling - scale the values in the columns to a similar range
   c. Outliers / data correction
   d. Handling categorical data:
      i. Encode the data to convert text to numbers
         East = 0, North = 1, South = 2, West = 3
      ii. Column Transform into multiple columns
      iii. Delete any one column
4. EDA - Exploratory Data Analysis: to understand the data
5. MODEL BUILDING - Divide the data into train and test
'''
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
print(df)

Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
a. Handle missing values
b. Feature scaling – scale the values in the column to similar range
c. Outliers / data correction
d. handling categorical data:
i. Encode the data to convert text to number
East = 0, North = 1, South = 2, West = 3
ii. Column Transform into multiple columns
iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING –
a. Divide the train and test
b. Run the model
6. EVALUATE THE MODEL:
a. Measure the performance of each algorithm on the test data
b. Metric to compare: based on Regression (MSE, RMSE, R square) or
classification (confusion matrix -accuracy, sensitivity..)
c. select the best performing model
7. DEPLOY THE BEST PERFORMING MODEL

Hypothesis test:
1. Null Hypothesis (H0): starting statement (objective)
   Alternate Hypothesis (H1): alternate of H0

Z or T test: used when the variable being compared is numeric (compare means)
Chi square test: used when both variables are categorical

e.g. North zone: 50 WIN 5 LOSS - p = 0.005

# simple (single value) v composite (specifies a range)
# two tailed test v one tailed test [H0: mean = 0,
H1 Left Tailed: mean < 0
H1 Right Tailed: mean > 0]
# level of significance:
alpha value: 0.05 (i.e. a 95% confidence interval)
p value: if p value < 0.05 we reject the Null Hypothesis
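As a quick illustration of the p-value rule above, here is a minimal two-sample
t-test sketch using scipy.stats (the sample numbers are made up for illustration;
only the p < 0.05 decision rule comes from these notes):

from scipy import stats
batch_a = [65, 70, 72, 68, 74, 69, 71]
batch_b = [60, 62, 65, 59, 63, 61, 64]
t_stat, p_value = stats.ttest_ind(batch_a, batch_b)
print("t statistic =", t_stat, ", p value =", p_value)
if p_value < 0.05:
    print("Reject H0 - the two group means differ significantly")
else:
    print("Fail to reject H0")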

import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print(“X: \n”)
#print(X)
#print(“Y: \n”)
#print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, its sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])
print(X)

import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print(“X: \n”)
#print(X)
#print(“Y: \n”)
#print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, its sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [0])], remainder='passthrough')
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
print(X_train)
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)

'''
Regression: Output (Marks) is a continuous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that the dataset is linear) Regression
X - independent variable(s)
Y - dependent variable
'''
import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv"
df = pd.read_csv(link)
X = df.iloc[:,:1].values
y = df.iloc[:,1].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)
”’

# EDA – Exploratory Data Analysis
plt.scatter(x=df['Hours'], y=df['Marks'])
plt.show()
'''
Scatter plots - show the relationship between the X and Y variables. You can have:
1. Positive correlation
2. Negative correlation
3. No correlation
4. Correlation ranges from 0 to +/- 1
5. Correlation value between 0 and +/- 0.5 : weak / no correlation
6. Strong correlation: value will be closer to +/- 1
7. Equation: straight line => y = mx + c
'''
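# To put a number on what the scatter plot shows, the correlation can be computed
# directly from the dataframe (Hours and Marks are the two columns of this dataset):
print("Correlation between Hours and Marks:", df['Hours'].corr(df['Marks']))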
# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’

## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
'''
Regression: Output (Marks) is a continuous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that the dataset is linear) Regression
X - independent variable(s)
Y - dependent variable
'''
import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv"
df = pd.read_csv(link)
X = df.iloc[:,:1].values
y = df.iloc[:,1].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)
”’

# EDA – Exploratory Data Analysis
plt.scatter(x=df['Hours'], y=df['Marks'])
plt.show()
”’
Scatter plots – shows relationship between X and Y variables. You can have:
1. Positive correlation:
2. Negative correlation:
3. No Correlation
4. Correlation: 0 to +/- 1
5. Correlation value: 0 to +/- 0.5 : no correlation
6. Strong correlation value will be closer to +/- 1
7. Equation: straight line => y = mx + c
”’
# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’

## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))
## Bias v Variance
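# Note: in these scripts "Bias" is measured on the training data and "Variance" on the
# test data; a large gap between the two (low training error, high test error) is a
# sign of overfitting, while both being poor indicates underfitting.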

import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)


# EDA – Exploratory Data Analysis
plt.scatter(x=df['Administration'], y=df['Profit'])
plt.show()
plt.scatter(x=df['R&D Spend'], y=df['Profit'])
plt.show()
plt.scatter(x=df['Marketing Spend'], y=df['Profit'])
plt.show()

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’


## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = -3791.2 x Florida -3090.1 x California + 0.82 R&D – 0.05 Admin + 0.022 Marketing+ 56650


y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

”’
Case 1: All the columns are taken into account:
Mean Absolute Error = 8696.887641252619
R Square is (Variance) 0.884599945166969
Root Mean Squared Error (Bias) = 7562.5657508560125
R Square is (Bias) 0.9624157828452926
”’
## Testing

import statsmodels.api as sm
import numpy as np
X = np.array(X, dtype=float)
print("Y:\n", y)
summ1 = sm.OLS(y, X).fit().summary()
print("Summary of All X \n----------------\n:", summ1)

import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)

'''
After doing the Backward Elimination method we realized that the state columns
are not significantly impacting the analysis, hence removing those 2 columns too.
'''
X = X[:,2:] # after backward elimination

# EDA – Exploratory Data Analysis
plt.scatter(x=df['Administration'], y=df['Profit'])
plt.show()
plt.scatter(x=df['R&D Spend'], y=df['Profit'])
plt.show()
plt.scatter(x=df['Marketing Spend'], y=df['Profit'])
plt.show()

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’


## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = -3791.2 x Florida -3090.1 x California + 0.82 R&D – 0.05 Admin + 0.022 Marketing+ 56650


y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

”’
Case 1: All the columns are taken into account:
Mean Absolute Error = 8696.887641252619
R Square is (Variance) 0.884599945166969
Root Mean Squared Error (Bias) = 7562.5657508560125
R Square is (Bias) 0.9624157828452926
”’
## Testing

import statsmodels.api as sm
import numpy as np
X = np.array(X, dtype=float)
#X = X[:,[2,3,4]]
print("Y:\n", y)
summ1 = sm.OLS(y, X).fit().summary()
print("Summary of All X \n----------------\n:", summ1)

## Test for linearity
# 1. All features (X) should be correlated to Y
# 2. Multicollinearity: within X there should not be any correlation between the features;
#    if there is, keep only one of the correlated features for the analysis
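# A quick way to check point 2 (multicollinearity) is the correlation matrix of the
# numeric features; a small sketch using the Startups dataframe (df) loaded above:
print(df[['R&D Spend', 'Administration', 'Marketing Spend']].corr())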

import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/4_Position_Salaries.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,1:2].values
y = df.iloc[:,2].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[3])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)

”’
”’
After doing Backward elemination method we realized that all the state columns
are not significantly impacting the analysis hence removing those 2 columns too.

X = X[:,2:] # after backward elemination
”’
”’
# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.show()
”’

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=100)
print(X_train)

from sklearn.linear_model import LinearRegression
from sklearn import metrics
”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’
”’
#Since dataset is too small, lets take entire data for training
X_train, y_train = X,y
X_test, y_test = X,y
”’
”’
## RUN THE MODEL

regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}”)

# y =
y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output

mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print(“Root Mean Squared Error (Variance) = “,mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(“Mean Absolute Error = “,mae)
print(“R Square is (Variance)”,metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print(“Root Mean Squared Error (Bias) = “,mse**0.5)
print(“R Square is (Bias)”,metrics.r2_score(y_train, y_pred_tr))

# Plotting the data for output
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.plot(X,y_pred)
plt.xlabel(“Level”)
plt.ylabel(“Salary”)
plt.show()
”’

# 3. Model – Polynomial regression analysis
# y = C + m1 * X + m2 * x square
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

for i in range(1,10):
    # prepare the pipeline: polynomial features followed by linear regression
    parameters = [('polynomial', PolynomialFeatures(degree=i)), ('model', LinearRegression())]
    pipe = Pipeline(parameters)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X)

    ## Bias is based on training data
    y_pred_tr = pipe.predict(X_train)
    mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
    rmse_tr = mse ** 0.5
    print("Root Mean Squared Error (Bias) = ", rmse_tr)
    print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

    ## Variance is based on validation data
    y_pred_tt = pipe.predict(X_test)
    mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred_tt)
    rmse_tt = mse ** 0.5
    print("Root Mean Squared Error (Variance) = ", rmse_tt)
    print("R Square is (Variance)", metrics.r2_score(y_test, y_pred_tt))
    print("Difference between variance and bias = ", rmse_tt - rmse_tr)

    # Plotting the data for output
    plt.scatter(x=df['Level'], y=df['Salary'])
    plt.plot(X, y_pred)
    plt.title("Polynomial Analysis degree = " + str(i))
    plt.xlabel("Level")
    plt.ylabel("Salary")
    plt.show()

import pandas as pd
import matplotlib.pyplot as plt
#link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/4_Position_Salaries.csv"
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,0:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)


'''
After doing the Backward Elimination method we realized that the state columns
are not significantly impacting the analysis, hence removing those 2 columns too.

X = X[:,2:] # after backward elimination
'''
'''
# EDA - Exploratory Data Analysis
plt.scatter(x=df['Level'], y=df['Salary'])
plt.show()
'''

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=100)
print(X_train)

from sklearn.linear_model import LinearRegression
from sklearn import metrics
”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’
”’
#Since dataset is too small, lets take entire data for training
X_train, y_train = X,y
X_test, y_test = X,y
”’

## RUN THE MODEL – Support Vector Machine Regressor (SVR)
from sklearn.svm import SVR
#regressor = SVR(kernel='linear')
#regressor = SVR(kernel='poly', degree=2, C=10)
# Assignment - find the best value for gamma: 0.01 to 1 (0.05)
regressor = SVR(kernel="rbf", gamma=0.1, C=10)
# fit – train the model
regressor.fit(X_train, y_train)


# y =
y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output

mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))


# Plotting the data for output
plt.scatter(X_train[:,2],y_pred_tr)
#plt.plot(X_train[:,2],y_pred_tr)
plt.show()

#Decision Tree & Random Forest
import pandas as pd
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
link = "D:\\datasets\\3_Startups.csv"
df = pd.read_csv(link)
print(df)

#X = df.iloc[:,:4].values
X = df.iloc[:,:1].values   # using only the first column (R&D Spend) as the feature here
y = df.iloc[:,-1].values   # Profit (the last column) is the target
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=100)

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

# Baging, Boosting, Ensemble
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10)
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

## Assignment these algorithms and check the RMSE and R square values
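# A minimal sketch for the assignment above - computing RMSE and R Square for the last
# fitted regressor (here the RandomForestRegressor) on the test split:
from sklearn import metrics
rmse = metrics.mean_squared_error(y_test, y_pred) ** 0.5
print("RMSE =", rmse)
print("R Square =", metrics.r2_score(y_test, y_pred))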

# Ridge Lasso Elasticnet
import pandas as pd
link = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/student_scores_multi.csv"
df = pd.read_csv(link)
print(df)
X = df.iloc[:,0:3].values
y = df.iloc[:,3].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.85, random_state=100)

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
lr_ridge = Ridge(alpha=0.8)
lr_ridge.fit(X_train,y_train)
y_ridge_pred = lr_ridge.predict(X_test)

from sklearn.metrics import r2_score
r2_ridge_test = r2_score(y_test, y_ridge_pred)

y_ridge_pred_tr = lr_ridge.predict(X_train)
r2_ridge_train = r2_score(y_train, y_ridge_pred_tr)
print(f"Ridge Regression: Train R2 = {r2_ridge_train} and Test R2 = {r2_ridge_test}")
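# Lasso and ElasticNet (already imported above) can be fitted and compared the same way;
# a short sketch with assumed alpha values - these would need tuning for real use:
lr_lasso = Lasso(alpha=0.8)
lr_lasso.fit(X_train, y_train)
print("Lasso Test R2 =", r2_score(y_test, lr_lasso.predict(X_test)))

lr_en = ElasticNet(alpha=0.8, l1_ratio=0.5)
lr_en.fit(X_train, y_train)
print("ElasticNet Test R2 =", r2_score(y_test, lr_en.predict(X_test)))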

# Classifications algorithm: supervised algo which predicts the class
”’
classifier: algorithm that we develop
model: training and predicting the outcome
features: the input data (columns)
target: class that we need to predict
classification: binary (2 class outcome) or multiclass (more than 2 classes)

Steps to run the model:
1. get the data
2. preprocess the data
3. eda
4. train the model
5. predict the model
6. evaluate the model

”’
#1. Logistic regression
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Build the model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1,X2 = np.meshgrid(np.arange(start = x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
np.arange(start = x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             cmap=ListedColormap(('red','green')))
plt.show()

https://designrr.page/?id=155238&token=545210681&type=FP&h=7849

# Classifications algorithm: supervised algo which predicts the class
”’
classifier: algorithm that we develop
model: training and predicting the outcome
features: the input data (columns)
target: class that we need to predict
classification: binary (2 class outcome) or multiclass (more than 2 classes)

Steps to run the model:
1. get the data
2. preprocess the data
3. eda
4. train the model
5. predict the model
6. evaluate the model

”’
#1. Logistic regression
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Build the model
'''
## LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
'''
from sklearn.svm import SVC
'''
## Support Vector Machine - Classifier
classifier = SVC(kernel='linear')

classifier = SVC(kernel='rbf', gamma=100, C=100)
'''
from sklearn.neighbors import KNeighborsClassifier
## Refer types of distances:
# https://designrr.page/?id=200944&token=2785938662&type=FP&h=7229

classifier = KNeighborsClassifier(n_neighbors=9, metric='minkowski')
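# Note: with metric='minkowski' and the default power parameter p=2 this is the usual
# Euclidean distance; p=1 would give the Manhattan distance.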
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1,X2 = np.meshgrid(np.arange(start = x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
np.arange(start = x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             cmap=ListedColormap(('red','green')))

#Now we will plot training data
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set==j, 0],
                x_set[y_set==j, 1], color=ListedColormap(("red","green"))(i),
                label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)

import sklearn.tree

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Build the model
'''
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="gini")
'''
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=39, criterion="gini")
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1,X2 = np.meshgrid(np.arange(start = x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
np.arange(start = x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             cmap=ListedColormap(('red','green')))

#Now we will plot training data
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set==j, 0],
                x_set[y_set==j, 1], color=ListedColormap(("red","green"))(i),
                label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)

”’
# Show decision tree created

output = sklearn.tree.export_text(classifier)
print(output)
# visualize the tree
fig = plt.figure(figsize=(40,60))
tree_plot = sklearn.tree.plot_tree(classifier)
plt.show()
”’

'''
In Ensemble Algorithms - we run multiple algorithms to improve the performance
for a given business objective:
1. Boosting: the same algorithm is run repeatedly - the input weights vary each round
2. Bagging: the same algorithm is run on different samples - average of all
3. Stacking: different algorithms are run - average (or a meta-model) over all
'''
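# A minimal stacking sketch (not part of the original script) combining two of the
# classifiers used in this tutorial; it assumes the X_train/X_test/y_train/y_test
# variables prepared above:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
stack = StackingClassifier(
    estimators=[('tree', DecisionTreeClassifier()), ('logit', LogisticRegression())],
    final_estimator=LogisticRegression())
stack.fit(X_train, y_train)
print("Stacking accuracy:", stack.score(X_test, y_test))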

import sklearn.tree

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Build the model
'''
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="gini")

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=39, criterion="gini")
'''
from sklearn.ensemble import AdaBoostClassifier
classifier = AdaBoostClassifier(n_estimators=7)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1,X2 = np.meshgrid(np.arange(start = x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
np.arange(start = x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             cmap=ListedColormap(('red','green')))

#Now we will plot training data
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set==j, 0],
                x_set[y_set==j, 1], color=ListedColormap(("red","green"))(i),
                label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)

”’
# Show decision tree created

output = sklearn.tree.export_text(classifier)
print(output)
# visualize the tree
fig = plt.figure(figsize=(40,60))
tree_plot = sklearn.tree.plot_tree(classifier)
plt.show()
”’

”’
In Ensemble Algorithms – we run multiple algorithms to improve the performance
of a given business objective:
1. Boosting: When you run same algorithm – Input varies based on weights
2. Bagging: When you run same algorithm – average of all
3. Stacking: Over different algorithms – average of all
”’

https://designrr.page/?id=36743&token=2022711066&type=FP&h=3547

 

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

 

X,y = make_blobs(n_samples=300, n_features=3, centers=4)
plt.scatter(X[:,0], X[:,1])
plt.show()

 

from sklearn.cluster import KMeans
km = KMeans(n_clusters=5, init="random", max_iter=100)
y_cluster =km.fit_predict(X)

 

plt.scatter(X[y_cluster==0,0], X[y_cluster==0,1], c="blue", label="Cluster A")
plt.scatter(X[y_cluster==1,0], X[y_cluster==1,1], c="red", label="Cluster B")
plt.scatter(X[y_cluster==2,0], X[y_cluster==2,1], c="green", label="Cluster C")
plt.scatter(X[y_cluster==3,0], X[y_cluster==3,1], c="black", label="Cluster D")
plt.scatter(X[y_cluster==4,0], X[y_cluster==4,1], c="orange", label="Cluster E")
plt.show()

 

distortion = []
max_centers = 30
for i in range(1, max_centers):
    km = KMeans(n_clusters=i, init="random", max_iter=100)
    y_cluster = km.fit(X)
    distortion.append(km.inertia_)

 

print("Distortion:\n", distortion)
plt.plot(range(1, max_centers), distortion, marker="o")
plt.show()

 

import pandas as pd
import matplotlib.pyplot as plt
link = "D:\\Datasets\\USArrests.csv"
df = pd.read_csv(link)
#print(df)
X = df.iloc[:,1:]
from sklearn.preprocessing import normalize
data = normalize(X)
data = pd.DataFrame(data)
print(data)

## plotting dendogram
import scipy.cluster.hierarchy as sch
dendo = sch.dendrogram(sch.linkage(data, method='ward'))
plt.axhline(y=0.7, color="red")
plt.show()
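# The dendrogram only suggests how many clusters to cut at; to actually assign cluster
# labels you can follow up with AgglomerativeClustering (a sketch assuming 4 clusters,
# applied to the normalized data computed above):
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')
cluster_labels = hc.fit_predict(data)
print(cluster_labels)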

link = "D:\\datasets\\Market_Basket_Optimisation.csv"
import pandas as pd
df = pd.read_csv(link)
print(df)
from apyori import apriori
transactions = []
for i in range(len(df)):
    if i % 100 == 0:
        print("I = ", i)
    transactions.append([str(df.values[i,j]) for j in range(20)])

## remove nan from the list
print("Transactions:\n", transactions)
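# One way to actually drop the 'nan' strings before mining (a small sketch; note that
# the apriori call below still uses the original transactions list):
transactions_clean = [[item for item in t if item != 'nan'] for t in transactions]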

association_algo = apriori(transactions, min_confidence=0.2, min_support=0.02, min_lift=2)
print("Association = ", list(association_algo))

”’
Time Series Forecasting – ARIMA method

1. Read and visualize the data
2. Stationary series
3. Optimal parameters
4. Build the model
5. Prediction
”’
import pandas as pd
#Step 1: read the data
link = “D:\\datasets\\gitdataset\\AirPassengers.csv”
air_passengers = pd.read_csv(link)

”’
#Step 2: visualize the data
import plotly.express as pe
fig = pe.line(air_passengers,x=”Month”,y=”#Passengers”)
fig.show()
”’
# Cleaning the data
from datetime import datetime
air_passengers[‘Month’] = pd.to_datetime(air_passengers[‘Month’])
air_passengers.set_index(‘Month’,inplace=True)

#converting to time series data
import numpy as np
ts_log = np.log(air_passengers[‘#Passengers’])
#creating rolling period – 12 months
import matplotlib.pyplot as plt
”’
moving_avg = ts_log.rolling(12).mean
plt.plot(ts_log)
plt.plot(moving_avg)
plt.show()
”’
#Step 3: Decomposition into: trend, seasonality, error ( or residual or noise)
”’
Additive decomposition: linear combination of above 3 factors:
Y(t) =T(t) + S(t) + E(t)

Multiplicative decomposition: product of 3 factors:
Y(t) =T(t) * S(t) * E(t)
”’
from statsmodels.tsa.seasonal import seasonal_decompose
decomposed = seasonal_decompose(ts_log,model=“multiplicative”)
decomposed.plot()
plt.show()

# Step 4: Stationary test
”’
To make Time series analysis, the TS should be stationary.
A time series is said to be stationary if its statistical properties
(mean, variance, autocorrelation) doesnt change by a large value
over a period of time.
Types of tests:
1. Augmented Dickey Fuller test (ADH Test)
2. Kwiatkowski Phillips Schnidt Shin (KPSS) test
3. Phillips Perron (PP) Test

Null Hypothesis: The time series is not stationary
Alternate Hypothesis: Time series is stationary
If p < 0.05 we reject the Null Hypothesis (i.e. the series is stationary)
'''
from statsmodels.tsa.stattools import adfuller
result = adfuller(air_passengers['#Passengers'])
print("ADF Stats: \n", result[0])
print("p value = ", result[1])
'''
To reject the Null Hypothesis, result[0] should be less than the 5% critical value
and the p value should be < 0.05
'''

# Run the model
”’
ARIMA model: Auto-Regressive Integrative Moving Average
AR: p predicts the current value
I: d integrative by removing trend and seasonality component from previous period
MA: q represents Moving Average

AIC- Akaike’s Information Criterion (AIC) – helps to find optimal p,d,q values
BIC – Bayesian Information Criterion (BIC) – alternative to AIC
”’
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(air_passengers[‘#Passengers’].diff().dropna())
plot_pacf(air_passengers[‘#Passengers’].diff().dropna())
plt.show()
”’
How to read above graph:
To find q (MA), we look at the Autocorrelation graph and see where there is a drastic change:
here, its at 1, so q = 1 (or 2 as at 2, it goes to -ve)

To find p (AR) – sharp drop in Partial Autocorrelation graph:
here, its at 1, so p = 1 (or 2 as at 2, it goes to -ve)

for d (I) – we need to try with multiple values
intially we will take as 1

”’
”’
Time Series Forecasting – ARIMA method

1. Read and visualize the data
2. Stationary series
3. Optimal parameters
4. Build the model
5. Prediction
”’
import pandas as pd
#Step 1: read the data
link = “D:\\datasets\\gitdataset\\AirPassengers.csv”
air_passengers = pd.read_csv(link)

”’
#Step 2: visualize the data
import plotly.express as pe
fig = pe.line(air_passengers,x=”Month”,y=”#Passengers”)
fig.show()
”’
# Cleaning the data
from datetime import datetime
air_passengers[‘Month’] = pd.to_datetime(air_passengers[‘Month’])
air_passengers.set_index(‘Month’,inplace=True)

#converting to time series data
import numpy as np
ts_log = np.log(air_passengers[‘#Passengers’])
#creating rolling period – 12 months
import matplotlib.pyplot as plt
”’
moving_avg = ts_log.rolling(12).mean
plt.plot(ts_log)
plt.plot(moving_avg)
plt.show()
”’
#Step 3: Decomposition into: trend, seasonality, error ( or residual or noise)
”’
Additive decomposition: linear combination of above 3 factors:
Y(t) =T(t) + S(t) + E(t)

Multiplicative decomposition: product of 3 factors:
Y(t) =T(t) * S(t) * E(t)
”’
from statsmodels.tsa.seasonal import seasonal_decompose
decomposed = seasonal_decompose(ts_log,model=“multiplicative”)
decomposed.plot()
plt.show()

# Step 4: Stationary test
”’
To make Time series analysis, the TS should be stationary.
A time series is said to be stationary if its statistical properties
(mean, variance, autocorrelation) doesnt change by a large value
over a period of time.
Types of tests:
1. Augmented Dickey Fuller test (ADH Test)
2. Kwiatkowski Phillips Schnidt Shin (KPSS) test
3. Phillips Perron (PP) Test

Null Hypothesis: The time series is not stationary
Alternate Hypothesis: Time series is stationary
If p < 0.05 we reject the Null Hypothesis (i.e. the series is stationary)
'''
from statsmodels.tsa.stattools import adfuller
result = adfuller(air_passengers['#Passengers'])
print("ADF Stats: \n", result[0])
print("p value = ", result[1])
'''
To reject the Null Hypothesis, result[0] should be less than the 5% critical value
and the p value should be < 0.05
'''

# Run the model
”’
ARIMA model: Auto-Regressive Integrative Moving Average
AR: p predicts the current value
I: d integrative by removing trend and seasonality component from previous period
MA: q represents Moving Average

AIC- Akaike’s Information Criterion (AIC) – helps to find optimal p,d,q values
BIC – Bayesian Information Criterion (BIC) – alternative to AIC
”’
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(air_passengers[‘#Passengers’].diff().dropna())
plot_pacf(air_passengers[‘#Passengers’].diff().dropna())
plt.show()
”’
How to read above graph:
To find q (MA), we look at the Autocorrelation graph and see where there is a drastic change:
here, its at 1, so q = 1 (or 2 as at 2, it goes to -ve)

To find p (AR) – sharp drop in Partial Autocorrelation graph:
here, its at 1, so p = 1 (or 2 as at 2, it goes to -ve)

for d (I) – we need to try with multiple values
intially we will take as 1

”’
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(air_passengers['#Passengers'], order=(1,1,1))
result = model.fit()
plt.plot(air_passengers['#Passengers'])
plt.plot(result.fittedvalues)
plt.show()
print("ARIMA Model Summary")
print(result.summary())

model = ARIMA(air_passengers['#Passengers'], order=(4,1,4))
result = model.fit()
plt.plot(air_passengers['#Passengers'])
plt.plot(result.fittedvalues)
plt.show()
print("ARIMA Model Summary")
print(result.summary())

# Prediction using ARIMA model
air_passengers['Forecasted'] = result.predict(start=120, end=246)
air_passengers[['#Passengers','Forecasted']].plot()
plt.show()

# predict using SARIMAX Model
import statsmodels.api as sm
model = sm.tsa.statespace.SARIMAX(air_passengers['#Passengers'], order=(7,1,1), seasonal_order=(1,1,1,12))
result = model.fit()
air_passengers['Forecast_SARIMAX'] = result.predict(start=120, end=246)
air_passengers[['#Passengers','Forecast_SARIMAX']].plot()
plt.show()

https://drive.google.com/drive/folders/1Xe3HftLxL1T6HsEBUfjq_zXANjTnr6Cz?usp=drive_link

”’
NLP – Natural Language Processing – analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmetization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
”’
”’
cardboard mousepad. Going worth price! Not bad
”’

link = "https://raw.githubusercontent.com/swapnilsaurav/OnlineRetail/master/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
”’
[‘review_id’, ‘order_id’, ‘review_score’, ‘review_comment_title’,
‘review_comment_message’, ‘review_creation_date’, ‘review_answer_timestamp’]
”’
df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])

df['review_answer_timestamp'] = pd.to_datetime(df['review_answer_timestamp'])

# data preprocessing - making data ready for analysis
reviews_df = df[df['review_comment_message'].notnull()].copy()
#print(reviews_df)
”’
Write a function to perform basic preprocessing steps
”’
def basic_preprocessing(text):
    txt_pp = text.lower()
    print(txt_pp)
    # remove accent (added in the next version of this function)
    return txt_pp

# applying basic preprocessing:
reviews_df['review_comment_message'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)

”’
NLP – Natural Language Processing – analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmetization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
”’
”’
cardboard mousepad. Going worth price! Not bad
”’

link = "D:/datasets/OnlineRetail/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
”’
[‘review_id’, ‘order_id’, ‘review_score’, ‘review_comment_title’,
‘review_comment_message’, ‘review_creation_date’, ‘review_answer_timestamp’]
”’
#df[‘review_creation_date’] = pd.to_datetime(df[‘review_creation_date’])
#df[‘review_answer_timestamp’] = pd.to_datetime(df[‘review_answer_timestamp’])

# data preprocessing – making data ready for analysis
reviews_df = df[df['review_comment_message'].notnull()].copy()
#print(reviews_df)

# remove accents
def remove_accent(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', errors='ignore').decode('utf-8')
# STOP WORDS LIST:
STOP_WORDS = set(remove_accent(w) for w in nltk.corpus.stopwords.words('portuguese'))

”’
Write a function to perform basic preprocessing steps
”’
def basic_preprocessing(text):
    # converting to lower case
    txt_pp = text.lower()
    #print(txt_pp)

    # remove the accent
    txt_pp = remove_accent(txt_pp)
    #print(txt_pp)

    # tokenize
    txt_token = nltk.tokenize.word_tokenize(txt_pp)
    #print(txt_token)

    # removing stop words (keep a list so it can be reused, not a one-shot generator)
    txt_token = [w for w in txt_token if w not in STOP_WORDS and w.isalpha()]
    return txt_token

# applying basic preprocessing:
reviews_df['review_comment_words'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)

# get positive reviews - all 5 ratings in review_score
reviews_5 = reviews_df[reviews_df['review_score']==5]

# get negative reviews - all 1 ratings
reviews_1 = reviews_df[reviews_df['review_score']==1]

## write a function to create unigram, bigram, trigram
def create_ngrams(words):
    unigrams, bigrams, trigrams = [], [], []
    for comment in words:
        unigrams.extend(comment)
        bigrams.extend(' '.join(bigram) for bigram in nltk.bigrams(comment))
        trigrams.extend(' '.join(trigram) for trigram in nltk.trigrams(comment))
    return unigrams, bigrams, trigrams

#create ngrams for rating 5 and rating 1
uni_5, bi_5, tri_5 = create_ngrams(reviews_5['review_comment_words'])
print(uni_5)
print("=========================================")
print(bi_5)
print("=========================================")
print(tri_5)

uni_1, bi_1, tri_1 = create_ngrams(reviews_1['review_comment_words'])
#print(uni_5)

# distribution plot
def plot_dist(words, color):
    nltk.FreqDist(words).plot()

'''
NLP - Natural Language Processing - analysing review comments to understand
the reasons for positive and negative ratings.
Concepts covered: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert the text to lowercase
2. Decompose the characters (non-unicode to unicode)
3. Remove accents: encode the content to ASCII values
4. Tokenization: break each sentence into words
5. Stop words: remove words that are not important for the analysis
6. Lemmatization (done only on English words): convert words into their dictionary form
7. N-grams: sets of one word (unigram), two words (bigram), three words (trigram)
8. Plot the graph based on the number of occurrences and evaluate
'''
'''
Example review comment: cardboard mousepad. Going worth price! Not bad
'''

link = "D:/datasets/OnlineRetail/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
'''
['review_id', 'order_id', 'review_score', 'review_comment_title',
'review_comment_message', 'review_creation_date', 'review_answer_timestamp']
'''
#df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])
#df['review_answer_timestamp'] = pd.to_datetime(df['review_answer_timestamp'])

# data preprocessing - making data ready for analysis
reviews_df = df[df['review_comment_message'].notnull()].copy()
#print(reviews_df)

# remove accents
def remove_accent(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', errors='ignore').decode('utf-8')

# STOP WORDS LIST:
STOP_WORDS = set(remove_accent(w) for w in nltk.corpus.stopwords.words('portuguese'))

'''
Write a function to perform basic preprocessing steps
'''
def basic_preprocessing(text):
    # converting to lower case
    txt_pp = text.lower()
    #print(txt_pp)

    # remove the accent
    #txt_pp = unicodedata.normalize('NFKD', txt_pp).encode('ascii', errors='ignore').decode('utf-8')
    txt_pp = remove_accent(txt_pp)
    #print(txt_pp)

    # tokenize
    txt_token = nltk.tokenize.word_tokenize(txt_pp)
    #print(txt_token)

    # removing stop words
    txt_token = tuple(w for w in txt_token if w not in STOP_WORDS and w.isalpha())
    return txt_token



## write a function to create unigrams, bigrams and trigrams
def create_ngrams(words):
    unigrams, bigrams, trigrams = [], [], []
    for comment in words:
        unigrams.extend(comment)
        bigrams.extend(' '.join(bigram) for bigram in nltk.bigrams(comment))
        trigrams.extend(' '.join(trigram) for trigram in nltk.trigrams(comment))
    return unigrams, bigrams, trigrams


# applying basic preprocessing:
reviews_df['review_comment_words'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)

# get positive reviews - all 5 ratings in review_score
reviews_5 = reviews_df[reviews_df['review_score'] == 5]

# get negative reviews - all 1 ratings
reviews_1 = reviews_df[reviews_df['review_score'] == 1]

# create ngrams for rating 5 and rating 1
uni_5, bi_5, tri_5 = create_ngrams(reviews_5['review_comment_words'])
print(uni_5)
print(bi_5)
print(tri_5)

# Assignment: perform similar tasks for reviews that are negative (review score = 1)
#uni_1, bi_1, tri_1 = create_ngrams(reviews_1['review_comment_words'])
#print(uni_1)

# distribution plot
def plot_dist(words, color):
    nltk.FreqDist(words).plot(20, cumulative=False, color=color)

plot_dist(tri_5, "red")

#NLP - Natural Language Processing:
# sentiments: Positive, Neutral, Negative
#
'''
we will use the nltk library for NLP:
pip install nltk
'''
import nltk
#1. Convert into lowercase
text = "Product is great but I amn't liking the colors as they are worst"
text = text.lower()

'''
2. Tokenize the content: break it into words or sentences
'''
text1 = text.split()
#using nltk
from nltk.tokenize import sent_tokenize, word_tokenize
text = word_tokenize(text)
#print("Text =\n", text)
#print("Text =\n", text1)

'''
3. Removing Stop words: words which are not significant
for your analysis, e.g. an, a, the, is, are
'''
my_stopwords = ['is', 'i', 'the']
# iterate over a copy so that removing items does not skip any words
text1 = list(text)
for w in text1:
    if w in my_stopwords:
        text.remove(w)
print("Text after my stopwords:", text)

nltk.download("stopwords")
from nltk.corpus import stopwords
nltk_eng_stopwords = set(stopwords.words("english"))
#print("NLTK list of stop words in English: ", nltk_eng_stopwords)
'''
Just for example: the word 'but' appears in the NLTK stop words, but
we want to keep it in our analysis, so we remove it from the set
'''
# removing 'but' from the NLTK stop words
nltk_eng_stopwords.remove('but')

# iterate over a copy so that removing items does not skip any words
for w in list(text):
    if w in nltk_eng_stopwords:
        text.remove(w)
print("Text after NLTK stopwords:", text)

'''
4. Stemming: changing the word to its root
e.g. {help: [help, helped, helping, helper]}

One such method is the Porter stemmer
'''
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
text = [stemmer.stem(w) for w in text]
''' the above line is equivalent to:
t_list = []
for w in text:
    a = stemmer.stem(w)
    t_list.append(a)
'''
print("Text after Stemming:", text)
'''
5. Part of Speech Tagging (POS Tagging)
assigns each word the grammatical role it plays in the sentence,
like the 8 parts of speech - noun, verb, ...

Reference: https://www.educba.com/nltk-pos-tag/
POS Tagging will give tags like:

CC: coordinating conjunction
CD: cardinal digit
DT: determiner
EX: existential "there"
FW: foreign word
IN: preposition / subordinating conjunction
JJ: adjective
JJR and JJS: comparative and superlative adjective
LS: list marker
MD: modal
NN: singular noun
NNS, NNP, NNPS: plural noun, proper noun (singular), proper noun (plural)
PDT: predeterminer
WRB: wh-adverb
WP$: possessive wh-pronoun
WP: wh-pronoun
WDT: wh-determiner
VBZ: verb, 3rd person singular present
VBP, VBN, VBG, VBD, VB: other forms of verbs
UH: interjection
TO: the word "to"
RP: particle
RBS, RB, RBR: superlative, base and comparative adverb
PRP, PRP$: personal and possessive pronoun

But to perform this, we need to download a tagger, e.g.:
nltk.download('averaged_perceptron_tagger')
'''
nltk.download('averaged_perceptron_tagger')

import nltk
from nltk.tag import DefaultTagger
# DefaultTagger assigns the same tag ('NN' here) to every token
py_tag = DefaultTagger('NN')
tag_eg1 = py_tag.tag(['Example', 'tag'])
print(tag_eg1)

#txt = "Example of nltk pos tag list"
#txt = ['product', 'great', 'but', 'not', 'like', 'color']
#txt = word_tokenize(txt)
#txt = ['Example', 'of', 'nltk', 'pos', 'tag', 'list']
pos_txt = nltk.pos_tag(text)
print("POS Tagging:", pos_txt)
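'''
pos_tag() returns (word, tag) pairs, so the tags can be used to filter tokens.
A minimal sketch, reusing pos_txt from above, that keeps only the nouns and
adjectives (tags starting with NN or JJ):
'''
nouns_adjs = [word for word, tag in pos_txt if tag.startswith(('NN', 'JJ'))]
print("Nouns/adjectives only:", nouns_adjs)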

'''
6. Lemmatizing
takes a word to its core (dictionary) meaning
We need to download: wordnet
'''
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print("Very good = ", lemmatizer.lemmatize("very good"))
print("Halves = ", lemmatizer.lemmatize("halves"))

text = "Product is great but I amn't liking the colors as they are worst"
text = word_tokenize(text)
text = [lemmatizer.lemmatize(w) for w in text]
print("Text after Lemmatizer: ", text)
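'''
By default, WordNetLemmatizer treats every word as a noun, which is why verb
forms such as "liking" come back unchanged above. Passing the pos argument
('v' for verb, 'a' for adjective) usually returns the dictionary form.
A small sketch:
'''
print(lemmatizer.lemmatize("liking", pos="v"))   # like
print(lemmatizer.lemmatize("better", pos="a"))   # good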


# Sentiment analysis - read the sentiments of each sentence
'''
If you need more data for your analysis, this is a good source:
https://github.com/pycaret/pycaret/tree/master/datasets

We will use amazon.csv for this program
'''
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

link = "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv"
df = pd.read_csv(link)
print(df)

# Let's create a function to perform all the preprocessing steps
# of an NLP analysis
def preprocess_nlp(text):
    #print("0")
    text = text.lower()           #lowercase
    #print("1")
    text = word_tokenize(text)    #tokenize
    #print("2")
    text = [w for w in text if w not in stopwords.words("english")]
    #lemmatize
    #print("3")
    lemm = WordNetLemmatizer()
    #print("4")
    text = [lemm.lemmatize(w) for w in text]
    #print("5")
    # now join all the words back as we are predicting on each line of text
    text_out = ' '.join(text)
    #print("6")
    return text_out
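'''
Quick sanity check of preprocess_nlp() on a single made-up sentence before
applying it to the whole review column - it should come back lowercased,
tokenized, with English stop words removed and the remaining words lemmatized:
'''
sample = "This product is really good and I loved it"
print(preprocess_nlp(sample))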

# import Resource vader_lexicon
import nltk
nltk.download('vader_lexicon')


df['reviewText'] = df['reviewText'].apply(preprocess_nlp)
print(df)

# NLTK Sentiment Analyzer
# we will now define a function get_sentiment() which will return
# 1 for positive and 0 for non-positive
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    sentiment = 1 if score['pos'] > 0 else 0
    return sentiment
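'''
polarity_scores() returns a dict with 'neg', 'neu', 'pos' and 'compound'
values; the rule above only checks whether the positive score is non-zero.
A small sketch on a made-up sentence, plus a common alternative rule that
thresholds the overall 'compound' score instead:
'''
print(analyzer.polarity_scores("great product, but delivery was slow"))
def get_sentiment_compound(text):
    # 1 if the overall (compound) score is positive, else 0
    return 1 if analyzer.polarity_scores(text)['compound'] > 0 else 0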

df['sentiment'] = df['reviewText'].apply(get_sentiment)

print("Dataframe after analyzing the sentiments: \n", df)

#confusion matrix
from sklearn.metrics import confusion_matrix
print("Confusion matrix:\n", confusion_matrix(df['Positive'], df['sentiment']))

''' RESULT

Confusion matrix:
[[ 1131  3636]
 [  576 14657]]
Accuracy: (1131 + 14657) / (1131 + 14657 + 576 + 3636) = 15788/20000 = 78.94%
'''
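'''
The accuracy worked out by hand above can also be computed directly; a minimal
sketch using sklearn's accuracy_score on the same two columns:
'''
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(df['Positive'], df['sentiment']))   # approx 0.79, as computed above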
# Visualization
import matplotlib.pyplot as plt
import numpy as np
data = np.random.randn(1000)
plt.hist(data, bins=30, histtype='stepfilled', color="red")
plt.title("Histogram Display")
plt.xlabel("Marks")
plt.ylabel("Number of Students")
plt.show()
# Analyzing Hotel Bookings data
# https://github.com/swapnilsaurav/Dataset/blob/master/hotel_bookings.csv
link = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/hotel_bookings.csv"
import pandas as pd
df = pd.read_csv(link)
#print("Shape of the data: ", df.shape)
#print("Data types of the columns:", df.dtypes)
import numpy as np
df_numeric = df.select_dtypes(include=[np.number])
#print(df_numeric)
numeric_cols = df_numeric.columns.values
#print("Numeric column names: ", numeric_cols)
df_nonnumeric = df.select_dtypes(exclude=[np.number])
#print(df_nonnumeric)
nonnumeric_cols = df_nonnumeric.columns.values
#print("Non Numeric column names: ", nonnumeric_cols)

####
# preprocessing the data
import seaborn as sns
import matplotlib.pyplot as plt
colors = ["#091AEA", "#EA5E09"]
cols = df.columns
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colors))
plt.show()

cols_to_drop = []
for col in cols:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        #print(f"{col} -> {pct_miss}")
        cols_to_drop.append(col)   # column list to drop

# remove columns since they have more than 80% missing values
df = df.drop(cols_to_drop, axis=1)

for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        print(f"{col} -> {pct_miss}")
    # check each column for missing values and keep an indicator column
    missing = df[col].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:
        df[f'{col}_ismissing'] = missing
        print(f"Created Missing Indicator for {col}")

### keeping track of the missing values
ismissing_cols = [col for col in df.columns if '_ismissing' in col]
df['num_missing'] = df[ismissing_cols].sum(axis=1)
print(df['num_missing'])

# drop rows with > 12 missing values
ind_missing = df[df['num_missing'] > 12].index
df = df.drop(ind_missing, axis=0)   # ROWS DROPPED

# count the remaining missing values
for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        print(f"{col} -> {pct_miss}")

'''
Still we are left with the following missing values:
children -> 2.0498257606219004
babies -> 11.311318858061922
meal -> 11.467129071170085
country -> 0.40879238707947996
deposit_type -> 8.232810615199035
agent -> 13.687005763302507
'''

# Analyzing Hotel Bookings data
# https://github.com/swapnilsaurav/Dataset/blob/master/hotel_bookings.csv
link = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/hotel_bookings.csv"
import pandas as pd
df = pd.read_csv(link)
#print("Shape of the data: ", df.shape)
#print("Data types of the columns:", df.dtypes)
import numpy as np
df_numeric = df.select_dtypes(include=[np.number])
#print(df_numeric)
numeric_cols = df_numeric.columns.values
print("Numeric column names: ", numeric_cols)
df_nonnumeric = df.select_dtypes(exclude=[np.number])
#print(df_nonnumeric)
nonnumeric_cols = df_nonnumeric.columns.values
print("Non Numeric column names: ", nonnumeric_cols)

####
# preprocessing the data
import seaborn as sns
import matplotlib.pyplot as plt
colors = ["#091AEA", "#EA5E09"]
cols = df.columns
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colors))
plt.show()

cols_to_drop = []
for col in cols:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        #print(f"{col} -> {pct_miss}")
        cols_to_drop.append(col)   # column list to drop

# remove columns since they have more than 80% missing values
df = df.drop(cols_to_drop, axis=1)

for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        print(f"{col} -> {pct_miss}")
    # check each column for missing values and keep an indicator column
    missing = df[col].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:
        df[f'{col}_ismissing'] = missing
        #print(f"Created Missing Indicator for {col}")

### keeping track of the missing values
ismissing_cols = [col for col in df.columns if '_ismissing' in col]
df['num_missing'] = df[ismissing_cols].sum(axis=1)
print(df['num_missing'])

# drop rows with > 12 missing values
ind_missing = df[df['num_missing'] > 12].index
df = df.drop(ind_missing, axis=0)   # ROWS DROPPED

# count the remaining missing values
for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        print(f"{col} -> {pct_miss}")

'''
Still we are left with the following missing values:
children -> 2.0498257606219004      # numeric
babies -> 11.311318858061922        # numeric
meal -> 11.467129071170085          # non-numeric
country -> 0.40879238707947996      # non-numeric
deposit_type -> 8.232810615199035   # non-numeric
agent -> 13.687005763302507         # numeric
'''
# HANDLING NUMERIC MISSING VALUES - fill with the column median
df_numeric = df.select_dtypes(include=[np.number])
for col in df_numeric.columns.values:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        med = df[col].median()
        df[col] = df[col].fillna(med)

# HANDLING NON-NUMERIC MISSING VALUES - fill with the most frequent value
df_nonnumeric = df.select_dtypes(exclude=[np.number])
for col in df_nonnumeric.columns.values:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        mode = df[col].describe()['top']
        df[col] = df[col].fillna(mode)
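'''
describe()['top'] returns the most frequent value of a column; pandas also
exposes this directly via mode(), which is an equivalent way to fill the
non-numeric gaps (shown as a commented sketch, not run here):
'''
#df[col] = df[col].fillna(df[col].mode()[0])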


print("#count for missing values")
for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        print(f"{col} -> {pct_miss}")

# drop duplicate values
print("Shape before dropping duplicates: ", df.shape)
df = df.drop('id', axis=1).drop_duplicates()
print("Shape after dropping duplicates: ", df.shape)