# Swapnil Saurav
#
# PART 2: DATA SCIENCE NOV 2023
#
# CLICK HERE TO ACCESS PART 1 OF THE TUTORIAL

# NUMPY
# pip install numpy
import numpy as np

# Build the integers 0..15 and look at them under two different shapes.
nums = np.arange(16).reshape(8, 2)
print(nums)
nums = nums.reshape(4, 4)
print(nums)
print("Shape: Rows = ", nums.shape[0], "and columns = ", nums.shape[1])

# Indexing is [row, column]; negative indices count from the end.
print(nums[1, 2], nums[-3, -2])
print(nums[1])     # 2nd row
print(nums[:, 1])  # ":" selects every row (0th to (n-1)th), so this is the 2nd column
print(nums[-1], nums[:, -2], nums[-1, -2])

# To give your own set of values, provide them as a (nested) list.
l1 = [[1, 5, 7], [2, 4, 9], [1, 1, 3], [3, 3, 2]]
# np.array converts the list into a NumPy array.
mat1 = np.array(l1)
print(mat1)

# Constant-filled arrays; the fill value decides the dtype (2.0 -> float, 9 -> int).
print(np.zeros((3, 3)))
print(np.ones((3, 3)))
print(np.full((5, 7), 2.0))
print(np.full((5, 7), 9))

# eye - identity matrix: square matrix with 1 on its main diagonal
mat1 = np.eye(5)
print(mat1)

# NUMPY
import numpy as np

# To give your own set of values, provide them as a (nested) list.
l1 = [[1, 5, 7], [2, 4, 9], [1, 1, 3], [3, 3, 2]]
# np.array converts the list into a NumPy array.
mat1 = np.array(l1)  # shape: 4 x 3
print(mat1)
l2 = [[2, 3, 4], [2, 1, 2], [5, 2, 3], [3, 2, 2]]
mat2 = np.array(l2)  # shape: 4 x 3
print(mat2)

# Element-wise matrix operations: each operator has an equivalent np function.
print(mat1 + mat2)
print(np.add(mat1, mat2))

print(mat1 - mat2)
print(np.subtract(mat1, mat2))

print(mat1 * mat2)  # element-wise product, NOT matrix multiplication
print(np.multiply(mat1, mat2))

print(mat1 / mat2)
print(np.divide(mat1, mat2))

# Actual matrix multiplication is done using matmul() or the @ operator.
# (4 x 3) @ (3 x 3) -> (4 x 3)
l3 = [[2, 3, 4], [2, 1, 2], [5, 2, 3]]
mat3 = np.array(l3)
print(mat3)
print("Matrix Multiplication")
print(np.matmul(mat1, mat3))
print(mat1 @ mat3)
## calculating determinant and inverse

l4 = [[1, 3, 5], [1, 3, 1], [2, 3, 4]]
mat5 = np.array(l4)
det_mat5 = np.linalg.det(mat5)
print("Determinant of matrix 5 is", det_mat5)
# The inverse exists because the determinant above is non-zero.
print("Inverse of matrix 5 is: \n", np.linalg.inv(mat5))

'''
Linear algebra equations:
     x1 + 5*x2 =  7
  -2*x1 - 7*x2 = -5

Solution: x1 = -8, x2 = 3
'''
coeff_mat = np.array([[1, 5], [-2, -7]])
# var_mat = np.array([[x1], [x2]])
result_mat = np.array([[7], [-5]])
# The system is coeff_mat @ var_mat = result_mat  [scalar analogy: 5 * x = 10],
# which gives var_mat = inverse(coeff_mat) @ result_mat.
det_coeff_mat = np.linalg.det(coeff_mat)
# The determinant is a float, so compare against zero with a tolerance
# instead of an exact != 0 test.
if not np.isclose(det_coeff_mat, 0.0):
    # solve() gives the same answer as inv(coeff_mat) @ result_mat without
    # forming the inverse explicitly.
    var_mat = np.linalg.solve(coeff_mat, result_mat)
    print("X1 = ", var_mat[0, 0])
    print("X2 = ", var_mat[1, 0])
else:
    print("Solution is not possible")

# # scipy = scientific python
# pip install scipy
'''
# Inequality = OPTIMIZATION (MAXIMIZATION / MINIMIZATION) PROBLEM
Computer Parts Assembly:
Laptops & Desktops
profit: 1000, 600
objective: either maximize profit or minimize cost

constraints:
1. Demand: 500, 600
2. Parts: Memory card: 5000 cards available
3. Manpower: 25000 minutes
'''

'''
Optimization using SciPy
Let d = number of desktops and n = number of notebooks.

Constraints:
1.  d +  n <= 10000
2. 2d +  n <= 15000
3. 3d + 4n <= 25000

Profit: 1000 d + 750 n  => maximize
linprog only minimizes, so we minimize -1000 d - 750 n instead.
'''
import numpy as np
from scipy.optimize import minimize, linprog

# d and n are placeholder quantities; the solver below finds the real values.
d = 1
n = 1
profit_d = 1000
profit_n = 750
profit = d * profit_d + n * profit_n
obj = [-profit_d, -profit_n]  # negated profit per unit, because linprog minimizes
lhs_con = [[1, 1], [2, 1], [3, 4]]  # one row of coefficients per constraint
rhs_con = [10000, 15000, 25000]

boundary = [(0, float("inf")),  # boundary condition for # of desktops
            (10, 200000)]       # we just added some limit for notebooks
# "revised simplex" is no longer available in recent SciPy releases;
# "highs" is the supported solver family.
opt = linprog(c=obj, A_ub=lhs_con, b_ub=rhs_con, bounds=boundary, method="highs")
print(opt)
if opt.success:
    print(f"Number of desktops = {opt.x[0]} and number of laptops = {opt.x[1]}")
    # opt.fun is the minimized negative profit, so flip the sign back.
    print("Maximum profit that can be generated = ", -1 * opt.fun)
else:
    print("Solution can not be generated")

### ### ### PANDAS
# Pandas - dataframe which resembles a table structure
# pip install pandas
import pandas as pd

df1 = pd.DataFrame()  # an empty dataframe
print(df1)
print(type(df1))

# fruit production: [name, then three production figures]
data = [["Apple", 15000, 11000, 6000],
        ["Banana", 18000, 22000, 29000],
        ["Mango", 2, 900, 19000],
        ["Guava", 19000, 11000, 25000]]

# Without names, rows and columns are labelled 0, 1, 2, ...
fruit_production = pd.DataFrame(data)
print(fruit_production)
print("Slicing 1:\n")
print(fruit_production.iloc[1:3, 2:])  # based on position (end excluded)
print("Slicing 2:\n")
print(fruit_production.loc[1:3, 2:])   # based on labels (end included)

fruit_production = pd.DataFrame(data,
                                columns=["Fruits", "January", "February", "March"])
print(fruit_production)

fruit_production = pd.DataFrame(data,
                                columns=["Fruits", "January", "February", "March"],
                                index=["Fruit 1", "Fruit 2", "Fruit 3", "Fruit 4"])
print(fruit_production)

## dataframe.loc() dataframe.iloc()

print("Slicing 1:\n")
print(fruit_production.iloc[1:3, 2:])  # based on position
print("Slicing 2:\n")
print(fruit_production.loc[["Fruit 2", "Fruit 3"], ["February", "March"]])  # based on labels (names)

### ###

# pandas
# pip install pandas
import pandas as pd

# Dataframe from a list of rows, with explicit column and row labels.
l1 = [["Sachin", 101, 20000, "BATSMAN"], ["Kapil", 501, 12000, "BOWLER"],
      ["Sunil", 12, 21000, "BATSMAN"], ["Zaheer", 725, 2000, "BOWLER"]]
df1 = pd.DataFrame(l1, columns=["Player", "Wickets", "Runs", "Type"],
                   index=["Player 1", "Player 2", "Player 3", "Player 4"])
print(df1)

# Dataframe from a dictionary: keys become column names.
d1 = {'Apple': [12000, 11000, 13000],
      'Banana': [17000, 18000, 19000],
      'Mango': [11000, 13000, 15000]}
df2 = pd.DataFrame(d1)
print(df2)

# creating dataframe from list of dictionary: one dictionary per row
data1 = [{"Guava": 9000, "Oranges": 5000},
         {"Guava": 8000, "Oranges": 7000},
         {"Guava": 10000, "Oranges": 6000}]
df3 = pd.DataFrame(data1)
print(df3)

print(df3.iloc[0, :])  # first row and all column values
print(df3.iloc[:, 0])  # all rows of the first column

print(df2.iloc[:, 0:2])
print(df2.iloc[[0, 2], [0, 2]])

# Same selections, by label instead of position.
print(df2.loc[[0, 2], ["Apple", "Mango"]])
print(df1.loc[["Player 1", "Player 4"], ["Player", "Runs"]])

df2.iloc[2, 0] = 14000  # overwrite a single cell
print(df2)
print("========= DF1 =============")
df1['Avg'] = df1['Runs'] / df1["Wickets"]  # derived column
print(df1)
print("Reading data from DF1: ")
df4 = df1[df1.Player != 'Sachin']  # filter: works like a WHERE clause
print("\n\n New dataset without Sachin: \n", df4)
df1 = df1.drop("Player", axis=1)  # axis default is 0
# unlike pop() and del - drop() returns a new dataframe
print(df1)


print("Average Wickets of all the players = ", df1['Wickets'].mean())
print("Average Wickets of players by type = \n\n", df1.groupby('Type').mean())
# axis = 0 refers to rows
# axis = 1 refers to columns

print("\n\nDropping columns from DF1: ")
del df1['Wickets']  # dropping column Wickets using del
print(df1)

df1.pop('Runs')  # dropping column using pop
print(df1)
#
#

import pandas as pd

ud_df = pd.read_csv("D:/datasets/gitdataset/user_device.csv")
print(ud_df)  # 272 rows x 6 columns
print("Rows: ", ud_df.shape[0])
print("Columns: ", ud_df.shape[1])

print(ud_df.tail(1))
print(ud_df.head(1))

use_df = pd.read_csv("D:/datasets/gitdataset/user_usage.csv")
print(use_df)  # 240 rows x 4 columns

# Join the usage columns to the device table on use_id, once per join type.
# inner: only ids present in both   [159 rows x 9 columns]
#        (ud_df = 159 matched + 113 unmatched, use_df = 159 matched + 81 unmatched)
# outer: ids present in either table
# left:  every row of use_df
# right: every row of ud_df
usage_cols = ['use_id', 'monthly_mb', 'outgoing_sms_per_month',
              'outgoing_mins_per_month']
for how in ('inner', 'outer', 'left', 'right'):
    result_df = pd.merge(use_df[usage_cols], ud_df, on='use_id', how=how)
    print(result_df)

## Working with Pandas - Example ##
import pandas as pd
import numpy as np

df = pd.read_csv("D:/datasets/gitdataset/hotel_bookings.csv")
print(df.shape)
print(df.dtypes)
'''
numeric - int, float
categorical - 1) Nominal - there is no order 2) Ordinal - here order is important
'''
df_numeric = df.select_dtypes(include=[np.number])
print(df_numeric)

df_object = df.select_dtypes(exclude=[np.number])
print(df_object)  # categorical and date columns

print(df.columns)
# Report the fraction of missing values for every column that has any.
for col in df.columns:
    # isnull() is a boolean column, so its mean is the fraction of missing rows.
    missing = np.mean(df[col].isnull())
    if missing > 0:
        print(f"{col}: {missing}")

'''
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing - making data ready for use
    a. Handle missing values
    b. Feature scaling - scale the values in the column to similar range
    c. Outliers / data correction
    d. Handling categorical data:
        i. Encode the data to convert text to number
            East = 0, North = 1, South = 2, West = 3
        ii. Column Transform into multiple columns
        iii. Delete any one column
4. EDA - Exploratory Data Analysis: to understand the data
5. MODEL BUILDING - Divide the train and test
'''
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
print(df)

'''
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing - making data ready for use
    a. Handle missing values
    b. Feature scaling - scale the values in the column to similar range
    c. Outliers / data correction
    d. Handling categorical data:
        i. Encode the data to convert text to number
            East = 0, North = 1, South = 2, West = 3
        ii. Column Transform into multiple columns
        iii. Delete any one column
4. EDA - Exploratory Data Analysis: to understand the data
5. MODEL BUILDING -
    a. Divide the train and test
    b. Run the model
6. EVALUATE THE MODEL:
    a. Measure the performance of each algorithm on the test data
    b. Metric to compare: based on Regression (MSE, RMSE, R square) or
       classification (confusion matrix - accuracy, sensitivity, ...)
    c. Select the best performing model
7. DEPLOY THE BEST PERFORMING MODEL

Hypothesis test:
1. Null Hypothesis (H0): starting statement (objective)
   Alternate Hypothesis (H1): alternate of H0

Z or T test:
Chi square test: both are categorical

e.g. North zone: 50 WIN 5 LOSS - p = 0.005

# simple (single value) v composite (specifies range)
# two tailed test v one tailed test [H0: mean = 0,
    H1 Left Tailed: mean < 0
    H1 Right Tailed: mean > 0]
# level of significance:
    alpha value: 0.05, i.e. a 95% confidence level
    p value: p value < 0.05 - we reject the Null Hypothesis
'''

import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
# First three columns are the features, the fourth is the target.
X = df.iloc[:, :3].values
y = df.iloc[:, 3].values
# print("X: \n")
# print(X)
# print("Y: \n")
# print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, it is sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means of columns 1-2, then fill their NaNs in place.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
# print(X)

# 2. Handling categorical values
# encoding: replace the text labels in column 0 with integer codes
from sklearn.preprocessing import LabelEncoder

lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])
print(X)

import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
# First three columns are the features, the fourth is the target.
X = df.iloc[:, :3].values
y = df.iloc[:, 3].values
# print("X: \n")
# print(X)
# print("Y: \n")
# print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, it is sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
# print(X)

# 2. Handling categorical values
# encoding: integer codes first, then one indicator column per category
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])

from sklearn.compose import ColumnTransformer

transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:, 1:]  # dropped one indicator column (the rest already determine it)
# print(X)

# 3. splitting it into train and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train)

# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
# NOTE(review): the [:, 3:] slice replaces X_train / X_test with only the
# scaled trailing column(s); the indicator columns and any numeric column
# before index 3 are discarded. Confirm the intended column range.
X_train = scale.fit_transform(X_train[:, 3:])
# Reuse the statistics learned from the training data; re-fitting the scaler
# on the test set would put train and test on different scales.
X_test = scale.transform(X_test[:, 3:])
print(X_train)

'''
Regression: Output (Marks) is a continuous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that dataset is linear) Regression
X - independent variable(s)
Y - dependent variable
'''
import pandas as pd
import matplotlib.pyplot as plt

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv"
df = pd.read_csv(link)
X = df.iloc[:, :1].values  # kept 2-D: sklearn expects a feature matrix
y = df.iloc[:, 1].values

# Steps 1-2 of the usual pipeline (imputing missing values, encoding
# categorical columns) are disabled for this example.

# EDA - Exploratory Data Analysis
plt.scatter(x=df['Hours'], y=df['Marks'])
plt.show()
# Scatter plots show the relationship between X and Y variables. You can have:
# 1. Positive correlation
# 2. Negative correlation
# 3. No correlation
# 4. Correlation: 0 to +/- 1
# 5. Correlation value: 0 to +/- 0.5 : no correlation
# 6. Strong correlation value will be closer to +/- 1
# 7. Equation: straight line => y = mx + c

# 3. splitting it into train and test set
from sklearn.model_selection import train_test_split

# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
print(X_train)

# Step 4 (scaling / normalization) is disabled for this example.

## RUN THE MODEL
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
# fit - train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
'''
Regression: Output (Marks) is a continuous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that dataset is linear) Regression
X - independent variable(s)
Y - dependent variable
'''
import pandas as pd
import matplotlib.pyplot as plt

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv"
df = pd.read_csv(link)
X = df.iloc[:, :1].values  # kept 2-D: sklearn expects a feature matrix
y = df.iloc[:, 1].values

# Steps 1-2 of the usual pipeline (imputing missing values, encoding
# categorical columns) are disabled for this example.

# EDA - Exploratory Data Analysis
plt.scatter(x=df['Hours'], y=df['Marks'])
plt.show()
# Scatter plots show the relationship between X and Y variables. You can have:
# 1. Positive correlation
# 2. Negative correlation
# 3. No correlation
# 4. Correlation: 0 to +/- 1
# 5. Correlation value: 0 to +/- 0.5 : no correlation
# 6. Strong correlation value will be closer to +/- 1
# 7. Equation: straight line => y = mx + c

# 3. splitting it into train and test set
from sklearn.model_selection import train_test_split

# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
print(X_train)

# Step 4 (scaling / normalization) is disabled for this example.

## RUN THE MODEL
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
# fit - train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics

# Errors on the held-out test data (the tutorial labels these "Variance").
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))
## Bias v Variance

import pandas as pd
import matplotlib.pyplot as plt

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
# First four columns are the features, the fifth is the target.
X = df.iloc[:, :4].values
y = df.iloc[:, 4].values

# Step 1 (imputing missing values) is disabled for this example.

# 2. Handling categorical values
# encoding: integer codes first, then one indicator column per category
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

lc = LabelEncoder()
X[:, 3] = lc.fit_transform(X[:, 3])

from sklearn.compose import ColumnTransformer

# The encoded columns come first in the output, followed by the passthrough columns.
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:, 1:]  # dropped one indicator column (the rest already determine it)
print(X)


# EDA - Exploratory Data Analysis
plt.scatter(x=df['Administration'], y=df['Profit'])
plt.show()
plt.scatter(x=df['R&D Spend'], y=df['Profit'])
plt.show()
plt.scatter(x=df['Marketing Spend'], y=df['Profit'])
plt.show()

# 3. splitting it into train and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
print(X_train)

# Step 4 (scaling / normalization) is disabled for this example.


## RUN THE MODEL
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
# fit - train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = -3791.2 x Florida -3090.1 x California + 0.82 R&D - 0.05 Admin + 0.022 Marketing + 56650


y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics

# Errors on the held-out test data (the tutorial labels these "Variance").
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

'''
Case 1: All the columns are taken into account:
Mean Absolute Error = 8696.887641252619
R Square is (Variance) 0.884599945166969
Root Mean Squared Error (Bias) = 7562.5657508560125
R Square is (Bias) 0.9624157828452926
'''
## Testing

import statsmodels.api as sm
import numpy as np

X = np.array(X, dtype=float)
print("Y:\n", y)
# statsmodels' OLS does not add an intercept on its own; without the constant
# column the summary would describe a regression forced through the origin,
# which is not the model LinearRegression fitted above.
summ1 = sm.OLS(y, sm.add_constant(X)).fit().summary()
print("Summary of All X \n----------------\n:", summ1)

import pandas as pd
import matplotlib.pyplot as plt

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
# First four columns are the features, the fifth is the target.
X = df.iloc[:, :4].values
y = df.iloc[:, 4].values

# Step 1 (imputing missing values) is disabled for this example.

# 2. Handling categorical values
# encoding: integer codes first, then one indicator column per category
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

lc = LabelEncoder()
X[:, 3] = lc.fit_transform(X[:, 3])

from sklearn.compose import ColumnTransformer

# The encoded columns come first in the output, followed by the passthrough columns.
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:, 1:]  # dropped one indicator column (the rest already determine it)
print(X)

'''
After doing the backward elimination method we realized that the state columns
are not significantly impacting the analysis, hence removing those 2 columns too.
'''
X = X[:, 2:]  # after backward elimination

# EDA - Exploratory Data Analysis
plt.scatter(x=df['Administration'], y=df['Profit'])
plt.show()
plt.scatter(x=df['R&D Spend'], y=df['Profit'])
plt.show()
plt.scatter(x=df['Marketing Spend'], y=df['Profit'])
plt.show()

# 3. splitting it into train and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
print(X_train)

# Step 4 (scaling / normalization) is disabled for this example.


## RUN THE MODEL
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
# fit - train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# Equation from the earlier all-columns run, kept for comparison:
# y = -3791.2 x Florida -3090.1 x California + 0.82 R&D - 0.05 Admin + 0.022 Marketing + 56650


y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics

# Errors on the held-out test data (the tutorial labels these "Variance").
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

'''
Case 1: All the columns are taken into account:
Mean Absolute Error = 8696.887641252619
R Square is (Variance) 0.884599945166969
Root Mean Squared Error (Bias) = 7562.5657508560125
R Square is (Bias) 0.9624157828452926
'''
## Testing

import statsmodels.api as sm
import numpy as np

X = np.array(X, dtype=float)
# X = X[:, [2, 3, 4]]
print("Y:\n", y)
# statsmodels' OLS does not add an intercept on its own; without the constant
# column the summary would describe a regression forced through the origin,
# which is not the model LinearRegression fitted above.
summ1 = sm.OLS(y, sm.add_constant(X)).fit().summary()
print("Summary of All X \n----------------\n:", summ1)

## Test for linearity
# 1. All features (X) should be correlated to Y
# 2. Multicollinearity: Within X there should not be any correlation,
# if its there then take any one for the analysis

import pandas as pd
import matplotlib.pyplot as plt

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/4_Position_Salaries.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:, 1:2].values  # 1:2 keeps X two-dimensional (a single feature column)
y = df.iloc[:, 2].values

# Disabled for this example: imputing missing values, encoding categorical
# columns, the backward-elimination column slice, the Level-vs-Salary EDA
# scatter plot, and scaling.

# 3. splitting it into train and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
print(X_train)

from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Disabled alternative, since the dataset is small - train and test on all of it:
# X_train, y_train = X, y
# X_test, y_test = X, y

# A plain LinearRegression baseline (fit, metrics, Level-vs-Salary plot) is
# also disabled here in favour of the polynomial sweep below.

# 3. Model - Polynomial regression analysis
# y = C + m1 * X + m2 * X square
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Fit one model per polynomial degree and compare training vs test error.
for i in range(1, 10):
    # prepare the parameters: expand X into polynomial terms, then fit a line
    parameters = [('polynomial', PolynomialFeatures(degree=i)), ('modal', LinearRegression())]
    pipe = Pipeline(parameters)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X)  # predictions over every row, used for the curve below

    ## Bias is based on training data
    y_pred_tr = pipe.predict(X_train)
    mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
    rmse_tr = mse ** 0.5
    print("Root Mean Squared Error (Bias) = ", rmse_tr)
    print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

    ## Variance is based on validation data
    y_pred_tt = pipe.predict(X_test)
    mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred_tt)
    rmse_tt = mse ** 0.5
    print("Root Mean Squared Error (Variance) = ", rmse_tt)
    print("R Square is (Variance)", metrics.r2_score(y_test, y_pred_tt))
    print("Difference Between variance and bias = ", rmse_tt - rmse_tr)

    # Plotting the data for output: raw points plus the fitted curve
    plt.scatter(x=df['Level'], y=df['Salary'])
    plt.plot(X, y_pred)
    plt.title("Polynomial Analysis degree =" + str(i))
    plt.xlabel("Level")
    plt.ylabel("Salary")
    plt.show()

import pandas as pd
import matplotlib.pyplot as plt

# link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/4_Position_Salaries.csv"
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
# First four columns are the features, the fifth is the target.
X = df.iloc[:, 0:4].values
y = df.iloc[:, 4].values

# Step 1 (imputing missing values) is disabled for this example.

# 2. Handling categorical values
# encoding: integer codes first, then one indicator column per category
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

lc = LabelEncoder()
X[:, 3] = lc.fit_transform(X[:, 3])

from sklearn.compose import ColumnTransformer

# The encoded columns come first in the output, followed by the passthrough columns.
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:, 1:]  # dropped one indicator column (the rest already determine it)
print(X)

# Disabled for this example: the backward-elimination slice (X = X[:, 2:]),
# the EDA scatter plot, scaling, and training on the entire dataset.

# 3. splitting it into train and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
print(X_train)

from sklearn.linear_model import LinearRegression
from sklearn import metrics

## RUN THE MODEL - Support Vector Machine Regressor (SVR)
from sklearn.svm import SVR

# regressor = SVR(kernel='linear')
# regressor = SVR(kernel='poly', degree=2, C=10)
# Assignment - Best value for gamma: 0.01 to 1 (0.05)
# NOTE(review): the features are fed to the RBF kernel unscaled (the scaling
# step is disabled); SVR is sensitive to feature scale - confirm this is intended.
regressor = SVR(kernel="rbf", gamma=0.1, C=10)
# fit - train the model
regressor.fit(X_train, y_train)


# y =
y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output

# Errors on the held-out test data (the tutorial labels these "Variance").
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))


# Plotting the data for output: training predictions against feature column 2
plt.scatter(X_train[:, 2], y_pred_tr)
# plt.plot(X_train[:, 2], y_pred_tr)
plt.show()

# Decision Tree & Random Forest
import pandas as pd

# Hosted copy of the same dataset:
# https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv
link = "D:\\datasets\\3_Startups.csv"
df = pd.read_csv(link)
print(df)

# X = df.iloc[:, :4].values
X = df.iloc[:, :1].values   # first column only, kept 2-D for sklearn
# The target is the LAST column. The previous slice [:, :-1] selected every
# column except the target, feature columns included.
y = df.iloc[:, -1].values
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Bagging, Boosting, Ensemble
from sklearn.ensemble import RandomForestRegressor

# Note: regressor and y_pred are re-bound, so the decision-tree results above
# are replaced by the random-forest ones.
regressor = RandomForestRegressor(n_estimators=10)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

## Assignment: run these algorithms and check the RMSE and R square values

# Ridge Lasso Elasticnet
import pandas as pd

link = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/student_scores_multi.csv"
df = pd.read_csv(link)
print(df)
# First three columns are the features, the fourth is the target.
X = df.iloc[:, 0:3].values
y = df.iloc[:, 3].values
from sklearn.model_selection import train_test_split

# NOTE(review): test_size=0.85 leaves only 15% of the rows for training -
# confirm this is deliberate and not meant to be 0.15.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.85, random_state=100)

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

lr_ridge = Ridge(alpha=0.8)
lr_ridge.fit(X_train, y_train)
y_ridge_pred = lr_ridge.predict(X_test)

from sklearn.metrics import r2_score

# R2 on unseen data versus R2 on the data the model was fitted to.
r2_ridge_test = r2_score(y_test, y_ridge_pred)

y_ridge_pred_tr = lr_ridge.predict(X_train)
r2_ridge_train = r2_score(y_train, y_ridge_pred_tr)
print(f"Ridge Regression: Train R2 = {r2_ridge_train} and Test R2={r2_ridge_test}")

# Classifications algorithm: supervised algo which predicts the class
'''
classifier: algorithm that we develop
model: training and predicting the outcome
features: the input data (columns)
target: class that we need to predict
classification: binary (2 class outcome) or multiclass (more than 2 classes)

Steps to run the model:
1. get the data
2. preprocess the data
3. eda
4. train the model
5. predict the model
6. evaluate the model

'''
#1. Logistic regression
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:, 1:4].values   # columns 1..3 are the features
y = df.iloc[:, 4].values     # column 4 is the class to predict

# Encode the first feature column as integers
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Reuse the statistics learned from the training split; calling
# fit_transform here would rescale the test data with its own mean/std.
X_test = sc.transform(X_test)

## Build the model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
# Drop the first feature and refit so the decision regions can be drawn in 2D
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
# Grid covering both remaining features, padded by 1 on each side
X1, X2 = np.meshgrid(
    np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
    np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01),
)
plt.contourf(
    X1, X2,
    classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
    cmap=ListedColormap(('red', 'green')),
)
plt.show()

# https://designrr.page/?id=155238&token=545210681&type=FP&h=7849

# Classifications algorithm: supervised algo which predicts the class
'''
classifier: algorithm that we develop
model: training and predicting the outcome
features: the input data (columns)
target: class that we need to predict
classification: binary (2 class outcome) or multiclass (more than 2 classes)

Steps to run the model:
1. get the data
2. preprocess the data
3. eda
4. train the model
5. predict the model
6. evaluate the model

'''
#1. Load and prepare the data
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
# To read a local copy instead, point link at it, e.g.:
# link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:, 1:4].values   # columns 1..3 are the features
y = df.iloc[:, 4].values     # column 4 is the class to predict

# Encode the first feature column as integers
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Reuse the statistics learned from the training split; calling
# fit_transform here would rescale the test data with its own mean/std.
X_test = sc.transform(X_test)

## Build the model
# Alternatives tried earlier (enable one instead of the KNN classifier below):
#
# ## LOGISTIC REGRESSION
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression()
# classifier.fit(X_train, y_train)
# y_pred = classifier.predict(X_test)
from sklearn.svm import SVC
# ## Support Vector Machine - Classifier
# classifier = SVC(kernel='linear')
# classifier = SVC(kernel='rbf', gamma=100, C=100)
from sklearn.neighbors import KNeighborsClassifier
## Refer types of distances:
# https://designrr.page/?id=200944&token=2785938662&type=FP&h=7229

classifier = KNeighborsClassifier(n_neighbors=9, metric='minkowski')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
# Drop the first feature and refit so the decision regions can be drawn in 2D
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
    np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01),
)
plt.contourf(
    X1, X2,
    classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
    cmap=ListedColormap(('red', 'green')),
)

#Now we will plot training data, one scatter call per class
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(
        x_set[y_set == j, 0],
        x_set[y_set == j, 1],
        color=ListedColormap(("red", "green"))(i),
        label=j,
    )
plt.show()

## Model Evaluation using Confusion Matrix
# y_pred here comes from the two-feature model refitted for the plot above
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)

# Random Forest classifier on the ads dataset, with decision-region plot
# and confusion-matrix evaluation.
import sklearn.tree

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
# To read a local copy instead, point link at it, e.g.:
# link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:, 1:4].values   # columns 1..3 are the features
y = df.iloc[:, 4].values     # column 4 is the class to predict

# Encode the first feature column as integers
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Reuse the statistics learned from the training split; calling
# fit_transform here would rescale the test data with its own mean/std.
X_test = sc.transform(X_test)

## Build the model
# Alternative (single tree):
# from sklearn.tree import DecisionTreeClassifier
# classifier = DecisionTreeClassifier(criterion="gini")
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=39, criterion="gini")
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
# Drop the first feature and refit so the decision regions can be drawn in 2D
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
    np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01),
)
plt.contourf(
    X1, X2,
    classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
    cmap=ListedColormap(('red', 'green')),
)

#Now we will plot training data, one scatter call per class
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(
        x_set[y_set == j, 0],
        x_set[y_set == j, 1],
        color=ListedColormap(("red", "green"))(i),
        label=j,
    )
plt.show()

## Model Evaluation using Confusion Matrix
# y_pred here comes from the two-feature model refitted for the plot above
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)

# Show decision tree created (kept disabled):
#
# output = sklearn.tree.export_text(classifier)
# print(output)
# # visualize the tree
# fig = plt.figure(figsize=(40, 60))
# tree_plot = sklearn.tree.plot_tree(classifier)
# plt.show()

'''
In Ensemble Algorithms - we run multiple algorithms to improve the performance
of a given business objective:
1. Boosting: When you run same algorithm - Input varies based on weights
2. Bagging: When you run same algorithm - average of all
3. Stacking: Over different algorithms - average of all
'''

# AdaBoost classifier on the ads dataset, with decision-region plot
# and confusion-matrix evaluation.
import sklearn.tree

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
# To read a local copy instead, point link at it, e.g.:
# link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:, 1:4].values   # columns 1..3 are the features
y = df.iloc[:, 4].values     # column 4 is the class to predict

# Encode the first feature column as integers
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Reuse the statistics learned from the training split; calling
# fit_transform here would rescale the test data with its own mean/std.
X_test = sc.transform(X_test)

## Build the model
# Alternatives tried earlier:
# from sklearn.tree import DecisionTreeClassifier
# classifier = DecisionTreeClassifier(criterion="gini")
#
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators=39, criterion="gini")
from sklearn.ensemble import AdaBoostClassifier
classifier = AdaBoostClassifier(n_estimators=7)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
# Drop the first feature and refit so the decision regions can be drawn in 2D
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
    np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01),
)
plt.contourf(
    X1, X2,
    classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
    cmap=ListedColormap(('red', 'green')),
)

#Now we will plot training data, one scatter call per class
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(
        x_set[y_set == j, 0],
        x_set[y_set == j, 1],
        color=ListedColormap(("red", "green"))(i),
        label=j,
    )
plt.show()

## Model Evaluation using Confusion Matrix
# y_pred here comes from the two-feature model refitted for the plot above
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)

# Show decision tree created (kept disabled):
#
# output = sklearn.tree.export_text(classifier)
# print(output)
# # visualize the tree
# fig = plt.figure(figsize=(40, 60))
# tree_plot = sklearn.tree.plot_tree(classifier)
# plt.show()

'''
In Ensemble Algorithms - we run multiple algorithms to improve the performance
of a given business objective:
1. Boosting: When you run same algorithm - Input varies based on weights
2. Bagging: When you run same algorithm - average of all
3. Stacking: Over different algorithms - average of all
'''

# https://designrr.page/?id=36743&token=2022711066&type=FP&h=3547


# K-Means clustering on synthetic blobs, followed by an elbow plot of the
# distortion (inertia) for 1..29 clusters.
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

X, y = make_blobs(n_samples=300, n_features=3, centers=4)
plt.scatter(X[:, 0], X[:, 1])
plt.show()

from sklearn.cluster import KMeans
km = KMeans(n_clusters=5, init="random", max_iter=100)
y_cluster = km.fit_predict(X)

# One scatter call per predicted cluster, plotted on the first two features
cluster_styles = [
    ("blue", "Cluster A"),
    ("red", "Cluster B"),
    ("green", "Cluster C"),
    ("black", "Cluster D"),
    ("orange", "Cluster E"),
]
for cluster_id, (colour, name) in enumerate(cluster_styles):
    members = y_cluster == cluster_id
    plt.scatter(X[members, 0], X[members, 1], c=colour, label=name)
plt.show()

# Elbow method: record the inertia for every cluster count
distortion = []
max_centers = 30
for i in range(1, max_centers):
    km = KMeans(n_clusters=i, init="random", max_iter=100)
    # fit() returns the estimator, not labels, so its result is not kept
    km.fit(X)
    distortion.append(km.inertia_)

print("Distortion:\n", distortion)
plt.plot(range(1, max_centers), distortion, marker="o")
plt.show()


# Hierarchical clustering: draw a dendrogram of the normalised data
import pandas as pd
import matplotlib.pyplot as plt

link = "D:\\Datasets\\USArrests.csv"
df = pd.read_csv(link)
#print(df)
X = df.iloc[:, 1:]   # every column except the first
from sklearn.preprocessing import normalize
data = normalize(X)
data = pd.DataFrame(data)
print(data)

## plotting dendrogram
import scipy.cluster.hierarchy as sch
dendo = sch.dendrogram(sch.linkage(data, method='ward'))
# Horizontal cut line at height 0.7
plt.axhline(y=0.7, color="red")
plt.show()

# Association rules (Apriori) over market-basket transactions
link = "D:\\datasets\\Market_Basket_Optimisation.csv"
import pandas as pd
# NOTE(review): if this CSV has no header row, the first basket is consumed
# as column names - confirm and pass header=None if so.
df = pd.read_csv(link)
print(df)
from apyori import apriori

transactions = []
for i, row in enumerate(df.values):
    if i % 100 == 0:
        print("I = ", i)   # progress marker every 100 rows
    ## remove nan from the list: missing cells would otherwise become the
    ## literal item 'nan' in the transaction
    transactions.append([str(item) for item in row if pd.notna(item)])

print("Transactions:\n", transactions)

association_algo = apriori(transactions, min_confidence=0.2, min_support=0.02, min_lift=2)
print("Association = ", list(association_algo))

'''
Time Series Forecasting - ARIMA method

1. Read and visualize the data
2. Stationary series
3. Optimal parameters
4. Build the model
5. Prediction
'''
import pandas as pd
#Step 1: read the data
link = "D:\\datasets\\gitdataset\\AirPassengers.csv"
air_passengers = pd.read_csv(link)

#Step 2: visualize the data (kept disabled, needs plotly)
# import plotly.express as pe
# fig = pe.line(air_passengers, x="Month", y="#Passengers")
# fig.show()

# Cleaning the data: parse the month column and use it as the index
from datetime import datetime
air_passengers['Month'] = pd.to_datetime(air_passengers['Month'])
air_passengers.set_index('Month', inplace=True)

#converting to time series data
import numpy as np
ts_log = np.log(air_passengers['#Passengers'])
#creating rolling period - 12 months (kept disabled)
import matplotlib.pyplot as plt
# moving_avg = ts_log.rolling(12).mean()
# plt.plot(ts_log)
# plt.plot(moving_avg)
# plt.show()

#Step 3: Decomposition into: trend, seasonality, error ( or residual or noise)
'''
Additive decomposition: linear combination of above 3 factors:
Y(t) = T(t) + S(t) + E(t)

Multiplicative decomposition: product of 3 factors:
Y(t) = T(t) * S(t) * E(t)
'''
from statsmodels.tsa.seasonal import seasonal_decompose
decomposed = seasonal_decompose(ts_log, model="multiplicative")
decomposed.plot()
plt.show()

# Step 4: Stationary test
'''
To make Time series analysis, the TS should be stationary.
A time series is said to be stationary if its statistical properties
(mean, variance, autocorrelation) do not change by a large value
over a period of time.
Types of tests:
1. Augmented Dickey Fuller test (ADF Test)
2. Kwiatkowski Phillips Schmidt Shin (KPSS) test
3. Phillips Perron (PP) Test

For the ADF test used below:
Null Hypothesis: The time series is not stationary
Alternate Hypothesis: Time series is stationary
If p < 0.05 we reject Null Hypothesis
'''
from statsmodels.tsa.stattools import adfuller
result = adfuller(air_passengers['#Passengers'])
print("ADF Stats: \n", result[0])
print("p value = ", result[1])
'''
To reject Null hypothesis, result[0] must be less than the 5% critical value
(result[4]['5%']) and p < 0.05
'''

# Run the model
'''
ARIMA model: Auto-Regressive Integrated Moving Average
AR: p is the number of past values used to predict the current value
I: d is the number of times the series is differenced to remove the trend
MA: q is the number of past forecast errors used (Moving Average)

AIC - Akaike's Information Criterion (AIC) - helps to find optimal p,d,q values
BIC - Bayesian Information Criterion (BIC) - alternative to AIC
'''
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# ACF and PACF of the first-differenced series
plot_acf(air_passengers['#Passengers'].diff().dropna())
plot_pacf(air_passengers['#Passengers'].diff().dropna())
plt.show()
'''
How to read above graph:
To find q (MA), we look at the Autocorrelation graph and see where there is a drastic change:
here, it's at 1, so q = 1 (or 2 as at 2, it goes to -ve)

To find p (AR) - sharp drop in Partial Autocorrelation graph:
here, it's at 1, so p = 1 (or 2 as at 2, it goes to -ve)

for d (I) - we need to try with multiple values
initially we will take as 1

'''
'''
Time Series Forecasting - ARIMA method

1. Read and visualize the data
2. Stationary series
3. Optimal parameters
4. Build the model
5. Prediction
'''
import pandas as pd
#Step 1: read the data
link = "D:\\datasets\\gitdataset\\AirPassengers.csv"
air_passengers = pd.read_csv(link)

#Step 2: visualize the data (kept disabled, needs plotly)
# import plotly.express as pe
# fig = pe.line(air_passengers, x="Month", y="#Passengers")
# fig.show()

# Cleaning the data: parse the month column and use it as the index
from datetime import datetime
air_passengers['Month'] = pd.to_datetime(air_passengers['Month'])
air_passengers.set_index('Month', inplace=True)

#converting to time series data
import numpy as np
ts_log = np.log(air_passengers['#Passengers'])
#creating rolling period - 12 months (kept disabled)
import matplotlib.pyplot as plt
# moving_avg = ts_log.rolling(12).mean()
# plt.plot(ts_log)
# plt.plot(moving_avg)
# plt.show()

#Step 3: Decomposition into: trend, seasonality, error ( or residual or noise)
'''
Additive decomposition: linear combination of above 3 factors:
Y(t) = T(t) + S(t) + E(t)

Multiplicative decomposition: product of 3 factors:
Y(t) = T(t) * S(t) * E(t)
'''
from statsmodels.tsa.seasonal import seasonal_decompose
decomposed = seasonal_decompose(ts_log, model="multiplicative")
decomposed.plot()
plt.show()

# Step 4: Stationary test
'''
To make Time series analysis, the TS should be stationary.
A time series is said to be stationary if its statistical properties
(mean, variance, autocorrelation) do not change by a large value
over a period of time.
Types of tests:
1. Augmented Dickey Fuller test (ADF Test)
2. Kwiatkowski Phillips Schmidt Shin (KPSS) test
3. Phillips Perron (PP) Test

For the ADF test used below:
Null Hypothesis: The time series is not stationary
Alternate Hypothesis: Time series is stationary
If p < 0.05 we reject Null Hypothesis
'''
from statsmodels.tsa.stattools import adfuller
result = adfuller(air_passengers['#Passengers'])
print("ADF Stats: \n", result[0])
print("p value = ", result[1])
'''
To reject Null hypothesis, result[0] must be less than the 5% critical value
(result[4]['5%']) and p < 0.05
'''

# Run the model
'''
ARIMA model: Auto-Regressive Integrated Moving Average
AR: p is the number of past values used to predict the current value
I: d is the number of times the series is differenced to remove the trend
MA: q is the number of past forecast errors used (Moving Average)

AIC - Akaike's Information Criterion (AIC) - helps to find optimal p,d,q values
BIC - Bayesian Information Criterion (BIC) - alternative to AIC
'''
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# ACF and PACF of the first-differenced series
plot_acf(air_passengers['#Passengers'].diff().dropna())
plot_pacf(air_passengers['#Passengers'].diff().dropna())
plt.show()
'''
How to read above graph:
To find q (MA), we look at the Autocorrelation graph and see where there is a drastic change:
here, it's at 1, so q = 1 (or 2 as at 2, it goes to -ve)

To find p (AR) - sharp drop in Partial Autocorrelation graph:
here, it's at 1, so p = 1 (or 2 as at 2, it goes to -ve)

for d (I) - we need to try with multiple values
initially we will take as 1

'''
# Fit ARIMA models of two different orders on the passenger series, then
# forecast with ARIMA and SARIMAX. Uses air_passengers and plt from above.
from statsmodels.tsa.arima.model import ARIMA

# First attempt: order (p, d, q) = (1, 1, 1)
model = ARIMA(air_passengers['#Passengers'], order=(1, 1, 1))
result = model.fit()
plt.plot(air_passengers['#Passengers'])
plt.plot(result.fittedvalues)
plt.show()
print("ARIMA Model Summary")
print(result.summary())

# Second attempt: order (4, 1, 4); `result` now refers to this fit
model = ARIMA(air_passengers['#Passengers'], order=(4, 1, 4))
result = model.fit()
plt.plot(air_passengers['#Passengers'])
plt.plot(result.fittedvalues)
plt.show()
print("ARIMA Model Summary")
print(result.summary())

# Prediction using ARIMA model
air_passengers['Forecasted'] = result.predict(start=120, end=246)
air_passengers[['#Passengers', 'Forecasted']].plot()
plt.show()

# predict using SARIMAX Model (adds a seasonal component with period 12)
import statsmodels.api as sm
model = sm.tsa.statespace.SARIMAX(
    air_passengers['#Passengers'],
    order=(7, 1, 1),
    seasonal_order=(1, 1, 1, 12),
)
result = model.fit()
air_passengers['Forecast_SARIMAX'] = result.predict(start=120, end=246)
air_passengers[['#Passengers', 'Forecast_SARIMAX']].plot()
plt.show()

# https://drive.google.com/drive/folders/1Xe3HftLxL1T6HsEBUfjq_zXANjTnr6Cz?usp=drive_link

'''
NLP - Natural Language Processing - analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmatization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
'''
'''
cardboard mousepad. Going worth price! Not bad
'''

link = "https://raw.githubusercontent.com/swapnilsaurav/OnlineRetail/master/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
'''
['review_id', 'order_id', 'review_score', 'review_comment_title',
'review_comment_message', 'review_creation_date', 'review_answer_timestamp']
'''
df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])

df['review_answer_timestamp'] = pd.to_datetime(df['review_answer_timestamp'])

# data preprocessing - making data ready for analysis
# Keep only the reviews that actually have a comment message
reviews_df = df[df['review_comment_message'].notnull()].copy()
#print(reviews_df)
def basic_preprocessing(text):
    """Perform the basic preprocessing steps on one review comment.

    Lowercases ``text`` and strips accents by decomposing it (NFKD) and
    dropping every character that cannot be encoded as ASCII.

    Returns the cleaned string. The value must be returned: the caller uses
    this function with ``Series.apply``, which stores whatever it returns.
    """
    txt_pp = text.lower()
    # remove accent
    txt_pp = (
        unicodedata.normalize('NFKD', txt_pp)
        .encode('ascii', errors='ignore')
        .decode('utf-8')
    )
    return txt_pp

# applying basic preprocessing to every comment, overwriting the column in place
reviews_df['review_comment_message'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)

'''
NLP - Natural Language Processing - analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmatization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
'''
'''
cardboard mousepad. Going worth price! Not bad
'''

link = "D:/datasets/OnlineRetail/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
'''
['review_id', 'order_id', 'review_score', 'review_comment_title',
'review_comment_message', 'review_creation_date', 'review_answer_timestamp']
'''
#df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])
#df['review_answer_timestamp'] = pd.to_datetime(df['review_answer_timestamp'])

# data preprocessing - making data ready for analysis
# Keep only the reviews that actually have a comment message
reviews_df = df[df['review_comment_message'].notnull()].copy()
#print(reviews_df)

# remove accents
def remove_accent(text):
    """Return ``text`` with accents stripped.

    The text is decomposed (NFKD) so accented letters split into a base
    letter plus combining marks; encoding to ASCII with ``errors='ignore'``
    then drops the marks and any other non-ASCII character.
    """
    return unicodedata.normalize('NFKD', text).encode('ascii', errors='ignore').decode('utf-8')
#STOP WORDS LIST: Portuguese stop words with accents removed, so they match
# the accent-stripped tokens produced below
STOP_WORDS = set(remove_accent(w) for w in nltk.corpus.stopwords.words('portuguese'))


def basic_preprocessing(text):
    """Perform the basic preprocessing steps on one review comment.

    Lowercases the text, removes accents, tokenizes it with NLTK, and keeps
    only alphabetic tokens that are not in ``STOP_WORDS``.

    Returns a tuple of tokens. A tuple (not a generator) is returned because
    the tokens are iterated several times when building n-grams; a generator
    would be empty after the first pass.
    """
    #converting to lower case
    txt_pp = text.lower()
    #remove the accent
    txt_pp = remove_accent(txt_pp)
    #tokenize
    txt_token = nltk.tokenize.word_tokenize(txt_pp)
    # removing stop words
    return tuple(w for w in txt_token if w not in STOP_WORDS and w.isalpha())

# applying basic preprocessing: store the tokens in a new column
reviews_df['review_comment_words'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)

#get positive reviews - all 5 ratings in review_score
reviews_5 = reviews_df[reviews_df['review_score'] == 5]

#get negative reviews - all 1 ratings
reviews_1 = reviews_df[reviews_df['review_score'] == 1]

## write a function to create unigram, bigram, trigram
def create_ngrams(words):
    """Build unigram, bigram and trigram lists from tokenized comments.

    ``words`` is an iterable of comments, each an iterable of tokens.
    Returns three lists: all tokens, all space-joined consecutive pairs,
    and all space-joined consecutive triples.
    """
    unigrams, bigrams, trigrams = [], [], []
    for comment in words:
        # Materialise once: the tokens are walked three times below, and a
        # one-shot iterator would be exhausted after the first pass.
        tokens = list(comment)
        unigrams.extend(tokens)
        bigrams.extend(' '.join(pair) for pair in nltk.bigrams(tokens))
        trigrams.extend(' '.join(triple) for triple in nltk.trigrams(tokens))
    return unigrams, bigrams, trigrams

#create ngrams for rating 5 and rating 1
uni_5, bi_5, tri_5 = create_ngrams(reviews_5['review_comment_words'])
print(uni_5)
print('""""""""""""""""""')
print(bi_5)
print(" =========================================")
print(tri_5)

uni_1, bi_1, tri_1 = create_ngrams(reviews_1['review_comment_words'])
#print(uni_5)

# distribution plot
def plot_dist(words, color):
    """Plot the frequency distribution of the 20 most common items in ``words``.

    ``color`` is passed through to the plot; it was previously accepted
    but ignored.
    """
    nltk.FreqDist(words).plot(20, cumulative=False, color=color)

'''
NLP - Natural Language Processing - analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmatization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
'''
'''
cardboard mousepad. Going worth price! Not bad
'''

link = "D:/datasets/OnlineRetail/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
'''
['review_id', 'order_id', 'review_score', 'review_comment_title',
'review_comment_message', 'review_creation_date', 'review_answer_timestamp']
'''
#df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])
#df['review_answer_timestamp'] = pd.to_datetime(df['review_answer_timestamp'])

# data preprocessing - making data ready for analysis
# Keep only the reviews that actually have a comment message
reviews_df = df[df['review_comment_message'].notnull()].copy()
#print(reviews_df)

# remove accents
def remove_accent(text):
    """Return ``text`` with accents stripped.

    The text is decomposed (NFKD) so accented letters split into a base
    letter plus combining marks; encoding to ASCII with ``errors='ignore'``
    then drops the marks and any other non-ASCII character.
    """
    return unicodedata.normalize('NFKD', text).encode('ascii', errors='ignore').decode('utf-8')
#STOP WORDS LIST: Portuguese stop words with accents removed, so they match
# the accent-stripped tokens produced below
STOP_WORDS = set(remove_accent(w) for w in nltk.corpus.stopwords.words('portuguese'))


def basic_preprocessing(text):
    """Perform the basic preprocessing steps on one review comment.

    Lowercases the text, removes accents, tokenizes it with NLTK, and keeps
    only alphabetic tokens that are not in ``STOP_WORDS``.

    Returns the remaining tokens as a tuple.
    """
    #converting to lower case
    txt_pp = text.lower()
    #print(txt_pp)

    #remove the accent
    txt_pp = remove_accent(txt_pp)
    #print(txt_pp)
    #tokenize
    txt_token = nltk.tokenize.word_tokenize(txt_pp)
    #print(txt_token)

    # removing stop words
    txt_token = tuple(w for w in txt_token if w not in STOP_WORDS and w.isalpha())
    return txt_token



## write a function to create unigram, bigram, trigram
def create_ngrams(words):
    """Build unigram, bigram and trigram lists from tokenized comments.

    ``words`` is an iterable of comments, each a sequence of tokens.
    Returns three lists: all tokens, all space-joined consecutive pairs,
    and all space-joined consecutive triples.
    """
    unigrams, bigrams, trigrams = [], [], []
    for comment in words:
        unigrams.extend(comment)
        bigrams.extend(' '.join(bigram) for bigram in nltk.bigrams(comment))
        trigrams.extend(' '.join(trigram) for trigram in nltk.trigrams(comment))
    return unigrams, bigrams, trigrams


# applying basic preprocessing: store the tokens in a new column
reviews_df['review_comment_words'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)

#get positive reviews - all 5 ratings in review_score
reviews_5 = reviews_df[reviews_df['review_score'] == 5]

#get negative reviews - all 1 ratings
reviews_1 = reviews_df[reviews_df['review_score'] == 1]
#create ngrams for rating 5 and rating 1
uni_5, bi_5, tri_5 = create_ngrams(reviews_5['review_comment_words'])
print(uni_5)
print(bi_5)
print(tri_5)

# Assignment: perform similar tasks for reviews that are negative (review score = 1)
#uni_1, bi_1, tri_1 = create_ngrams(reviews_1['review_comment_words'])
#print(uni_5)

# distribution plot
def plot_dist(words, color):
    """Plot the frequency distribution of the 20 most common items in ``words``.

    ``color`` is passed through to the plot.
    """
    nltk.FreqDist(words).plot(20, cumulative=False, color=color)


# Most frequent trigrams in the 5-star reviews
plot_dist(tri_5, "red")

#NLP - Natural Language processing:
# sentiments: Positive, Neutral, Negative
#
'''
we will use nltk library for NLP:
pip install nltk
'''
import nltk
#1. Convert into lowercase
text = "Product is great but I amn't liking the colors as they are worst"
text = text.lower()

'''
2. Tokenize the content: break it into words or sentences
'''
# Plain whitespace split, kept for comparison with the NLTK tokenizer
text1 = text.split()
#using nltk
from nltk.tokenize import sent_tokenize, word_tokenize
text = word_tokenize(text)
#print("Text =\n", text)
#print("Text =\n", text1)

'''
3. Removing Stop words: Words which are not significant
for your analysis. E.g. an, a, the, is, are
'''
my_stopwords = ['is', 'i', 'the']
# Build a new list instead of calling text.remove() inside a loop over the
# same list: removing while iterating skips the element after each removal.
text = [w for w in text if w not in my_stopwords]
print("Text after my stopwords:", text)

nltk.download("stopwords")
from nltk.corpus import stopwords
nltk_eng_stopwords = set(stopwords.words("english"))
#print("NLTK list of stop words in English: ", nltk_eng_stopwords)
'''
Just for example: we see the word but in the STOP WORDS but
we want to include it, then we need to remove the word from the set
'''
# removing but from the NLTK stop words (discard: no error if it is absent)
nltk_eng_stopwords.discard('but')

text = [w for w in text if w not in nltk_eng_stopwords]
print("Text after NLTK stopwords:", text)

'''
4. Stemming: changing the word to its root
eg: {help: [help, helped, helping, helper]}

One of the method is Porter stemmer
'''
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
text = [stemmer.stem(w) for w in text]
''' above line is like below:
t_list=[]
for w in text:
    a = stemmer.stem(w)
    t_list.append(a)
'''
print("Text after Stemming:", text)
'''
5. Part of Speech Tagging (POS Tagging)
Labels each word with its grammatical role,
like the 8 parts of speech - noun, verb, ...

Reference: https://www.educba.com/nltk-pos-tag/
POS Tagging will give Tags like

CC: Coordinating conjunction
CD: Cardinal number
DT: Determiner
EX: Existential "there"
FW: Foreign word
IN: Preposition or subordinating conjunction
JJ: Adjective
JJR and JJS: Comparative and superlative adjective
LS: List item marker
MD: Modal
NN: Singular noun
NNS, NNP, NNPS: Plural noun, proper noun, plural proper noun
PDT: Predeterminer
WRB: Wh-adverb
WP$: Possessive wh-pronoun
WP: Wh-pronoun
WDT: Wh-determiner
VBZ: Verb, 3rd person singular present
VBP, VBN, VBG, VBD, VB: Other forms of verbs
UH: Interjection
TO: "to"
RP: Particle
RBS, RB, RBR: Adverb (superlative, base, comparative)
PRP, PRP$: Personal and possessive pronoun

But to perform this, we need to download any one tagger:
e.g. averaged_perceptron_tagger
nltk.download('averaged_perceptron_tagger')
'''
nltk.download('averaged_perceptron_tagger')

import nltk
from nltk.tag import DefaultTagger
# DefaultTagger assigns the same tag ('NN') to every token
py_tag = DefaultTagger('NN')
tag_eg1 = py_tag.tag(['Example', 'tag'])
print(tag_eg1)

#txt = "Example of nltk pos tag list"
#txt = ['product', 'great', 'but', "not", 'like', 'color']
#txt = word_tokenize(txt)
#txt = ['Example','of','nltk','pos','tag','list']
pos_txt = nltk.pos_tag(text)
print("POS Tagging:", pos_txt)

'''
6. Lemmatising
takes a word to its core meaning
We need to download:  wordnet
'''
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print("Very good = ", lemmatizer.lemmatize("very good"))
print("Halves = ", lemmatizer.lemmatize("halves"))

# Start again from the raw sentence and lemmatize every token
text = "Product is great but I amn't liking the colors as they are worst"
text = word_tokenize(text)
text = [lemmatizer.lemmatize(w) for w in text]
print("Text after Lemmatizer: ", text)


# Sentiment analysis - read the sentiments of each sentence
'''
If you need more data for your analysis, this is a good source:
https://github.com/pycaret/pycaret/tree/master/datasets

We will use Amazon.csv for this program

'''
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

link = "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv"
df = pd.read_csv(link)
print(df)

#Let's create a function to perform all the preprocessing steps
# of a nlp analysis
def preprocess_nlp(text):
    """Clean one piece of review text for sentiment scoring.

    Lowercases the text, tokenizes it, drops English stop words,
    lemmatizes each remaining token and joins the tokens back into a
    single space-separated string, which is returned.
    """
    text = text.lower()  # lowercase
    tokens = word_tokenize(text)  # tokenize
    # Load the stop words once per call as a set; looking them up inside the
    # comprehension reloaded the whole list for every single token.
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stop_words]
    # lemmatize
    lemm = WordNetLemmatizer()
    tokens = [lemm.lemmatize(w) for w in tokens]
    # now join all the words as we are predicting on each line of text
    text_out = ' '.join(tokens)
    return text_out

# Download the vader_lexicon resource (used by SentimentIntensityAnalyzer)
import nltk
nltk.download('vader_lexicon')

# Replace every review with its preprocessed form.
df['reviewText'] = df['reviewText'].apply(preprocess_nlp)
print(df)

# NLTK Sentiment Analyzer
# we will now define a function get_sentiment() which will return
# 1 for positive and 0 for non-positive
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text):
    """Classify ``text`` as positive (1) or non-positive (0).

    Args:
        text: The (preprocessed) text to score.

    Returns:
        1 when the 'pos' entry of ``analyzer.polarity_scores(text)`` is
        greater than 0, otherwise 0.
    """
    score = analyzer.polarity_scores(text)
    return 1 if score['pos'] > 0 else 0

# Score every review and store the 0/1 result in a new column.
df['sentiment'] = df['reviewText'].apply(get_sentiment)

print("Dataframe after analyzing the sentiments: \n", df)

#confusion matrix: 'Positive' column vs. the computed 'sentiment' column
from sklearn.metrics import confusion_matrix
print("Confusion matrix:\n", confusion_matrix(df['Positive'], df['sentiment']))

''' RESULT

Confusion matrix:
[[ 1131 3636]
[ 576 14657]]
Accuracy: (1131 + 14657) / (1131 + 14657 + 576 + 3636) = 15788/20000 = 78.94%
'''
# Visualization: histogram of 1000 samples drawn with np.random.randn
import matplotlib.pyplot as plt
import numpy as np
data = np.random.randn(1000)
plt.hist(data, bins=30, histtype='stepfilled', color="red")
plt.title("Histogram Display")
plt.xlabel("Marks")
plt.ylabel("Number of Students")
plt.show()
# Analyzing Hotel Bookings data
# https://github.com/swapnilsaurav/Dataset/blob/master/hotel_bookings.csv
link = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/hotel_bookings.csv"
import pandas as pd
df = pd.read_csv(link)
#print("Shape of the data: ",df.shape)
#print("Data types of the columns:",df.dtypes)
import numpy as np
# Separate the numeric and the non-numeric column names.
df_numeric = df.select_dtypes(include=[np.number])
#print(df_numeric)
numeric_cols = df_numeric.columns.values
#print("Numeric column names: ",numeric_cols)
df_nonnumeric = df.select_dtypes(exclude=[np.number])
#print(df_nonnumeric)
nonnumeric_cols = df_nonnumeric.columns.values
#print("Non Numeric column names: ",nonnumeric_cols)

####
#preprocessing the data
import seaborn as sns
import matplotlib.pyplot as plt
# Two-colour palette for the heatmap of the isnull() mask.
colors = ["#091AEA", "#EA5E09"]
cols = df.columns
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colors))
plt.show()

# Collect the columns where more than 80% of the values are missing.
cols_to_drop = []
for col in cols:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        #print(f"{col} -> {pct_miss}")
        cols_to_drop.append(col) #column list to drop

# remove column since it has more than 80% missing value
df = df.drop(cols_to_drop, axis=1)

for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        print(f"{col} -> {pct_miss}")
    # check for rows to see the missing values
    missing = df[col].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:
        # Add a boolean indicator column for every column that has gaps.
        df[f'{col}_ismissing'] = missing
        # Report the single column just processed (was `cols`, the whole index).
        print(f"Created Missing Indicator for {col}")

### keeping track of the missing values
ismissing_cols = [col for col in df.columns if '_ismissing' in col]
# Per-row count of missing values, summed over the indicator columns.
df['num_missing'] = df[ismissing_cols].sum(axis=1)
print(df['num_missing'])

# drop rows with > 12 missing values
ind_missing = df[df['num_missing'] > 12].index
df = df.drop(ind_missing, axis=0) # ROWS DROPPED

#count for missing values
for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        print(f"{col} -> {pct_miss}")

'''
Still we are left with following missing values:
children -> 2.0498257606219004
babies -> 11.311318858061922
meal -> 11.467129071170085
country -> 0.40879238707947996
deposit_type -> 8.232810615199035
agent -> 13.687005763302507
'''

# Analyzing Hotel Bookings data
# https://github.com/swapnilsaurav/Dataset/blob/master/hotel_bookings.csv
link = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/hotel_bookings.csv"
import pandas as pd
df = pd.read_csv(link)
#print("Shape of the data: ",df.shape)
#print("Data types of the columns:",df.dtypes)
import numpy as np
# Separate the numeric and the non-numeric column names.
df_numeric = df.select_dtypes(include=[np.number])
#print(df_numeric)
numeric_cols = df_numeric.columns.values
print("Numeric column names: ", numeric_cols)
df_nonnumeric = df.select_dtypes(exclude=[np.number])
#print(df_nonnumeric)
nonnumeric_cols = df_nonnumeric.columns.values
print("Non Numeric column names: ", nonnumeric_cols)

####
#preprocessing the data
import seaborn as sns
import matplotlib.pyplot as plt
# Two-colour palette for the heatmap of the isnull() mask.
colors = ["#091AEA", "#EA5E09"]
cols = df.columns
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colors))
plt.show()

# Collect the columns where more than 80% of the values are missing.
cols_to_drop = []
for col in cols:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        #print(f"{col} -> {pct_miss}")
        cols_to_drop.append(col) #column list to drop

# remove column since it has more than 80% missing value
df = df.drop(cols_to_drop, axis=1)

for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        print(f"{col} -> {pct_miss}")
    # check for rows to see the missing values
    missing = df[col].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:
        # Add a boolean indicator column for every column that has gaps.
        df[f'{col}_ismissing'] = missing
        #print(f"Created Missing Indicator for {col}")

### keeping track of the missing values
ismissing_cols = [col for col in df.columns if '_ismissing' in col]
# Per-row count of missing values, summed over the indicator columns.
df['num_missing'] = df[ismissing_cols].sum(axis=1)
print(df['num_missing'])

# drop rows with > 12 missing values
ind_missing = df[df['num_missing'] > 12].index
df = df.drop(ind_missing, axis=0) # ROWS DROPPED

#count for missing values
for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        print(f"{col} -> {pct_miss}")

'''
Still we are left with following missing values:
children -> 2.0498257606219004 # numeric
babies -> 11.311318858061922 #numeric
meal -> 11.467129071170085 # non-numeric
country -> 0.40879238707947996 # non-numeric
deposit_type -> 8.232810615199035 # non-numeric
agent -> 13.687005763302507 #numeric
'''
#HANDLING NUMERIC MISSING VALUES: fill the gaps with the column median
df_numeric = df.select_dtypes(include=[np.number])
for col in df_numeric.columns.values:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        med = df[col].median()
        df[col] = df[col].fillna(med)

#HANDLING non-NUMERIC MISSING VALUES: fill the gaps with the most frequent value
df_nonnumeric = df.select_dtypes(exclude=[np.number])
for col in df_nonnumeric.columns.values:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        # describe()['top'] is the most common value of the column
        mode = df[col].describe()['top']
        df[col] = df[col].fillna(mode)


print("#count for missing values")
for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        print(f"{col} -> {pct_miss}")

#drop duplicate values
print("Shape before dropping duplicates: ", df.shape)
# NOTE(review): assumes the CSV has an 'id' column; drop() raises KeyError
# otherwise - confirm against the dataset.
df = df.drop('id', axis=1).drop_duplicates()
print("Shape after dropping duplicates: ", df.shape)

 

 

 

DATABRICKS LEARNING
# We will implement Stack operations using List
# Last in First Out

class MyStack:
    """Last-in, first-out stack stored in a Python list (``self.mystack``)."""

    def __init__(self):
        # The end of the list is the top of the stack.
        self.mystack = []

    def add_stack(self, val):
        """Push ``val`` onto the top of the stack."""
        self.mystack.append(val)

    def remove_stack(self):
        """Remove the top value and return it.

        Raises:
            IndexError: If the stack is empty.
        """
        # Return the popped value so callers can use it (it was discarded).
        return self.mystack.pop()

    def print_stack(self):
        """Print the stored values, bottom of the stack first."""
        print("Values in the list are: \n", self.mystack)

# Push five values, then pop twice, printing the stack after each stage.
stack1 = MyStack()
for value in (40, 10, 30, 60, 20):
    stack1.add_stack(value)
stack1.print_stack()
for _ in range(2):
    stack1.remove_stack()
    stack1.print_stack()
PYTHON NOV 2023
'''
print() - displays the content on the screen
functions have () after the name

python commands are case sensitive- Print is not same as print
'''

# idgjdsigjfigj
# comments mean that you are asking computer to ignore them

print(5)
print(5+3)
print('5+3')
print("5+3")
print('5+2*3=',5+2*3,"and 4*3=",4*3)
print("Hello How are you?");print('Hello How are you?')
# print always starts from a new line
# escape sequence: \n (newline) \t for tab spaces
print("How are you doing? \nWhere are you \tgoing?");
# What's your name?
print("What’s your name?")
# He asked me,"What's your name?"
# \" puts a double quote inside a double-quoted string
print("He asked me,\"What’s your name?\"",end="\n")

# He asked me,\"What's your name?\"
# \\ prints one backslash, so \\\" prints \"
print("He asked me,\\\"What’s your name?\\\"",end="\n")
# end= replaces the newline that print() adds by default
print("Hello",end=" – ")
print("How are you?")

print("Basic data types in Python")
# numeric - int (integer)- -99, -4,0,5,888: no decimal values
marks1 = 43

marks2 = 87
print("Marks1 =",marks1)
marks1 = 99
print(marks1)
# function: type() - it gives the datatype
print(type(marks1)) #<class 'int'>

marks = 87.0 # <class 'float'>
print(type(marks))

# complex: square root of -1: j
calc = 3j * 4j
print(calc) # 12 j-square = -12 + 0j
print('Data type of calc = ',type(calc))

# int float complex
a = -55
print(type(a))
a = -55.0
print(type(a))
a = -55j
print(type(a))

# str - string - text
print("HELLO")
name="Sachin"
print(name)
print("type = ",type(name))
name='Virat kohli leads \nbangalore team in IPL'
print(name)
print("type = ",type(name))

# triple quotes (''' or """) allow text that spans several lines
name='''Rohit is the captain
of Indian team
He opens in the ODIs'''
print(name)
print("type = ",type(name))

name="""Rohit led the Indian team
in 2023 ODI World cup and
reached finals"""
print(name)
print("type = ",type(name))

#5th data type - Bool boolean - 2 values: True and False
val1 = True # False
print(type(val1))

# Formatting the print statement
quantity = 12
price = 39
total = quantity * price
print("Total cost of",quantity,"books which costs per copy Rs",price,"will be Rs",total)
# f - string is used to format the output
print(f"Total cost of {quantity} books which costs per copy Rs {price} will be Rs {total}")

# f-string is used to format float values as well
quantity, total = 12, 231.35
price = total/quantity
# :.1f shows the value with one digit after the decimal point
print(f"Total cost of {quantity} books which costs per copy Rs {price:.1f} will be Rs {total}")

# f-string for string values
# :<12 left-aligns in 12 characters, :^10 centres in 10, :>15 right-aligns in 15
name,country,title="Rohit","India","Captain"
print(f"Player {name:<12} plays for {country:^10} and is the {title:>15} of the team")
name,country,title="Mangbwabe","Zimbabwe","Wicket-keeper"
print(f"Player {name:<12} plays for {country:^10} and is the {title:>15} of the team")

### INPUT
## to take input from the user
## input can take no or at max 1 parameter
import math

# input() returns text, so it is converted with int() before doing arithmetic
inp_val = int(input("Enter first number: "))
print(inp_val)
print("Datatype of input=",type(inp_val))
inp_val2 = int(input("Enter second number: "))
print("Sum of two numbers=",inp_val+inp_val2)

## change below programs to accept the values from the user using input

# 1. write a program to calculate area and perimeter of a rectangle
l=50
b=20
area = l*b
peri = 2*(l+b)
print(f"Area and perimeter of a rectangle with length {l} and breadth {b} is {area} and {peri} respectively")
# 2. write a program to calculate area and perimeter of a square
#### Assignment ##
# 3. write a program to calculate volume and surface area of a cone
#### Assignment ##
# 4. write a program to calculate volume and surface area of a cylinder
#### Assignment ##
# 5. write a program to calculate area and circumference of a circle
r=50
# use the library constant; the hard-coded 3.12 was not the value of pi
pi = math.pi
area = pi*r**2
cir = 2*pi*r
print(f"Area and circumference of a circle with radius {r} is {area} and {cir} respectively")
# input() - read input from the user
num1 = int(input("Enter first number:"))
print("type = ",type(num1))
num2 = int(input("Enter second number:"))
print("Sum is ",num1+num2)

# calculate area and perimeter for a rectangle
length=float(input("Enter length of the rectangle:"))
breadth=float(input("Enter breadth of the rectangle:"))
# the area was announced in the heading but never computed
area = length*breadth
print("Area of the rectangle is",area)
perimeter = (length+breadth)*2
print("Perimeter of the rectangle is",perimeter)

# int() -to convert to int
#similarly you can use float(), str() bool() complex()
# operators:
# Arithmetic operators: + - * / ** // % (modulo - remainder)
num1 = 11 #assignment operator = we are assigning value 11 to num1
num2 = 3
print(num1 + num2)
print(num1 - num2)
print(num1 * num2)
print(num1 / num2)
print(num1 ** num2) #power
print(num1 // num2) #integer division
print(num1 % num2) # remainder

## relational operators (comparison)
## > >= < <= == != (is it?)
## output is always bool (True or False)
num1,num2,num3 = 11,9,11
print("Relational : ", num1 > num2) # T
print("Relational : ", num1 >= num3) # T
print("Relational : ", num1 < num2) # F
print("Relational : ", num1 <= num3) # T
print("Relational : ", num1 == num2) # F
print("Relational : ", num1 == num3) # T
print("Relational : ", num1 != num2) # T
print("Relational : ", num1 != num3) # F
print("Relational : ", num1 > num3) # F
print("Relational : ", num1 < num3) # F

# Logical operators: and or not
# input and output are both bool values
'''
Prediction 1: Rohit and Ishan will open the batting
Prediction 2: Rohit or Ishan will open the batting
Actual: Rohit and Gill opened the batting
Prediction 1 False
Prediction 2 True

Truth Table: AND (*)
T and T = T
T and F = F
F and T = F
F and F = F

OR (+)
T or T = T
T or F = T
F or T = T
F or F = F

not T = F
not F = T
'''
num1,num2,num3 = 11,9,11
print( not(num1 > num2 and num1 >= num3 or num1 < num2 or num1 <= num3 and num1 == num2
           and num1 == num3 or num1 != num2 or num1 != num3 and num1 > num3 or num1 < num3))
# `and` is evaluated before `or`; inside the brackets:
# T and T or F or T and F and T or T or F and F or F
# T or F or F or T or F or F
# T
# not(T) = F, so the statement prints False
# int to binary and vice-versa
num1 = 34
print("Binary of num1=",bin(34))
num2 = 0b100010
print("Integer of num2=",int(num2))
print(oct(34)) # 0o42
print(hex(34)) # 0x22

#Bitwise: & (bitwise and) | (bitwise or) >> (right shift) << (left shift)
num1 = 23 #0b10111
num2 = 31 #0b11111
print(bin(num1),"and",bin(num2))
'''
bitwise &
10111
11111
--------
10111
'''
print(int(0b10111)) # 23
print("23 & 31 = ",23 & 31) # 23

'''
bitwise |
10111
11111
--------
11111
'''
print("23 | 31 = ",23 | 31) # 31

'''
THTO
54320
'''
print("23 << 2:",23 << 2) # 92
'''
10111 << 2 = 1011100
'''
print(int(0b1011100))

# this line shifts right, so the label now says >> (it said <<)
print("23 >> 2:",23 >> 2) # 5
'''
10111 >> 2 = 101
'''
print(int(0b101))

# conditions
'''
display message after checking if the student has passed or failed the exam
condition is avg >= 40 to pass

if command checks the condition in Python
syntax:
if condition :
    # perform things when the condition is true


Title
* sub
o ss
i.
ii.
'''
avg =82
if avg >=40:
    print("Congratulations!")
    print("You’ve passed!")

# not indented, so it runs whether or not the condition was true
print("Thank you")
'''
Check avg and print Pass or Fail
'''
avg = 19
if avg >=40:
    print("Pass")
else:
    print("Fail")
# IF - condition - will always result into True or False

num1 = 71.000000001
num2 = 71
# if num1 is greater than num2 then I want to print How are you? otherwise do nothing
if num1 > num2:
    print("How are you?")
    print("Where are you going?")

print("Thank you")

# if num1 is greater than num2 then I want to print How are you? otherwise print Do nothing
if num1 > num2:
    print("How are you?")
    print("Where are you going?")
else:
    print("Do Nothing")
'''
Input a number from the user and check if its +ve, -ve or zero
'''
val = int(input("Enter a number: "))
print("Type of data =",type(val))

# IF - ELIF - ELSE: only the first matching branch runs
if val==0: # == is to check the equality
    print("Its Zero")
elif val < 0:
    print("Its -ve number")
else:
    print("Its +ve number")

# Same check with independent ifs: every condition is tested, so the
# comparisons must be strict (< and >), otherwise 0 is also reported as
# negative and positive.
if val==0:
    print("Its Zero")
if val<0:
    print("Its -ve number")
if val>0:
    print("Its +ve number")
'''
Write a program to take 2 inputs from the user and check if the first
number is greater, smaller or equal to the second one
'''
num1 = int(input("Enter first number: "))
num2 = int(input("Enter second number: "))
if num1 > num2:
    print(num1,"is greater than",num2)
elif num1 < num2:
    print(num1,"is less than",num2)
else:
    print(num1, "and", num2,"are equal")

'''
WAP to take marks in 5 subjects as input, calculate total and average
and assign grade based on below condition:
a. avg 85 - Grade A
b. avg 70-85 - Grade B
c. avg 60-70 - Grade C
d. avg 50-60 - Grade D
e. avg 40 -50 - Grade E
f. avg <40 - Grade F
'''
marks1 = float(input("Enter the marks in subject 1: "))
marks2 = float(input("Enter the marks in subject 2: "))
marks3 = float(input("Enter the marks in subject 3: "))
marks4 = float(input("Enter the marks in subject 4: "))
marks5 = float(input("Enter the marks in subject 5: "))
total = marks1 + marks5 + marks4 + marks3 + marks2
avg = total / 5
print(f"Total marks is {total:.2f} and average is {avg:.2f}")
# The thresholds are tested from highest to lowest, so each elif only
# needs the lower bound of its band.
if avg>=85:
    print("Grade A")
elif avg>=70:
    print("Grade B")
elif avg>=60:
    print("Grade C")
elif avg>=50:
    print("Grade D")
elif avg>=40:
    print("Grade E")
else:
    print("Grade F")

'''
Let’s write a program to read length and breadth from the user
check if its square or rectangle and calculate area and perimeter
'''
length = int(input('Enter the length: '))
breadth = int(input('Enter the breadth: '))
#and & or are logical operator which connects you conditional statements
# and: both the statements need to be true for True else its false
# or: both the statements need to be false for False else its True
if length>0 and breadth >0:
    print("Rectangle and Square both possible")
    if length==breadth:
        print("Square")
        print(f"Area is {length**2} and the perimeter is {4*length}")
    else:
        print("Rectangle")
        print(f"Area is {length * breadth} and the perimeter is {2 * (length+breadth)}")
else:
    print("Neither Rectangle nor Square possible")
'''
check if a number is positive, negative or zero
if the number is -ve, find the square root
if number is positive, check if its 2 digit or not
if 2 digits then interchange the values
otherwise, check if its divisible by 15,
'''

num1 = int(input("Enter a number: "))
if num1<0:
    print("This is negative")
    # ** 0.5 on a negative int gives a complex number in Python 3
    print(f"Square root of {num1} is {num1**0.5}")
elif num1==0:
    print("This is zero")
else:
    print("This is positive")
    if num1>9 and num1<100:
        #interchange the values: eg 35 = 53
        # // 10 gives the tens digit, % 10 gives the units digit
        d = num1 // 10
        r = num1 % 10
        new_num1 = r*10+d
        print(f"{num1} is now made into {new_num1}")

    else:
        if num1 % 15==0: # % mod - will give you remainder
            print("Number is divisible by 15")
        else:
            print("Number is not divisible by 15")

#LOOPS - repeat the given block of code multiple times
# when you know exactly how many times to run - for
# repetition is done based on a certain condition - while

# range(start,end,increment)- generates range of values from start upto end
# by increasing each element by increment
# range(6,18,3): 6,9,12, 15
# range(start,end): increment is default 1
# range(15,19): 15,16,17,18
# range(end): start = 0, increment = 1
# range(6): 0, 1, 2, 3, 4, 5
#print(), input(), type(), int(),str(),complex(),bool(), float()

for var in range(6,18,3):
    print("Hello from the loop!")
    print("Value of var is",var)

for count in range(15,19):
    print("Hello from the loop2!")
    print("Value of var is",count)

for count in range(4):
    print("Hello from the loop3!")
    print("Value of var is",count)

###
for i in range(5):
    print("*",end=" ")
print()
for i in range(1,101):
    print(i,end=", ")
print()
'''
Generate odd numbers between 1 and 30
'''
for i in range(1,30,2):
    print(i,end=", ")
print()
'''
Generate first 10 even numbers
'''
start = 0
for i in range(10):
    print(start,end=", ")
    start=start+2

print()
# for loop examples
'''
Print all the numbers between 1 and 10000 which are perfectly divisible by both 19 and 51
'''
start,end = 1, 10001
num1,num2 = 19,51
for n in range(start,end):
    if n%num1==0 and n%num2==0:
        print(n,end=", ")
print()
'''
Generate prime numbers between 40000 and 42000
'''
start,end = 40000, 42000
for n in range(start,end):
    isPrime = True
    # If n has a divisor, it has one no larger than its square root, so
    # checking up to n//2 did far more divisions than needed.
    for num in range(2,int(n**0.5)+1):
        if n %num==0:
            isPrime = False
            break
    if isPrime:
        print(n,end=", ")
# finish the line of primes before the next section prints
print()
'''
Print different * patterns
'''
for i in range(5):
    print("*")

'''
* * * * *
* * * * *
* * * * *
* * * * *
* * * * *
'''
# outer loop = rows, inner loop = stars in the row
for j in range(5):
    for i in range(5):
        print("*",end=" ")
    print()

'''
*
* *
* * *
* * * *
* * * * *
'''

# row j has j+1 stars
for j in range(5):
    for i in range(1+j):
        print("*",end=" ")
    print()

'''
* * * * *
* * * *
* * *
* *
*
'''

# row j has 5-j stars
for j in range(5):
    for i in range(5-j):
        print("*",end=" ")
    print()
'''
* * * * *
 * * * *
  * * *
   * *
    *
'''

# row j starts with j spaces, followed by 5-j stars
for j in range(5):
    for i in range(j):
        print(" ",end="")
    for i in range(5-j):
        print("*",end=" ")
    print()

”’
Assignment:
*
* *
* * *
* * * *
* * * * *

Solve assignments from the website
”’
## WHILE Loop
'''
WAP to print hello till user says no
'''
# Version 1: loop forever and break out when the user enters n / N
while True:
    print("HELLO 1")
    usr_inp=input("Enter N to stop: ")
    if usr_inp.lower()=="n":
        break
print("====")
# Version 2: ask first, then repeat while the answer is not n / N
usr_inp=input("Enter N to stop: ")
while usr_inp.lower() !='n':
    print("HELLO 2")
    usr_inp = input("Enter N to stop: ")
'''
A company offers dearness allowance (DA) of 40% of basic pay and house
rent allowance (HRA) of 10% of basic pay. Input basic pay of an employee,
calculate his/her DA, HRA and Gross pay (Gross = Basic Pay + DA+ HRA).
a. Modify the above scenario, such that the DA and HRA
percentages are also given as inputs.
b. Update the program such that the program uses a user-defined
function for calculating the Gross pay. The function takes Basic pay,
DA percentage and HRA percentage as inputs and returns the gross pay.
'''
#Case 1: fixed percentages (DA 40%, HRA 10%)
basic_pay = int(input("Enter your basic pay:"))
da = basic_pay *0.4
hra = basic_pay*0.1
gross_pay = basic_pay + da + hra
print("Your gross pay for this month is Rs",gross_pay)

#Case 2: percentages entered by the user and converted to fractions
basic_pay = int(input("Enter your basic pay:"))
da = int(input("Enter the dearness allowance (%): "))
da = da/100
hra = int(input("Enter the House rent allowance (%): "))
hra = hra/100
gross_pay = basic_pay + basic_pay*da + basic_pay*hra
print("Your gross pay for this month is Rs",gross_pay)
#case 3

# defining a user defined function (udf)
# input taken by the function – passing the value
# and anything returned from the function – function returns the output
def calc_gross_pay(bp,da,hra=10):
    """Return the gross pay: basic pay plus DA and HRA.

    Args:
        bp: Basic pay.
        da: Dearness allowance as a percentage of the basic pay.
        hra: House rent allowance as a percentage of the basic pay
            (10 when not given).

    Returns:
        ``bp + bp * da/100 + bp * hra/100``.
    """
    # turn the percentages into fractions
    hra = hra / 100
    da = da / 100
    gross_pay = bp + bp * da + bp * hra
    return gross_pay


basic_pay = int(input("Enter your basic pay:"))
da = int(input("Enter the dearness allowance (%): "))
hra = int(input("Enter the House rent allowance (%): "))

# all three values passed by position
result = calc_gross_pay(basic_pay,da,hra)
print("Your gross pay for this month is Rs",result)

# hra left out, so the function's default value is used
result = calc_gross_pay(basic_pay,da)
print("Your gross pay with default hra for this month is Rs",result)

# keyword arguments can be given in any order
result = calc_gross_pay(da=da,bp=basic_pay,hra=hra)
print("Your gross pay with non-positional for this month is Rs",result)

# required positional arguments
# default (non-required)

'''
You have a monthly income of Rs 1100. Your monthly outgoings are as follows.
• Rent - Rs.500
• Food - Rs.300
• Electricity - Rs.40
• Phone - Rs 60
• Cable TV - Rs 30.
Calculate the Monthly Expenses and the remainder (what’s left over each month).
a. Modify the above program by inputting the income as well as values
for expenses and calculate Monthly expense.
b. Include a function to check whether you will have savings or you
have to borrow money based on the monthly income and total expenses.
The function should print an appropriate message for each case.
'''
#case 1: fixed values
income = 1100
Rent=500
Food=300
Electricity=40
Phone=60
Cable=30
expenses = Rent+Food+Electricity+Phone+Cable
remainder = income-expenses
print("Your expenses for this month is",expenses)
print("You remainder for this month is",remainder)

#case 2: values entered by the user
income = int(input("Enter your Income:"))
Rent= int(input("Enter your rent:"))
Food= int(input("Enter your food expenses:"))
Electricity= int(input("Enter your Electricity charges:"))
Phone= int(input("Enter your Phone expenses:"))
Cable= int(input("Enter your Cable TV expenses:"))
expenses = Rent+Food+Electricity+Phone+Cable
remainder = income-expenses
print("Your expenses for this month is",expenses)
print("You remainder for this month is",remainder)


# case 3
def check_remainder(income,expenses):
    """Print whether the month ends with savings, a shortfall or neither.

    Args:
        income: Income for the month.
        expenses: Total expenses for the month.
    """
    remainder = income-expenses
    if remainder<0:
        # report the shortfall as a positive amount (it was printed negative)
        print(f"You need to borrow Rs {-remainder} for this month")
    elif remainder>0:
        print(f"You have a savings of Rs {remainder} for this month")
    else:
        print("This month you neither have savings nor need to borrow any money")

# Read the income and the individual expenses, then let check_remainder()
# print the outcome.
income = int(input("Enter your Income:"))
Rent= int(input("Enter your rent:"))
Food= int(input("Enter your food expenses:"))
Electricity= int(input("Enter your Electricity charges:"))
Phone= int(input("Enter your Phone expenses:"))
Cable= int(input("Enter your Cable TV expenses:"))
expenses = Rent+Food+Electricity+Phone+Cable
check_remainder(income,expenses)


########## PRACTICE #################

# defining a user defined function (udf)
# input taken by the function – passing the value
# and anything returned from the function – function returns the output
def calc_gross_pay(n1,n2):
    """Practice function: announce the call and return ``n1 + n2``.

    Args:
        n1: First value.
        n2: Second value.

    Returns:
        The sum of the two values.
    """
    print("Hi, I am in calc_gross_pay_function")
    total = n1 + n2
    #print(total)
    return total


val1 = 100
val2 = 150
ret_val = calc_gross_pay(val1,val2) #calling the function pass the value
print("Value returned from the function is",ret_val)
# same function called again with different values
val1 = 10
val2 = 50
result = calc_gross_pay(val1,val2) #calling the function pass the value
print("Value returned from the function is",result)


# Guessing the number game: Computer v Human
# computer will think of the number and we will guess it
import random
num1 = random.randint(1,100)
attempts = 0
fouls = 0
while True:
    guess = int(input("Guess the number between 1 and 100: "))
    if guess<1 or guess>100:
        print("Your guess is outside the valid number range! ",end=" ")
        if fouls==0:
            print("This is your first foul, so you can continue but another foul will make you lose.")
        else:
            print("This is your second foul, sorry you lose.")
            break
        fouls+=1
        # an out-of-range guess is not counted as an attempt
        continue
    attempts+=1
    if num1 == guess:
        print(f"You guessed it right in {attempts} attempts!")
        break
    elif num1 > guess:
        print("Sorry! Its Incorrect! Guess a higher number")
    else:
        print("Sorry! Its Incorrect! Guess a lower number")

#############
# Guessing the number game: Computer v Computer
# computer will think of the number and it will only guess it
import random
import time # date, datetime

start = time.time()
### finding average attempts of running this program
total_attempts = 0
for i in range(10000):
    num1 = random.randint(1,100)
    attempts = 0
    # [low, high] is the range that can still contain the number
    low,high=1,100
    while True:
        # The guess is drawn from [low, high], which never leaves 1..100,
        # so the out-of-range "foul" check of the human version could never
        # trigger here and has been removed.
        guess = random.randint(low,high)
        attempts+=1
        if num1 == guess:
            print(f"You guessed it right in {attempts} attempts!")
            total_attempts+=attempts # a+=b => a = a+b ; a/=c => a =a/c
            break
        elif num1 > guess:
            print("Sorry! Its Incorrect! Guess a higher number")
            low=guess+1
        else:
            print("Sorry! Its Incorrect! Guess a lower number")
            high=guess-1
end = time.time()
print(f"On average this program has taken {total_attempts/10000:.1f} attempts")
print(f"Total time taken by the program to run 10000 times is {end-start} seconds")

#############
# LIST
# collections: list, tuple, sets, dictionary, numpy, pandas
l1 = [10,20,"30",False,"Hello",[1,3,5]]
print("Type of the variable = ",type(l1))
print("Size/Length of the list = ",len(l1))
# read the values of a list:
print(l1[0])

# LIST: ordered mutable linear collection
list1 = [34,"Hello",[2,3,4], True, False, 45]

#indexing - forward
print("First value – ",list1[0])
print("third value – ",list1[2])
list1[0] = 55.5
print(list1)
# backward indexing - right to left
print("Last element – indexed as -1: ",list1[-1])
print("First value – ",list1[-3])
print("1,3,5 values – ",list1[0],list1[2],list1[4])
# slicing: [start:stop:step], stop is not included
print("First to third values – ",list1[0:3],list1[:3])
print("First to third values – ",list1[0:5:2])
print("First to last values – ",list1[:])
print("last three values – ",list1[-3:])

list2 = [3,4,5]
list3 = list1 + list2
print(list3)
list4 = list2*3
print("List4 = ",list4)

## using list in a for loop
for counter in list1:
    print("HELLO : ",counter,"has a data type of",type(counter))


## Properties of a list
l1 = [2,3,4]
l1.pop() #pop without index will remove last element from the list
print("1. l1 = ",l1)
l1.pop(0) #pop will remove the element at the given index
print("2. l1 = ",l1)
l1.append(5) #append always adds the value at the end of the list
print("3. l1 = ",l1)
l1.append(1)
print("4. l1 = ",l1)
l1.append(8)
print("5. l1 = ",l1)
l1.sort() #default it sorts in ascending
print("6. l1 = ",l1)
l1.sort(reverse=True) #will sort in descending
print("7. l1 = ",l1)
## creating duplicate list
l2 = l1 # no copy is made - both variables point to the same list
l3 = l1.copy() # shallow copy - a different list with the same elements
print("11 L1 = ",l1)
print("11 L2 = ",l2)
print("11 L3 = ",l3)
l1.append(33)
l2.append(43)
l3.append(53)
l1.append(3)
print("12 L1 = ",l1)
print("12 L2 = ",l2)
print("12 L3 = ",l3)
# (value, start, stop) - whats the index of the value between start and stop
# start default is 0, stop default is the end of the list
print("Index of 3: ",l1.index(3,3,10))
# REMEMBER: Index will throw error when value not the in list
# count() takes only the value, so slice the list first to count within a range
num= l1.count(3)
print("Number of 3 in the list is",num)
l1_dup = l1[3:11]
num = l1_dup.count(3)
# above 2 statements can be clubbed as one shown below:
num = l1[3:11].count(3)
print("Number of 3 in the given range is",num)
print("List before reverse is: ",l1)
l1.reverse()
print("List after reverse is: ",l1)

# + will perform: c = a+ b
#extend will be like a = a+b
list4 = [11,22,33]
l1.extend(list4)
print("L1 after extend: ",l1)

#pop takes index - remove takes value to remove/delete from the list
l1.remove(3)
# remove() raises an error for a missing value, so check with count() first
cnt = l1.count(18)
if cnt>0:
    l1.remove(18)
print("1. After remove: ",l1)
#append() will always add at the end, insert takes the position also along with the values
# first it takes index, then the value to add
l1.insert(2,32)
print("1. INSERT 1=",l1)
l1.insert(2,42)
print("2. INSERT 2=",l1)
l1.clear() # will clear the data from the list
print("99 List1 = ",l1)
# I want to input marks of 5 students in 5 subjects

students_marks = []

# outer loop = students, inner loop = subjects of one student
for j in range(5):

    all_marks=[]
    for i in range(5):
        marks = int(input("Enter the marks in subject "+str(i+1)+": "))
        all_marks.append(marks)
    print(f"Marks obtained by student {j+1}: {all_marks}")
    # one inner list per student
    students_marks.append(all_marks)
print("Marks obtained by students are:\n",students_marks)

# one inner list per student, one entry per subject (same order as `subjects`)
students_marks=[[66, 55, 77, 88, 99], [45, 65, 76, 78, 98],
                [90, 80, 45, 55, 55], [54, 64, 74, 84, 94],
                [34, 53, 99, 66, 76]]
subjects = ["Maths","Stats","Physics","Programming","SQL"]
for k in range(len(students_marks)):
    total = sum(students_marks[k])
    # The average is taken over this student's subjects; dividing by the
    # number of students only worked because both counts happen to be 5.
    print(f"Total marks obtained by student {k+1} is {total} and average "
          f"is {total/len(students_marks[k])}")
    max_marks = max(students_marks[k])
    # index() gives the position of the highest mark, which selects the subject
    print(f"Highest marks obtained by student {k + 1} is {max_marks} "
          f"in subject {subjects[students_marks[k].index(max_marks)]}")


# TUPLE: linear ordered immutable collection

# tuple declared using ()
t1 = ()
print("Type of t1 = ",type(t1))
# a one-element tuple needs the trailing comma; brackets alone only group
# an expression, as in (5+3)*2
t1 = ("hello",)
print(t1)
print("Type of t1 = ",type(t1))

t1 = (5,4,6,9,1)
print(t1)
print("Type of t1 = ",type(t1))
# indexing is exactly same as list
#t1[0]=8 - 'tuple' object does not support item assignment

for i in t1:
    print("from tuple: ",i)

t1=list(t1) # converting tuple to list
t1=tuple(t1) #converting list to tuple

############
## STRING - str
###########
# there is no difference between declaring string using ' or " quotes
# and there is no difference between ''' and """ strings
# ' or " declares only 1 line of text but ''' and """ can be used
# to declare multi line of text
str1 = "hello"
#str1[0]="H" - 'str' object does not support item assignment
# strings are immutable
# strings are same as list or tuple
# 0 to n-1 indexing and -1 to -n indexing

str2 = 'hi there'

str3 = """How are you?
Where are you?
What are you doing?"""

str4 = '''I am fine
I am here
I am doing nnothing'''
print(type(str1), type(str2),type(str3), type(str4))
print("Str1 \n————")
print(str1)
print("Str2 \n————")
print(str2)
print("Str3 \n————")
print(str3)
print("Str4 \n————")
print(str4)

# upper() returns a new string; str1 itself is not changed
str11=str1.upper()
print(str1,str11)
# * repeats a string and is applied before + joins them
str22 = "Hello " + "There " * 2
print("Str22 = ",str22)
# str are used in for loop exactly same way as list or tuple
for i in str1:
    print("STR = ",i)
# Strings - in python
str1 = "HELLO"
str3 = '''How Are YoU?'''
str2 = "123456"
# Methods named is...() answer the question "is it ...?" with True/False.
print("isupper: ", str1.isupper())
print("islower: ", str3.islower())
print("istitle: ", str3.istitle())
print("isnumeric: ", str2.isnumeric())
print("isalnum: ", str2.isalnum())
# title()/lower()/upper() return converted copies of the string.
print("title: ", str3.title())
print("lower: ", str3.lower())
print("upper: ", str3.upper())

str3 = '''How Are YoU?'''
print("startswith: ", str3.startswith("H"))
print("endswith: ", str3.endswith("?"))
# isalnum() is True only when every character is a letter or a digit.
usname = input("Enter your username (only text and numbers allowed): ")
if usname.isalnum():
    print("Username accepted")
else:
    print("Invalid username!")
num1 = input("Enter length: ")
# isdecimal() accepts exactly the digit characters int() can convert;
# isnumeric() also accepts text such as "²" or "½" that makes int() raise.
if num1.isdecimal():
    num1 = int(num1)
else:
    # NOTE: num1 is left as the raw string when the input is rejected.
    print("Invalid number")

str4 = "abcdefghijklmnopqrstuvwxyz"
# Check, ignoring case, that the first character is A and the last is Z.
str4_upper = str4.upper()  # convert once and reuse for both tests
if str4_upper.startswith("A") and str4_upper.endswith("Z"):
    print("Your condition is true")
else:
    print("Incorrect condition")

# Keep asking until the user types "yes" in any letter case.
while True:
    inp = input("Enter Yes to stop and any key to continue: ")
    # strip() tolerates stray spaces around the answer.
    if inp.strip().title() == "Yes":
        break



str5 = "Enter Yes to stop and any key to continue: "
# split() with no argument breaks on any run of whitespace.
str_words = str5.split()
print(str_words)
# join() takes a list (any iterable of strings) as input.
print("JOIN: ", " ".join(str_words))
str_hyphen = "-:-".join(str_words)
print("New Statement: ", str_hyphen)
# Splitting on the same separator recovers the original words.
str_words = str_hyphen.split("-:-")
print("STR HYPHEN: ", str_words)

str1 = "How are you going?"
# replace() swaps every occurrence by default.
str1 = str1.replace("g", "d")
print("1. Str1 = ", str1)

str1 = "How are you going you?"
# The third argument limits the number of replacements.
str1 = str1.replace("g", "d", 1)
print("2. Str1 = ", str1)

# Is "you" in str1 or not?
# find() returns -1 when the value is not found,
# otherwise the index (0 or greater) of the first match in the given range.
print(str1)
print(str1.upper().find("YOU", 9, 21))

str1 = " How are you going you? "
# strip() removes leading and trailing whitespace only.
print(str1.strip())
# split() followed by join() also collapses repeated inner spaces.
str1 = str1.split()
str1 = " ".join(str1)
print(str1)

######### DICTIONARY ##########
## mutable collection of key: value pairs
## (keys keep their insertion order since Python 3.7)
dict1 = {1: "Hello", "Name": "Sachin", "Runs": 35000}
print(dict1)
print(dict1[1])
print(dict1["Runs"])
print(dict1.values())
print(dict1.keys())
print(dict1.items())

# A dictionary is mutable: start empty and add entries afterwards.
dict1 = {}
print("Type of dictionary: ", type(dict1))

t_dict = {"Name": "Sachin"}
# update() merges the key/value pairs of another dict into this one.
dict1.update(t_dict)
print(dict1)

# Write a program to store marks of 5 subjects along with names.
# master_data maps each student's name to the list of their marks.
master_data = {}
for i in range(3):
    name = input("Enter the student's name: ")
    marks = []
    for j in range(5):
        # int() raises ValueError if the typed text is not a whole number.
        m1 = int(input("Enter the marks in Subject " + str(j + 1) + ": "))
        marks.append(m1)
    # Entering the same name again replaces the earlier marks.
    master_data[name] = marks

print("The details are:\n", master_data)
# Sample result of the marks-entry program:
# {'Sachin': [56, 76, 39, 76, 54], 'Virat': [89, 90, 33, 59, 90], 'Mahi': [88, 77, 99, 88, 99]}
data = {'Sachin': [56, 76, 39, 76, 54],
        'Virat': [89, 90, 33, 59, 90],
        'Mahi': [88, 77, 99, 88, 99]}
# Keys are not indexable directly; turn them into a list first.
print(list(data)[1])

# Iterating a dict yields its keys.
for k in data:
    print(k)

# deep & shallow
data2 = data  # both names point to the same dict object
# Shallow copy: a separate dict object, but its values are still the very
# same list objects as in `data`.
data3 = data.copy()

# The new key shows up in data and data2 (same object) but not in data3.
data2.update({'Rohit': [66, 67, 78, 77, 82]})
print("Data: ", data)
print("Data 2: ", data2)
print("Data 3: ", data3)

# SETS - mutable, unordered collection of unique values

# {} creates an empty dict, so an empty set is written set().
set1 = set()
print("Set1 = ", set1)
set1.add("Apple")
print("Set1 = ", set1)

set1 = {1, 3, 5, 7, 9}
print('0. SET1: ', set1)
set2 = {3, 4, 5, 6, 7}


# Properties of sets: each operation has an operator and a method form.
print("# Union")
print(set1 | set2)
set3 = set1.union(set2)
print(set3)
print("#Intersection")
print(set1 & set2)
set3 = set1.intersection(set2)
print(set3)

# Difference is not symmetric: a - b and b - a give different results.
print("# Difference")
print(set1 - set2)
set3 = set1.difference(set2)
print(set3)
print(set2 - set1)
set3 = set2.difference(set1)
print(set3)
print("#Symmetric difference")
print(set1 ^ set2)
set3 = set1.symmetric_difference(set2)
print(set3)

# remove() deletes a given value and raises KeyError if it is missing.
set1.remove(7)
print("1. Set1:", set1)
# pop() removes an arbitrary element because sets have no positions.
set1.pop()
print("2. Set1:", set1)
set1.clear()
print("3. Set1:", set1)

### List, Tuple, Set - they can be converted into each other's format
l1 = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]
# A set drops the duplicates but has no defined order, so sort while
# converting back to get a predictable list.
l1 = sorted(set(l1))
print(l1)

###########
## Functions: user defined functions
def whatever():
    """Print three greeting lines; takes no arguments and returns None."""
    for line in ("Hello", "Hello 2", "hello 3"):
        print(line)


whatever()

## 4 types of function arguments:
# required positional parameters
# default parameters
# keyword arguments
# variable-length parameters (*args, **kwargs)
# Functions
”’
Write a function to check if the number is prime or not
and use this to generate prime numbers
”’
def check_prime(val=53):
    """
    Check whether a number is prime.

    This is a user defined function which takes a value as input and
    reports whether it is a prime number.
    Written by Sachin Kohli.

    :param val: integer to test (defaults to 53)
    :return: True if ``val`` is prime, otherwise False
    """
    # 0, 1 and negative numbers are not prime; without this guard the
    # loop below never runs for them and they would be reported as prime.
    if val < 2:
        return False
    # A composite number always has a divisor no larger than its square
    # root, so trial division can stop once i * i exceeds val.
    i = 2
    while i * i <= val:
        if val % i == 0:
            return False
        i += 1
    return True
# required positional argument
# default
# keyword

# check if val1 is greater than val2 then subtract
#otherwise add them
#SyntaxError: non-default argument follows default argument
def myfunction(val1, val2=50):
    """
    Subtract val2 from val1 when val1 is greater, otherwise add them.

    The outcome is printed and also returned so callers can reuse it.

    :param val1: first number (required)
    :param val2: second number (defaults to 50)
    :return: ``val1 - val2`` if ``val1 > val2``, else ``val1 + val2``
    """
    print(f"input values are: {val1} and {val2}")
    if val1 > val2:
        result = val1 - val2
        print("Subtraction = ", result)
    else:
        result = val1 + val2
        print("Addition = ", result)
    return result

# Write a function to add all the given numbers.
# * against a parameter makes it collect positional values as a tuple.
# ** against a parameter makes it collect keyword values as a dictionary.
def add_all_num(*values, **data):
    """
    Add all positional numbers and show how *args / **kwargs arrive.

    :param values: any number of numeric positional arguments
    :param data: any number of keyword arguments (only printed)
    :return: sum of ``values`` (0 when none are given)
    """
    print("add_all_num: values Values passed are: ", values)
    print("add_all_num: **data Values passed are: ", data)
    return sum(values)

if __name__ == "__main__":  # true only when this file is run directly
    res = check_prime(41)
    myfunction(43, 33)
    # Keyword arguments: use the exact parameter names, in any order.
    myfunction(val2=10, val1=20)
    res = check_prime()  # falls back to the default value
    add_all_num(5, 6, 10, 12, 15, name="Sachin", game="Cricket", runs=50000)

    # Generate prime numbers between 10,000 and 15,000.
    for num in range(10000, 15001):
        res = check_prime(num)
        if res:
            print(num, end=", ")
    print()

    print("#################")
    # help() prints the documentation itself and returns None, so wrapping
    # it in print() would add a stray "None" line.
    help(input)
    print("--------------------")

    # The same text is available as the function's __doc__ attribute.
    print(input.__doc__)
    print("#################")
    print(check_prime.__doc__)

else:
    # Reached when this file is imported by another module.
    print("Thanks for using my program")
# Class and Objects

class Books:
    """A book with a title, an author and a cost.

    Functions that are part of a class are called methods. Members of a
    class can be variables and methods, at object level or class level.
    """

    # Class-level member: shared by all objects, counts created books.
    total_books = 0

    def __init__(self, title, author, price):
        """Store the book details and count the new book."""
        # Object-level members: each object has its own copy.
        self.title = title
        self.author = author
        self.cost = price
        Books.total_books += 1

    def print_info(self):
        """Print the title, author and cost of this book."""
        print("Title of the book is", self.title)
        print("Author of the book is", self.author)
        print("Cost of the book is", self.cost)

    @classmethod
    def print_total(cls):
        """Print how many books have been created so far."""
        print("Total books in the library are", cls.total_books)


class Library:
    """A library identified by its name, location and pin code."""

    # Class-level member: counts the Library objects created.
    total_lib = 0

    def __init__(self, name, loc, pincode):
        """Store the library details and count the new library."""
        self.name = name
        self.location = loc
        self.pin = pincode
        Library.total_lib += 1

    def print_info(self):
        """Print the name, location and pin code of this library."""
        print("Library: ", self.name)
        # Separate location and pin so they do not run together.
        print(f"Location: {self.location} - {self.pin}")

if __name__ == "__main__":
    # CREATE OBJECTS OF CLASS BOOKS
    # Creating an object calls __init__() automatically.
    book1 = Books("Python Programming", "Sachin", 399)
    book2 = Books("SQL Programming", "Virat", 299)
    book3 = Books("Machine Learning", "Rohit", 499)
    book2.print_info()

    # A classmethod can be called on the class or on any object of it.
    for owner in (Books, book1, book2, book3):
        owner.print_total()

###############################
###### Another file
###############################
# prog1 is the file that defines the Books and Library classes; importing
# it makes them available as prog1.Books and prog1.Library.
import prog1

b1 = prog1.Books("Data Analytics", "Saurav", 298)
b1.print_info()
l1 = prog1.Library("ABC International Library", "Hyderabad", 500081)
l1.print_info()

'''
Create a class called MyMathOps and add functionalities for
Addition, Subtraction, Power, Multiplication and Division.
You should have the following methods:
1. __init__() – to get 2 values
2. calc_add() – perform addition
3. display_add() – to print the sum
4. calc_sub() – perform subtraction
5. display_sub() – to print the difference
6. calc_power() – perform exponentiation
7. display_power() – to print the power
8. calc_mul() – perform multiplication
9. display_mul() – to print the product
10. calc_div() – perform division
11. display_div() – to print the quotient
'''
”’
Properties of class & objects:
1. Encapsulation
2. Inheritance
3. Polymorphism
4. Abstraction

#Accessibility: public (var), private (__var), protected (_var)
”’
#magazines
class LibraryContent:
    """Base class for items kept in the library (title and cost)."""

    def __init__(self, title, price):
        """Store the details common to every kind of content."""
        self.title = title
        self.cost = price

    def __print_data(self):
        """Private method: the leading double underscore mangles its name."""
        print("data from Library Content")

    def print_info(self):
        """Default info printer; child classes override it."""
        print("info from Library content")

    def display_something(self):
        """Inherited unchanged by the child classes."""
        print("Do Nothing")


class Magazines(LibraryContent):
    """A magazine: library content identified by an ISSN."""

    # Class-level count of the Magazines objects created.
    total_mags = 0

    def __init__(self, title, issn, price):
        """Store the magazine details and count the new magazine."""
        super().__init__(title, price)
        self.issn = issn
        # Count magazines in their own counter, not in Books.total_books.
        Magazines.total_mags += 1

    def print_info(self):
        """Print the title, ISSN and cost of this magazine."""
        print("Title of the magazine is", self.title)
        print("ISSN of the magazine is", self.issn)
        print("Cost of the magazine is", self.cost)

    @classmethod
    def print_total(cls):
        """Print how many magazines have been created so far."""
        print("Total magazines in the library are", cls.total_mags)


class Books(LibraryContent):
    """A book: library content with an author.

    Functions that are part of a class are called methods. Members of a
    class can be variables and methods, at object level or class level.
    """

    # Class-level count of the Books objects created.
    total_books = 0

    def __init__(self, title, author, price):
        """Store the book details and count the new book."""
        super().__init__(title, price)
        self.author = author
        Books.total_books += 1

    def print_info(self):
        """Print the title, author and cost of this book."""
        print("Title of the book is", self.title)
        print("Author of the book is", self.author)
        print("Cost of the book is", self.cost)

    @classmethod
    def print_total(cls):
        """Print how many books have been created so far."""
        print("Total books in the library are", cls.total_books)


class Library:
    """A library identified by its name, location and pin code."""

    # Class-level member: counts the Library objects created.
    total_lib = 0

    def __init__(self, name, loc, pincode):
        """Store the library details and count the new library."""
        self.name = name
        self.location = loc
        self.pin = pincode
        Library.total_lib += 1

    def print_info(self):
        """Print the name, location and pin code of this library."""
        print("Library: ", self.name)
        # Separate location and pin so they do not run together.
        print(f"Location: {self.location} - {self.pin}")

if __name__ == "__main__":
    m1 = Magazines("International Journal for Robotics", "247-9988", 19800)
    m1.print_info()
    b1 = Books("Python Book", "Virat", 299)
    b1.print_info()
    m1.print_info()
    # m1.__print_data() is not reachable from here: the double underscore
    # makes the method private to LibraryContent.
    # display_something() is inherited from LibraryContent.
    m1.display_something()

############ ANOTHER FILE #################
#Working with files:
# modes: r(read), w(write), a (append)
# r+, w+, a+

filename = "17DEC.txt"
# "a+" opens the file for appending and reading (the default mode would be
# "r"); the file is created when it does not exist.
with open(filename, "a+") as fileobj:
    fileobj.write('''Twinkle Twinkle little star
How I wonder what you are''')
    # In append mode writes go to the end of the file regardless of the
    # current position (on most platforms), so this seek does not make the
    # next write overwrite the first one.
    fileobj.seek(0)
    fileobj.write('''Twinkle X Twinkle X little star
How I wonder what you are''')
    # Rewind before reading; otherwise read() starts at end-of-file and
    # always returns an empty string.
    fileobj.seek(0)
    cont = fileobj.read()
# The with-block has already closed the file here.
print(cont)
# Split `total` rupees into Rs 5, Rs 2 and Rs 1 stamps, starting from a
# fixed minimum count of each denomination.
stamp_5 = 2
stamp_2 = 2
stamp_1 = 2
total = 51
stamp_1 += 3

# Amount still to be covered after the minimum stamps are counted.
rest_amount = total - (stamp_5 * 5 + stamp_2 * 2 + stamp_1 * 1)
# Distribute rest_amount greedily: as many 5s as fit, then 2s, then 1s.
more_5, rest_amount = divmod(rest_amount, 5)
stamp_5 += more_5

more_2, rest_amount = divmod(rest_amount, 2)
stamp_2 += more_2
stamp_1 += rest_amount

print(f"Rs 5 ={stamp_5}, Rs 2 = {stamp_2}, Rs 1 = {stamp_1}")
l1 = [10, 40, 20, 50, 30, 60]
# l1 = [10,20,30,40,50,60]
# Note kept from the original: 5 4 3 2 1 0
# (presumably the number of comparisons left in each successive pass).

# Bubble Sort: each pass pushes the largest remaining value to the end.
l1 = [60, 50, 40, 30, 20, 10]
for i in range(len(l1) - 1):
    swapped = False
    # The last i positions are already in place, so skip them.
    for j in range(len(l1) - 1 - i):
        if l1[j] > l1[j + 1]:
            l1[j], l1[j + 1] = l1[j + 1], l1[j]
            swapped = True
    # A pass without any swap means the list is already sorted.
    if not swapped:
        break
print("Sorted L1 = ", l1)

l1 = [10, 40, 20, 50, 30, 60]
l1 = [60, 50, 40, 30, 20, 10]
# Selection Sort: find the smallest value of the unsorted part and move
# it to position i with a single swap per pass.
for i in range(len(l1) - 1):
    smallest = i
    for j in range(1 + i, len(l1)):
        if l1[j] < l1[smallest]:
            smallest = j
    if smallest != i:
        l1[i], l1[smallest] = l1[smallest], l1[i]
print("Sorted L1 = ", l1)

## Check whether `element` is in the list or not.
element = 50
l1 = [10, 40, 20, 50, 30, 60]
# Method 1: walk over the values and stop at the first match.
found = False
for value in l1:
    if value == element:
        found = True
        break
if found:
    print("Element is in the list!")
else:
    print("Element is not in the list")
# Method 2: sequential (linear) search that also tracks the position.
found = False
for i, value in enumerate(l1):
    if value == element:
        found = True
        break
if found:
    print("Element is in the list!")
else:
    print("Element is not in the list")

# Binary search works on a sorted list: halve the search range each step.
L1 = [10, 20, 30, 40, 50, 60]
low, high = 0, len(L1) - 1

element = 51
found = False
while low <= high:
    mid = (low + high) // 2
    if L1[mid] == element:
        found = True
        break
    if element > L1[mid]:
        low = mid + 1   # the element can only be in the upper half
    else:
        high = mid - 1  # the element can only be in the lower half

if found:
    print(f"Binary Search: {element} is in the list!")
else:
    print(f"Binary Search: {element} is not in the list")

############

# Files
filename = "17DEC.txt"
# "r+" opens an existing file for reading and writing; it raises
# FileNotFoundError when the file is missing.
with open(filename, "r+") as filobj:
    content = filobj.read()  # whole file
    print("1. =============\n", content)
    filobj.seek(0)  # back to the beginning
    content = filobj.read(20)  # first 20 characters
    print("2. =============\n", content)
    content = filobj.read(20)  # next 20 characters
    print("3. =============\n", content)
    filobj.seek(0)
    content = filobj.readline()  # one line, including its newline
    print("4. =============\n", content)
    # Reads at most 5000 characters of the current line.
    content = filobj.readline(5000)
    print("5. =============\n", content)

    filobj.seek(0)
    content = filobj.readlines()  # list with one string per line
    print("6. =============\n", content)

# Opening again in write mode: "w" empties the file first.
with open(filename, "w") as filobj:
    content = """filename="17DEC.txt"
filobj = open(filename, "r+")
content = filobj.read()
print("1. =============\n",content)
filobj.seek(0)
content = filobj.read(20)
print("2. =============\n",content)
content = filobj.read(20)
print("3. =============\n",content)
filobj.seek(0)"""
    # write() takes a single string.
    filobj.write(content)

    content = ['filename="17DEC.txt"\n', 'filobj = open(filename, "r+")\n',
               'content = filobj.read()\n', 'print("1. =============\n',
               '",content)\n', 'filobj.seek(0)\n', 'content = filobj.read(20)\n',
               'print("2. =============\n', '",content)\n',
               'content = filobj.read(20)\n', 'print("3. =============\n',
               '",content)\n', 'filobj.seek(0)']

    # writelines() takes a list of strings and adds no newlines itself.
    filobj.writelines(content)
”’
two types:
1. OLTP – Online Transaction Processing
2. OLAP – Online Analytical Processing

RDBMS – Relational Database Management System

SQL – Structured Query Language – language of Database
SELECT, INSERT, UPDATE, DELETE
CREATE, DROP

Table: EMPLOYEES
EMPID ENAME EPHONE EEMAIL DEPT DHEAD DCODE DLOC
1 AA 123 aa@aa.com Executive AA E01 NY
2 AB 223 ab@aa.com Executive AA E01 ML

Relationship:
1:1 – All the columns in the same table
1:M / M:1 – Put them in 2 tables and connect them using a Foreign Key
M:M – Put them in 2 different tables and connect them using a 3rd table

— MYSQL database
https://dev.mysql.com/downloads/mysql/
Download and install 8.0.35
1. Server
2. Client
3. Workbench
Connection requires: you know the server location, username, password, database_name

# Data types in MYSQL:
https://dev.mysql.com/doc/refman/8.0/en/data-types.html


use ouremployees;

create table departments (
DID integer primary key,
dname varchar(20),
dhod varchar(20),
dcode varchar(10));

insert into departments values(101,’PD’,’MR PD’,’234AWER439′);
select * from departments;

select * from employees;

”’
# connect to MYSQL
import pymysql
# NOTE(review): credentials are hard-coded for the tutorial; do not keep
# real passwords in source code.
hostname, dbname, username, password = "localhost", "ouremployees", "root", "learnSQL"
db_con = pymysql.connect(host=hostname, database=dbname, user=username, password=password)

# try/finally closes the connection even when a statement fails.
try:
    db_cursor = db_con.cursor()

    # Table creation, kept for reference; run it once by uncommenting
    # the execute() call below.
    sql1 = """Create table employees(empid integer primary key,
name varchar(30),
phone varchar(10),
did integer,
Foreign key (DID) references Departments(DID))
"""
    # db_cursor.execute(sql1)

    sql1 = """Insert into Employees values(
101, 'Sachin T','3456555',101)"""
    # db_cursor.execute(sql1)
    # Data changes are only saved once they are committed.
    db_con.commit()

    sql1 = """Select * from Employees"""
    db_cursor.execute(sql1)
    results = db_cursor.fetchall()
    # Print every row returned by the query.
    for row in results:
        print(row)
finally:
    db_con.close()

”’
Two types of statistics:
Descriptive stats: Central tendency: mean, median, mode
Measures of dispersion: range, variance & standard deviation


MF1 – 15% – 10-20%
MF2 – 15% – -10% – 50%

5 & 6 = 5.5
1 & 10 – 5.5

Inferential stats:
”’

# NUMPY – core scientific library
import numpy as np
# Arrange the numbers 0..14 in a matrix of 5 rows and 3 columns.
vals = range(15)
mat1 = np.reshape(vals, (5, 3))
print(mat1)
print("1: ", mat1[3, 1])           # row 3, column 1
print("2: ", mat1[:5, 1])          # column 1 of the first 5 rows
print("3: \n", mat1[1:4, 1:])      # rows 1-3, columns from 1 onwards
print("Shape: ", mat1.shape)
print("Number of rows =", mat1.shape[0])
print("Number of columns =", mat1.shape[1])
# Linear equations to solve later (scipy):
# 2x + 5y = 15
# 3x + 4y = 20

# 4 x 4 matrices filled with a constant value.
mat2 = np.zeros((4, 4))
print("Matrix 2 = \n", mat2)
mat2 = np.ones((4, 4))
print("Matrix 2 = \n", mat2)

mat2 = np.full((4, 4), 5.001)
print("Matrix 2 = \n", mat2)

# 3 x 3 matrix of random floats in the range [0, 1).
mat3 = np.random.random((3, 3))
print(