Swapnil Saurav

PART 2: DATA SCIENCE NOV 2023

CLICK HERE TO ACCESS PART 1 oF THE TUTORIAL

# NUMPY
# pip install numpy
import numpy as np
nums = range(16)
nums = np.reshape(nums,(8,2))
print(nums)
nums = np.reshape(nums,(4,4))
print(nums)
print(“Shape: Rows = “,nums.shape[0], “and columns = “,nums.shape[1])
# indexing
print(nums[1,2], nums[-3,-2])
print(nums[1]) # 2nd row
print(nums[:,1]) # : rows from 0th to (n-1)th
print(nums[-1], nums[:,-2], nums[-1,-2])

# to give your own set of values, you need to provide in terms of list
l1 = [[1,5,7],[2,4,9],[1,1,3],[3,3,2]]
# array is a function to convert list into numpy
mat1 = np.array(l1)
print(mat1)

print(np.zeros((3,3)))
print(np.ones((3,3)))
print(np.full((5,7),2.0))
print(np.full((5,7),9))

# eye – identity matrix: square matrix with 1 on its main diagonal
mat1 = np.eye(5)
print(mat1)

# NUMPY
import numpy as np
# to give your own set of values, you need to provide in terms of list
l1 = [[1,5,7],[2,4,9],[1,1,3],[3,3,2]]
# array is a function to convert list into numpy
mat1 = np.array(l1) # 4 * 3 – shape
print(mat1)
l2 = [[2,3,4],[2,1,2],[5,2,3],[3,2,2]]
# array is a function to convert list into numpy
mat2 = np.array(l2)
print(mat2)

# Matrices operations
print(mat1 + mat2)
print(np.add(mat1, mat2))

print(mat1 – mat2)
print(np.subtract(mat1, mat2))

print(mat1 * mat2)
print(np.multiply(mat1, mat2))

print(mat1 / mat2)
print(np.divide(mat1, mat2))

# actual matrix multiplication is done using matmul()
l3 = [[2,3,4],[2,1,2],[5,2,3]]
# array is a function to convert list into numpy
mat3 = np.array(l3)
print(mat3)
print(“Matrix Multiplication”)
print(np.matmul(mat1, mat3))
print(mat1 @ mat3)
## calculating determinant

l4 = [[1,3,5],[1,3,1],[2,3,4]]
mat5 = np.array(l4)
det_mat5 = np.linalg.det(mat5)
print(“Determinant of matrix 5 is”,det_mat5)
print(“Inverse of matrix 5 is: \n,np.linalg.inv(mat5))

”’
Linear Algebra Equation:
x1 + 5×2 = 7
-2×1 – 7×2 = -5

x1 = -8, x2= 3,
”’
coeff_mat = np.array([[1,5],[-2,-7]])
#var_mat = np.array([[x1],[x2]])
result_mat = np.array([[7],[-5]])
# equation here is coeff_mat * var_mat = result_mat [eg: 5 * x = 10]
# which is, var_mat = coeff_mat inv * result_mat
det_coeff_mat = np.linalg.det(coeff_mat)
if det_coeff_mat !=0:
var_mat = np.linalg.inv(coeff_mat) @ result_mat
print(“X1 = “,var_mat[0,0])
print(“X2 = “,var_mat[1,0])
else:
print(“Solution is not possible”)

# # scipy = scientific python
# pip install scipy
”’
#Inequality = OPTIMIZATION or MAXIMIZATION / MINIMIZATION PROBLEM
Computer Parts Assembly:
Laptops & Desktops
profit: 1000, 600
objective: either maximize profit or minimize cost

constraints:
1. Demand: 500, 600
2. Parts: Memory card: 5000 cards available
3. Manpower: 25000 minutes


”’

”’
Optimization using Scipy
let’s assume d = desktop, n = notebooks

Constraints:
1. d + n <= 10000
2. 2d + n <= 15000
3. 3d + 4n <= 25000

profit: 1000 d + 750 n => maximize
-1000d – 750 n =>minimize

”’
import numpy as np
from scipy.optimize import minimize, linprog
d = 1
n = 1
profit_d = 1000
profit_n = 750
profit = d * profit_d + n * profit_n
obj = [-profit_d, -profit_n]
lhs_con = [[1,1],[2,1],[3,4]]
rhs_con = [10000, 15000, 25000]

boundary = [(0, float(“inf”)), # boundary condition for # of desktops
(10, 200000)] # we just added some limit for notebooks
opt = linprog(c=obj, A_ub=lhs_con, b_ub=rhs_con, bounds=boundary, method=“revised simplex”)
print(opt)
if opt.success:
print(f”Number of desktops = {opt.x[0]} and number of laptops = {opt.x[1]})
print(“Maximum profit that can be generated = “,-1 * opt.fun)
else:
print(“Solution can not be generated”)

### ### ### PANDAS
# Pandas – dataframe which resembles Table structure
# pip install pandas
import pandas as pd
df1 = pd.DataFrame()
print(df1)
print(type(df1))

# fruit production
data = [[“Apple”, 15000, 11000,6000],
[“Banana”, 18000,22000,29000],
[“Mango”, 2, 900, 19000],
[“Guava”, 19000,11000,25000]]

fruit_production = pd.DataFrame(data)
print(fruit_production)
print(“Slicing 1:\n)
print(fruit_production.iloc[1:3,2:]) #based on index
print(“Slicing 2:\n)
print(fruit_production.loc[1:3,2:]) #based on title(names)

fruit_production = pd.DataFrame(data,
columns=[“Fruits”,“January”,“February”,“March”])
print(fruit_production)

fruit_production = pd.DataFrame(data,
columns=[“Fruits”,“January”,“February”,“March”],
index=[“Fruit 1”,“Fruit 2”,“Fruit 3”,“Fruit 4”])
print(fruit_production)

## dataframe.loc() dataframe.iloc()

print(“Slicing 1:\n)
print(fruit_production.iloc[1:3,2:]) #based on index
print(“Slicing 2:\n)
print(fruit_production.loc[[“Fruit 2”, “Fruit 3”],[“February”,“March”]]) #based on title(names)

### ###

# pandas
# pip install pandas
import pandas as pd
l1 = [10,20,30,40,50]
l1 = [[“Sachin”,101,20000,“BATSMAN”],[“Kapil”,501,12000,“BOWLER”],
[“Sunil”,12,21000,“BATSMAN”],[“Zaheer”,725,2000,“BOWLER”]]
df1 = pd.DataFrame(l1,columns=[“Player”,“Wickets”,“Runs”,“Type”],
index=[“Player 1”,“Player 2”,“Player 3”,“Player 4”])
print(df1)

d1 = {‘Apple’:[12000,11000,13000],
‘Banana’: [17000,18000,19000],
‘Mango’:[11000,13000,15000]}
df2 = pd.DataFrame(d1)
print(df2)

# creating dataframe from list of dictionary
data1 = [{“Guava”:9000, “Oranges”: 5000},
{“Guava”:8000, “Oranges”: 7000},
{“Guava”:10000, “Oranges”: 6000}]
df3 = pd.DataFrame(data1)
print(df3)

print(df3.iloc[0,:]) #first row and all column values
print(df3.iloc[:,0])

print(df2.iloc[:,0:2])
print(df2.iloc[[0,2],[0,2]])

#
print(df2.loc[[0,2],[“Apple”,“Mango”]])
print(df1.loc[[“Player 1”,“Player 4”],[“Player”,“Runs”]])

df2.iloc[2,0] = 14000
print(df2)
print(“========= DF1 =============”)
df1[‘Avg’] = df1[‘Runs’] / df1[“Wickets”]
print(df1)
print(“Reading data from DF1: “)
df4 = df1[df1.Player !=‘Sachin’] #filter where clause
print(\n\n New dataset without Sachin: \n, df4)
df1 = df1.drop(“Player”,axis=1) # axis default is 0
# unlike pop() and del – drop() returns a new dataframe
print(df1)


print(“Average Wickets of all the players = “,df1[‘Wickets’].mean())
print(“Average Wickets of players by type = \n\n,df1.groupby(‘Type’).mean())
# axis = 0 refers to rows
# axis = 1 refers to columns

print(\n\nDropping columns from DF1: “)
del df1[‘Wickets’] #dropping column Wickets using del
print(df1)

df1.pop(‘Runs’) #dropping column using pop
print(df1)
#

import pandas as pd

ud_df = pd.read_csv(“D:/datasets/gitdataset/user_device.csv”)
print(ud_df) # 272 rows x 6 columns
print(“Rows: “,ud_df.shape[0])
print(“Columns: “,ud_df.shape[1])

print(ud_df.tail(1))
print(ud_df.head(1))

use_df = pd.read_csv(“D:/datasets/gitdataset/user_usage.csv”)
print(use_df) # 240 rows x 4 columns

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
‘outgoing_mins_per_month’]], ud_df,
on=‘use_id’)
print(result_df) # [159 rows x 9 columns] = ud_df: 159 + 113, use_df = 159 + 81

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
‘outgoing_mins_per_month’]], ud_df,
on=‘use_id’, how=‘outer’)
print(result_df)

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
‘outgoing_mins_per_month’]], ud_df,
on=‘use_id’, how=‘left’)
print(result_df)

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
‘outgoing_mins_per_month’]], ud_df,
on=‘use_id’, how=‘right’)
print(result_df)

## Working with Pandas – Example ##
import pandas as pd
import numpy as np
df = pd.read_csv(“D:/datasets/gitdataset/hotel_bookings.csv”)
print(df.shape)
print(df.dtypes)
”’
numeric – int, float
categorical – 1) Nominal – there is no order 2) Ordinal – here order is imp
”’
df_numeric = df.select_dtypes(include=[np.number])
print(df_numeric)

df_object= df.select_dtypes(exclude=[np.number])
print(df_object) # categorical and date columns

print(df.columns)
for col in df.columns:
missing = np.mean(df[col].isnull())
if missing >0:
print(f”{col}{missing})

”’
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
a. Handle missing values
b. Feature scaling – scale the values in the column to similar range
c. Outliers / data correction
d. handling categorical data:
i. Encode the data to convert text to number
East = 0, North = 1, South = 2, West = 3
ii. Column Transform into multple columns
iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING – Divide the train and test


”’
import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
print(df)

Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
a. Handle missing values
b. Feature scaling – scale the values in the column to similar range
c. Outliers / data correction
d. handling categorical data:
i. Encode the data to convert text to number
East = 0, North = 1, South = 2, West = 3
ii. Column Transform into multple columns
iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING –
a. Divide the train and test
b. Run the model
6. EVALUATE THE MODEL:
a. Measure the performance of each algorithm on the test data
b. Metric to compare: based on Regression (MSE, RMSE, R square) or
classification (confusion matrix -accuracy, sensitivity..)
c. select the best performing model
7. DEPLOY THE BEST PERFORMING MODEL

Hypothesis test:
1. Null Hypothesis (H0): starting statement (objective)
Alternate Hypethesis (H1): Alternate of H0

Z or T test:
Chi square test: both are categorical

e.g. North zone: 50 WIN 5 LOSS – p = 0.005

# simple (single value) v composite (specifies range)
# two tailed test v one tailed test [H0: mean = 0,
H1 Left Tailed: mean <0
H1 Right Tailed: mean >0
# level of significance:
alpha value: confidence interval – 95%
p value: p value <0.05 – we reject Null Hypothesis

import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print(“X: \n”)
#print(X)
#print(“Y: \n”)
#print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, its sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=‘mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])
print(X)

import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print(“X: \n”)
#print(X)
#print(“Y: \n”)
#print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, its sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=‘mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=‘passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
print(X_train)
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)

”’
Regression: Output (Marks) is a continous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that dataset is linear) Regression
X – independent variable(s)
Y – dependent variable
”’
import pandas as pd
import matplotlib.pyplot as plt
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv”
df = pd.read_csv(link)
X = df.iloc[:,:1].values
y = df.iloc[:,1].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)
”’

# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Hours’],y=df[‘Marks’])
plt.show()
”’
Scatter plots – shows relationship between X and Y variables. You can have:
1. Positive correlation:
2. Negative correlation:
3. No Correlation
4. Correlation: 0 to +/- 1
5. Correlation value: 0 to +/- 0.5 : no correlation
6. Strong correlation value will be closer to +/- 1
7. Equation: straight line => y = mx + c
”’
# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’

## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_})

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output
”’
Regression: Output (Marks) is a continous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that dataset is linear) Regression
X – independent variable(s)
Y – dependent variable
”’
import pandas as pd
import matplotlib.pyplot as plt
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv”
df = pd.read_csv(link)
X = df.iloc[:,:1].values
y = df.iloc[:,1].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)
”’

# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Hours’],y=df[‘Marks’])
plt.show()
”’
Scatter plots – shows relationship between X and Y variables. You can have:
1. Positive correlation:
2. Negative correlation:
3. No Correlation
4. Correlation: 0 to +/- 1
5. Correlation value: 0 to +/- 0.5 : no correlation
6. Strong correlation value will be closer to +/- 1
7. Equation: straight line => y = mx + c
”’
# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’

## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_})

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print(“Root Mean Squared Error (Variance) = “,mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(“Mean Absolute Error = “,mae)
print(“R Square is (Variance)”,metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print(“Root Mean Squared Error (Bias) = “,mse**0.5)
print(“R Square is (Bias)”,metrics.r2_score(y_train, y_pred_tr))
## Bias v Variance

import pandas as pd
import matplotlib.pyplot as plt
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv”
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[3])],remainder=‘passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)


# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Administration’],y=df[‘Profit’])
plt.show()
plt.scatter(x=df[‘R&D Spend’],y=df[‘Profit’])
plt.show()
plt.scatter(x=df[‘Marketing Spend’],y=df[‘Profit’])
plt.show()

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’


## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_})

# y = -3791.2 x Florida -3090.1 x California + 0.82 R&D – 0.05 Admin + 0.022 Marketing+ 56650


y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print(“Root Mean Squared Error (Variance) = “,mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(“Mean Absolute Error = “,mae)
print(“R Square is (Variance)”,metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print(“Root Mean Squared Error (Bias) = “,mse**0.5)
print(“R Square is (Bias)”,metrics.r2_score(y_train, y_pred_tr))

”’
Case 1: All the columns are taken into account:
Mean Absolute Error = 8696.887641252619
R Square is (Variance) 0.884599945166969
Root Mean Squared Error (Bias) = 7562.5657508560125
R Square is (Bias) 0.9624157828452926
”’
## Testing

import statsmodels.api as sm
import numpy as np
X = np.array(X, dtype=float)
print(“Y:\n,y)
summ1 = sm.OLS(y,X).fit().summary()
print(“Summary of All X \n—————-\n:”,summ1)

import pandas as pd
import matplotlib.pyplot as plt
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv”
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[3])],remainder=‘passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)

”’
After doing Backward elemination method we realized that all the state columns
are not significantly impacting the analysis hence removing those 2 columns too.
”’
X = X[:,2:] # after backward elemination

# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Administration’],y=df[‘Profit’])
plt.show()
plt.scatter(x=df[‘R&D Spend’],y=df[‘Profit’])
plt.show()
plt.scatter(x=df[‘Marketing Spend’],y=df[‘Profit’])
plt.show()

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’


## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_})

# y = -3791.2 x Florida -3090.1 x California + 0.82 R&D – 0.05 Admin + 0.022 Marketing+ 56650


y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print(“Root Mean Squared Error (Variance) = “,mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(“Mean Absolute Error = “,mae)
print(“R Square is (Variance)”,metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print(“Root Mean Squared Error (Bias) = “,mse**0.5)
print(“R Square is (Bias)”,metrics.r2_score(y_train, y_pred_tr))

”’
Case 1: All the columns are taken into account:
Mean Absolute Error = 8696.887641252619
R Square is (Variance) 0.884599945166969
Root Mean Squared Error (Bias) = 7562.5657508560125
R Square is (Bias) 0.9624157828452926
”’
## Testing

import statsmodels.api as sm
import numpy as np
X = np.array(X, dtype=float)
#X = X[:,[2,3,4]]
print(“Y:\n,y)
summ1 = sm.OLS(y,X).fit().summary()
print(“Summary of All X \n—————-\n:”,summ1)

## Test for linearity
# 1. All features (X) should be correlated to Y
# 2. Multicollinearity: Within X there should not be any correlation,
# if its there then take any one for the analysis

import pandas as pd
import matplotlib.pyplot as plt
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/4_Position_Salaries.csv”
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,1:2].values
y = df.iloc[:,2].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[3])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)

”’
”’
After doing Backward elemination method we realized that all the state columns
are not significantly impacting the analysis hence removing those 2 columns too.

X = X[:,2:] # after backward elemination
”’
”’
# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.show()
”’

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=100)
print(X_train)

from sklearn.linear_model import LinearRegression
from sklearn import metrics
”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’
”’
#Since dataset is too small, lets take entire data for training
X_train, y_train = X,y
X_test, y_test = X,y
”’
”’
## RUN THE MODEL

regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}”)

# y =
y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output

mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print(“Root Mean Squared Error (Variance) = “,mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(“Mean Absolute Error = “,mae)
print(“R Square is (Variance)”,metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print(“Root Mean Squared Error (Bias) = “,mse**0.5)
print(“R Square is (Bias)”,metrics.r2_score(y_train, y_pred_tr))

# Plotting the data for output
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.plot(X,y_pred)
plt.xlabel(“Level”)
plt.ylabel(“Salary”)
plt.show()
”’

# 3. Model – Polynomial regression analysis
# y = C + m1 * X + m2 * x square
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

for i in range(1,10):
#prepare the parameters
parameters = [(‘polynomial’, PolynomialFeatures(degree=i)),(‘modal’,LinearRegression())]
pipe = Pipeline(parameters)
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X)
## Bias is based on training data
y_pred_tr = pipe.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
rmse_tr = mse ** 0.5
print(“Root Mean Squared Error (Bias) = “,rmse_tr)
print(“R Square is (Bias)”,metrics.r2_score(y_train, y_pred_tr))

## Variance is based on validation data
y_pred_tt = pipe.predict(X_test)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred_tt)
rmse_tt = mse ** 0.5
print(“Root Mean Squared Error (Variance) = “, rmse_tt)
print(“R Square is (Variance)”, metrics.r2_score(y_test, y_pred_tt))
print(“Difference Between variance and bias = “,rmse_tt – rmse_tr)
# Plotting the data for output
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.plot(X,y_pred)
plt.title(“Polynomial Analysis degree =”+str(i))
plt.xlabel(“Level”)
plt.ylabel(“Salary”)
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
#link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/4_Position_Salaries.csv”
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv”
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,0:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[3])],remainder=‘passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)


”’
After doing Backward elemination method we realized that all the state columns
are not significantly impacting the analysis hence removing those 2 columns too.

X = X[:,2:] # after backward elemination
”’
”’
# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.show()
”’

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=100)
print(X_train)

from sklearn.linear_model import LinearRegression
from sklearn import metrics
”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’
”’
#Since dataset is too small, lets take entire data for training
X_train, y_train = X,y
X_test, y_test = X,y
”’

## RUN THE MODEL – Support Vector Machine Regressor (SVR)
from sklearn.svm import SVR
#regressor = SVR(kernel=’linear’)
#regressor = SVR(kernel=’poly’,degree=2,C=10)
# Assignment – Best value for gamma: 0.01 to 1 (0.05)
regressor = SVR(kernel=“rbf”,gamma=0.1,C=10)
# fit – train the model
regressor.fit(X_train, y_train)


# y =
y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output

mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print(“Root Mean Squared Error (Variance) = “,mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(“Mean Absolute Error = “,mae)
print(“R Square is (Variance)”,metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print(“Root Mean Squared Error (Bias) = “,mse**0.5)
print(“R Square is (Bias)”,metrics.r2_score(y_train, y_pred_tr))


# Plotting the data for output
plt.scatter(X_train[:,2],y_pred_tr)
#plt.plot(X_train[:,2],y_pred_tr)
plt.show()

#Decision Tree & Random Forest
import pandas as pd
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv”
link = “D:\\datasets\\3_Startups.csv”
df = pd.read_csv(link)
print(df)

#X = df.iloc[:,:4].values
X = df.iloc[:,:1].values
y = df.iloc[:,:-1].values
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=100)

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

# Baging, Boosting, Ensemble
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10)
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

## Assignment these algorithms and check the RMSE and R square values

# Ridge Lasso Elasticnet
import pandas as pd
link=“https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/student_scores_multi.csv”
df = pd.read_csv(link)
print(df)
X = df.iloc[:,0:3].values
y = df.iloc[:,3].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.85, random_state=100)

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
lr_ridge = Ridge(alpha=0.8)
lr_ridge.fit(X_train,y_train)
y_ridge_pred = lr_ridge.predict(X_test)

from sklearn.metrics import r2_score
r2_ridge_test = r2_score(y_test, y_ridge_pred)

y_ridge_pred_tr = lr_ridge.predict(X_train)
r2_ridge_train = r2_score(y_train, y_ridge_pred_tr)
print(f”Ridge Regression: Train R2 = {r2_ridge_train} and Test R2={r2_

# Classifications algorithm: supervised algo which predicts the class
”’
classifier: algorithm that we develop
model: training and predicting the outcome
features: the input data (columns)
target: class that we need to predict
classification: binary (2 class outcome) or multiclass (more than 2 classes)

Steps to run the model:
1. get the data
2. preprocess the data
3. eda
4. train the model
5. predict the model
6. evaluate the model

”’
#1. Logistic regression
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv”
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Build the model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1,X2 = np.meshgrid(np.arange(start = x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
np.arange(start = x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1,X2,classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape),
cmap=ListedColormap((‘red’,‘green’)))
plt.show()

https://designrr.page/?id=155238&token=545210681&type=FP&h=7849

# Classifications algorithm: supervised algo which predicts the class
”’
classifier: algorithm that we develop
model: training and predicting the outcome
features: the input data (columns)
target: class that we need to predict
classification: binary (2 class outcome) or multiclass (more than 2 classes)

Steps to run the model:
1. get the data
2. preprocess the data
3. eda
4. train the model
5. predict the model
6. evaluate the model

”’
#1. Logistic regression
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv”
link = “D:\\datasets\\5_Ads_Success.csv”
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Build the model
”’
## LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
”’
from sklearn.svm import SVC
”’
## Support Vector Machine – Classifier
classifier = SVC(kernel=’linear’)

classifier = SVC(kernel=’rbf’,gamma=100, C=100)
”’
from sklearn.neighbors import KNeighborsClassifier
## Refer types of distances:
# https://designrr.page/?id=200944&token=2785938662&type=FP&h=7229

classifier = KNeighborsClassifier(n_neighbors=9, metric=‘minkowski’)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1,X2 = np.meshgrid(np.arange(start = x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
np.arange(start = x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1,X2,classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape),
cmap=ListedColormap((‘red’,‘green’)))

#Now we will plot training data
for i, j in enumerate(np.unique(y_set)):
plt.scatter(x_set[y_set==j,0],
x_set[y_set==j,1], color=ListedColormap((“red”,“green”))(i),
label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(“Confusion Matrix: \n,cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print(“classification_report: \n,cr)
print(“accuracy_score: “,accs)

import sklearn.tree

link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv”
link = “D:\\datasets\\5_Ads_Success.csv”
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Build the model
”’
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion=”gini”)
”’
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=39, criterion=“gini”)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1,X2 = np.meshgrid(np.arange(start = x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
np.arange(start = x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1,X2,classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape),
cmap=ListedColormap((‘red’,‘green’)))

#Now we will plot training data
for i, j in enumerate(np.unique(y_set)):
plt.scatter(x_set[y_set==j,0],
x_set[y_set==j,1], color=ListedColormap((“red”,“green”))(i),
label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(“Confusion Matrix: \n,cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print(“classification_report: \n,cr)
print(“accuracy_score: “,accs)

”’
# Show decision tree created

output = sklearn.tree.export_text(classifier)
print(output)
# visualize the tree
fig = plt.figure(figsize=(40,60))
tree_plot = sklearn.tree.plot_tree(classifier)
plt.show()
”’

”’
In Ensemble Algorithms – we run multiple algorithms to improve the performance
of a given business objective:
1. Boosting: When you run same algorithm – Input varies based on weights
2. Bagging: When you run same algorithm – average of all
3. Stacking: Over different algorithms – average of all
”’

import sklearn.tree

link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv”
link = “D:\\datasets\\5_Ads_Success.csv”
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different range of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Build the model
”’
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion=”gini”)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=39, criterion=”gini”)
”’
from sklearn.ensemble import AdaBoostClassifier
classifier = AdaBoostClassifier(n_estimators=7)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1,X2 = np.meshgrid(np.arange(start = x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
np.arange(start = x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1,X2,classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape),
cmap=ListedColormap((‘red’,‘green’)))

#Now we will plot training data
for i, j in enumerate(np.unique(y_set)):
plt.scatter(x_set[y_set==j,0],
x_set[y_set==j,1], color=ListedColormap((“red”,“green”))(i),
label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(“Confusion Matrix: \n,cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print(“classification_report: \n,cr)
print(“accuracy_score: “,accs)

”’
# Show decision tree created

output = sklearn.tree.export_text(classifier)
print(output)
# visualize the tree
fig = plt.figure(figsize=(40,60))
tree_plot = sklearn.tree.plot_tree(classifier)
plt.show()
”’

”’
In Ensemble Algorithms – we run multiple algorithms to improve the performance
of a given business objective:
1. Boosting: When you run same algorithm – Input varies based on weights
2. Bagging: When you run same algorithm – average of all
3. Stacking: Over different algorithms – average of all
”’

https://designrr.page/?id=36743&token=2022711066&type=FP&h=3547

 

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

 

X,y = make_blobs(n_samples=300, n_features=3, centers=4)
plt.scatter(X[:,0], X[:,1])
plt.show()

 

from sklearn.cluster import KMeans
km = KMeans(n_clusters=5, init=“random”,max_iter=100)
y_cluster =km.fit_predict(X)

 

plt.scatter(X[y_cluster==0,0],X[y_cluster==0,1],c=“blue”,label=“Cluster A”)
plt.scatter(X[y_cluster==1,0],X[y_cluster==1,1],c=“red”,label=“Cluster B”)
plt.scatter(X[y_cluster==2,0],X[y_cluster==2,1],c=“green”,label=“Cluster C”)
plt.scatter(X[y_cluster==3,0],X[y_cluster==3,1],c=“black”,label=“Cluster D”)
plt.scatter(X[y_cluster==4,0],X[y_cluster==4,1],c=“orange”,label=“Cluster E”)
plt.show()

 

distortion = []
max_centers = 30
for i in range(1,max_centers):
km = KMeans(n_clusters=i, init=“random”, max_iter=100)
y_cluster = km.fit(X)
distortion.append(km.inertia_)

 

print(“Distortion:\n,distortion)
plt.plot(range(1,max_centers),distortion,marker=“o”)
plt.show()

 

import pandas as pd
import matplotlib.pyplot as plt
link = “D:\\Datasets\\USArrests.csv”
df = pd.read_csv(link)
#print(df)
X = df.iloc[:,1:]
from sklearn.preprocessing import normalize
data = normalize(X)
data = pd.DataFrame(data)
print(data)

## plotting dendogram
import scipy.cluster.hierarchy as sch
dendo = sch.dendrogram(sch.linkage(data, method=‘ward’))
plt.axhline(y=0.7,color=“red”)
plt.show()

link = “D:\\datasets\\Market_Basket_Optimisation.csv”
import pandas as pd
df = pd.read_csv(link)
print(df)
from apyori import apriori
transactions = []
for i in range(len(df)):
if i%100==0:
print(“I = “,i)
transactions.append([str(df.values[i,j]) for j in range(20)])

## remove nan from the list
print(“Transactions:\n,transactions)

association_algo = apriori(transactions, min_confidence=0.2, min_support=0.02, min_lift=2)
print(“Association = “,list(association_algo))

”’
Time Series Forecasting – ARIMA method

1. Read and visualize the data
2. Stationary series
3. Optimal parameters
4. Build the model
5. Prediction
”’
import pandas as pd
#Step 1: read the data
link = “D:\\datasets\\gitdataset\\AirPassengers.csv”
air_passengers = pd.read_csv(link)

”’
#Step 2: visualize the data
import plotly.express as pe
fig = pe.line(air_passengers,x=”Month”,y=”#Passengers”)
fig.show()
”’
# Cleaning the data
from datetime import datetime
air_passengers[‘Month’] = pd.to_datetime(air_passengers[‘Month’])
air_passengers.set_index(‘Month’,inplace=True)

#converting to time series data
import numpy as np
ts_log = np.log(air_passengers[‘#Passengers’])
#creating rolling period – 12 months
import matplotlib.pyplot as plt
”’
moving_avg = ts_log.rolling(12).mean
plt.plot(ts_log)
plt.plot(moving_avg)
plt.show()
”’
#Step 3: Decomposition into: trend, seasonality, error ( or residual or noise)
”’
Additive decomposition: linear combination of above 3 factors:
Y(t) =T(t) + S(t) + E(t)

Multiplicative decomposition: product of 3 factors:
Y(t) =T(t) * S(t) * E(t)
”’
from statsmodels.tsa.seasonal import seasonal_decompose
decomposed = seasonal_decompose(ts_log,model=“multiplicative”)
decomposed.plot()
plt.show()

# Step 4: Stationary test
”’
To make Time series analysis, the TS should be stationary.
A time series is said to be stationary if its statistical properties
(mean, variance, autocorrelation) doesnt change by a large value
over a period of time.
Types of tests:
1. Augmented Dickey Fuller test (ADH Test)
2. Kwiatkowski Phillips Schnidt Shin (KPSS) test
3. Phillips Perron (PP) Test

Null Hypothesis: The time series is not stationary
Alternate Hypothesis: Time series is stationary
If p >0.05 we reject Null Hypothesis
”’
from statsmodels.tsa.stattools import adfuller
result = adfuller(air_passengers[‘#Passengers’])
print(“ADF Stats: \n,result[0])
print(“p value = “,result[1])
”’
To reject Null hypothesis, result[0] less than 5% critical region value
and p > 0.05
”’

# Run the model
”’
ARIMA model: Auto-Regressive Integrative Moving Average
AR: p predicts the current value
I: d integrative by removing trend and seasonality component from previous period
MA: q represents Moving Average

AIC- Akaike’s Information Criterion (AIC) – helps to find optimal p,d,q values
BIC – Bayesian Information Criterion (BIC) – alternative to AIC
”’
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(air_passengers[‘#Passengers’].diff().dropna())
plot_pacf(air_passengers[‘#Passengers’].diff().dropna())
plt.show()
”’
How to read above graph:
To find q (MA), we look at the Autocorrelation graph and see where there is a drastic change:
here, its at 1, so q = 1 (or 2 as at 2, it goes to -ve)

To find p (AR) – sharp drop in Partial Autocorrelation graph:
here, its at 1, so p = 1 (or 2 as at 2, it goes to -ve)

for d (I) – we need to try with multiple values
intially we will take as 1

”’
”’
Time Series Forecasting – ARIMA method

1. Read and visualize the data
2. Stationary series
3. Optimal parameters
4. Build the model
5. Prediction
”’
import pandas as pd
#Step 1: read the data
link = “D:\\datasets\\gitdataset\\AirPassengers.csv”
air_passengers = pd.read_csv(link)

”’
#Step 2: visualize the data
import plotly.express as pe
fig = pe.line(air_passengers,x=”Month”,y=”#Passengers”)
fig.show()
”’
# Cleaning the data
from datetime import datetime
air_passengers[‘Month’] = pd.to_datetime(air_passengers[‘Month’])
air_passengers.set_index(‘Month’,inplace=True)

#converting to time series data
import numpy as np
ts_log = np.log(air_passengers[‘#Passengers’])
#creating rolling period – 12 months
import matplotlib.pyplot as plt
”’
moving_avg = ts_log.rolling(12).mean
plt.plot(ts_log)
plt.plot(moving_avg)
plt.show()
”’
#Step 3: Decomposition into: trend, seasonality, error ( or residual or noise)
”’
Additive decomposition: linear combination of above 3 factors:
Y(t) =T(t) + S(t) + E(t)

Multiplicative decomposition: product of 3 factors:
Y(t) =T(t) * S(t) * E(t)
”’
from statsmodels.tsa.seasonal import seasonal_decompose
decomposed = seasonal_decompose(ts_log,model=“multiplicative”)
decomposed.plot()
plt.show()

# Step 4: Stationary test
”’
To make Time series analysis, the TS should be stationary.
A time series is said to be stationary if its statistical properties
(mean, variance, autocorrelation) doesnt change by a large value
over a period of time.
Types of tests:
1. Augmented Dickey Fuller test (ADH Test)
2. Kwiatkowski Phillips Schnidt Shin (KPSS) test
3. Phillips Perron (PP) Test

Null Hypothesis: The time series is not stationary
Alternate Hypothesis: Time series is stationary
If p >0.05 we reject Null Hypothesis
”’
from statsmodels.tsa.stattools import adfuller
result = adfuller(air_passengers[‘#Passengers’])
print(“ADF Stats: \n,result[0])
print(“p value = “,result[1])
”’
To reject Null hypothesis, result[0] less than 5% critical region value
and p > 0.05
”’

# Run the model
”’
ARIMA model: Auto-Regressive Integrative Moving Average
AR: p predicts the current value
I: d integrative by removing trend and seasonality component from previous period
MA: q represents Moving Average

AIC- Akaike’s Information Criterion (AIC) – helps to find optimal p,d,q values
BIC – Bayesian Information Criterion (BIC) – alternative to AIC
”’
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(air_passengers[‘#Passengers’].diff().dropna())
plot_pacf(air_passengers[‘#Passengers’].diff().dropna())
plt.show()
”’
How to read above graph:
To find q (MA), we look at the Autocorrelation graph and see where there is a drastic change:
here, its at 1, so q = 1 (or 2 as at 2, it goes to -ve)

To find p (AR) – sharp drop in Partial Autocorrelation graph:
here, its at 1, so p = 1 (or 2 as at 2, it goes to -ve)

for d (I) – we need to try with multiple values
intially we will take as 1

”’
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(air_passengers[‘#Passengers’], order=(1,1,1))
result = model.fit()
plt.plot(air_passengers[‘#Passengers’])
plt.plot(result.fittedvalues)
plt.show()
print(“ARIMA Model Summary”)
print(result.summary())

model = ARIMA(air_passengers[‘#Passengers’], order=(4,1,4))
result = model.fit()
plt.plot(air_passengers[‘#Passengers’])
plt.plot(result.fittedvalues)
plt.show()
print(“ARIMA Model Summary”)
print(result.summary())

# Prediction using ARIMA model
air_passengers[‘Forecasted’] = result.predict(start=120,end=246)
air_passengers[[‘#Passengers’,‘Forecasted’]].plot()
plt.show()

# predict using SARIMAX Model
import statsmodels.api as sm
model = sm.tsa.statespace.SARIMAX(air_passengers[‘#Passengers’],order=(7,1,1), seasonal_order=(1,1,1,12))
result = model.fit()
air_passengers[‘Forecast_SARIMAX’] = result.predict(start=120,end=246)
air_passengers[[‘#Passengers’,‘Forecast_SARIMAX’]].plot()
plt.show()

https://drive.google.com/drive/folders/1Xe3HftLxL1T6HsEBUfjq_zXANjTnr6Cz?usp=drive_link

”’
NLP – Natural Language Processing – analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmetization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
”’
”’
cardboard mousepad. Going worth price! Not bad
”’

link=“https://raw.githubusercontent.com/swapnilsaurav/OnlineRetail/master/order_reviews.csv”
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
”’
[‘review_id’, ‘order_id’, ‘review_score’, ‘review_comment_title’,
‘review_comment_message’, ‘review_creation_date’, ‘review_answer_timestamp’]
”’
df[‘review_creation_date’] = pd.to_datetime(df[‘review_creation_date’])

df[‘review_answer_timestamp’] = pd.to_datetime(df[‘review_answer_timestamp’])

# data preprocessing – making data ready for analysis
reviews_df = df[df[‘review_comment_message’].notnull()].copy()
#print(reviews_df)
”’
Write a function to perform basic preprocessing steps
”’
def basic_preprocessing(text):
txt_pp = text.lower()
print(txt_pp)
#remove accent

# applying basic preprocessing:
reviews_df[‘review_comment_message’] = \
reviews_df[‘review_comment_message’].apply(basic_preprocessing)

”’
NLP – Natural Language Processing – analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmetization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
”’
”’
cardboard mousepad. Going worth price! Not bad
”’

link=“D:/datasets/OnlineRetail/order_reviews.csv”
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
”’
[‘review_id’, ‘order_id’, ‘review_score’, ‘review_comment_title’,
‘review_comment_message’, ‘review_creation_date’, ‘review_answer_timestamp’]
”’
#df[‘review_creation_date’] = pd.to_datetime(df[‘review_creation_date’])
#df[‘review_answer_timestamp’] = pd.to_datetime(df[‘review_answer_timestamp’])

# data preprocessing – making data ready for analysis
reviews_df = df[df[‘review_comment_message’].notnull()].copy()
#print(reviews_df)

# remove accents
def remove_accent(text):
return unicodedata.normalize(‘NFKD’,text).encode(‘ascii’,errors=‘ignore’).decode(‘utf-8’)
#STOP WORDS LIST:
STOP_WORDS = set(remove_accent(w) for w in nltk.corpus.stopwords.words(‘portuguese’))

”’
Write a function to perform basic preprocessing steps
”’
def basic_preprocessing(text):
#converting to lower case
txt_pp = text.lower()
#print(txt_pp)

#remove the accent
#txt_pp = unicodedata.normalize(‘NFKD’,txt_pp).encode(‘ascii’,errors=’ignore’).decode(‘utf-8’)
txt_pp =remove_accent(txt_pp)
#print(txt_pp)
#tokenize
txt_token = nltk.tokenize.word_tokenize(txt_pp)
#print(txt_token)

# removing stop words
txt_token = (w for w in txt_token if w not in STOP_WORDS and w.isalpha())
return txt_token

# applying basic preprocessing:
reviews_df[‘review_comment_words’] = \
reviews_df[‘review_comment_message’].apply(basic_preprocessing)

#get positive reviews – all 5 ratings in review_score
reviews_5 = reviews_df[reviews_df[‘review_score’]==5]

#get negative reviews – all 1 ratings
reviews_1 = reviews_df[reviews_df[‘review_score’]==1]

## write a function to creaet unigram, bigram, trigram
def create_ngrams(words):
unigram,bigrams,trigram = [],[],[]
for comment in words:
unigram.extend(comment)
bigrams.extend(.join(bigram) for bigram in nltk.bigrams(comment))
trigram.extend(‘ ‘.join(trigram) for trigram in nltk.trigrams(comment))
return unigram,bigrams,trigram

#create ngrams for rating 5 and rating 1
uni_5, bi_5, tri_5 = create_ngrams(reviews_5[‘review_comment_words’])
print(uni_5)
print(‘””””””””””””””””””‘)
print(bi_5)
print(” =========================================”)
print(tri_5)

uni_1, bi_1, tri_1 = create_ngrams(reviews_1[‘review_comment_words’])
#print(uni_5)

# distribution plot
def plot_dist(words, color):
nltk.FreqDist(words).plot()

”’
NLP – Natural Language Processing – analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmetization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
”’
”’
cardboard mousepad. Going worth price! Not bad
”’

link = "D:/datasets/OnlineRetail/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
”’
[‘review_id’, ‘order_id’, ‘review_score’, ‘review_comment_title’,
‘review_comment_message’, ‘review_creation_date’, ‘review_answer_timestamp’]
”’
#df[‘review_creation_date’] = pd.to_datetime(df[‘review_creation_date’])
#df[‘review_answer_timestamp’] = pd.to_datetime(df[‘review_answer_timestamp’])

# data preprocessing – making data ready for analysis
reviews_df = df[df[‘review_comment_message’].notnull()].copy()
#print(reviews_df)

# remove accents
def remove_accent(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', errors='ignore').decode('utf-8')

# STOP WORDS LIST (accents removed so they match the normalized tokens):
STOP_WORDS = set(remove_accent(w) for w in nltk.corpus.stopwords.words('portuguese'))

”’
Write a function to perform basic preprocessing steps
”’
def basic_preprocessing(text):
#converting to lower case
txt_pp = text.lower()
#print(txt_pp)

#remove the accent
#txt_pp = unicodedata.normalize(‘NFKD’,txt_pp).encode(‘ascii’,errors=’ignore’).decode(‘utf-8’)
txt_pp =remove_accent(txt_pp)
#print(txt_pp)
#tokenize
txt_token = nltk.tokenize.word_tokenize(txt_pp)
#print(txt_token)

# removing stop words
txt_token = tuple(w for w in txt_token if w not in STOP_WORDS and w.isalpha())
return txt_token



## write a function to create unigrams, bigrams, trigrams
def create_ngrams(words):
    unigrams, bigrams, trigrams = [], [], []
    for comment in words:
        unigrams.extend(comment)
        bigrams.extend(' '.join(bigram) for bigram in nltk.bigrams(comment))
        trigrams.extend(' '.join(trigram) for trigram in nltk.trigrams(comment))
    return unigrams, bigrams, trigrams
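'''
For intuition: nltk.bigrams / nltk.trigrams just slide a window over the token
sequence. A minimal sketch with a made-up token tuple (hypothetical values):
'''
demo_tokens = ('produto', 'muito', 'bom')  # hypothetical preprocessed tokens
print(list(nltk.bigrams(demo_tokens)))   # [('produto', 'muito'), ('muito', 'bom')]
print(list(nltk.trigrams(demo_tokens)))  # [('produto', 'muito', 'bom')]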


# applying basic preprocessing:
reviews_df['review_comment_words'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)

#get positive reviews – all 5 ratings in review_score
reviews_5 = reviews_df[reviews_df['review_score'] == 5]

#get negative reviews – all 1 ratings
reviews_1 = reviews_df[reviews_df['review_score'] == 1]

#create ngrams for rating 5 and rating 1
uni_5, bi_5, tri_5 = create_ngrams(reviews_5['review_comment_words'])
print(uni_5)
print(bi_5)
print(tri_5)

# Assignment: perform similar tasks for reviews that are negative (review score = 1)
#uni_1, bi_1, tri_1 = create_ngrams(reviews_1['review_comment_words'])
#print(uni_1)

# distribution plot of the 20 most frequent n-grams
def plot_dist(words, color):
    nltk.FreqDist(words).plot(20, cumulative=False, color=color)

plot_dist(tri_5, "red")

#NLP – Natural Language processing:
# sentiments: Positive, Neutral, Negative
#
”’
we will use nltk library for NLP:
pip install nltk
”’
import nltk
#1. Convert into lowercase
text = "Product is great but I amn't liking the colors as they are worst"
text = text.lower()

”’
2. Tokenize the content: break it into words or sentences
”’
text1 = text.split()
#using nltk
from nltk.tokenize import sent_tokenize,word_tokenize
text = word_tokenize(text)
#print(“Text =\n”,text)
#print(“Text =\n”,text1)

”’
3. Removing Stop words: Words which are not significant
for your analysis. E.g. an, a, the, is, are
”’
my_stopwords = ['is', 'i', 'the']
text1 = text
# note: removing items from a list while iterating over it can skip elements;
# kept here as in the course, since it works for this short example
for w in text1:
    if w in my_stopwords:
        text.remove(w)
print("Text after my stopwords:", text1)

nltk.download("stopwords")
from nltk.corpus import stopwords
nltk_eng_stopwords = set(stopwords.words("english"))
#print(“NLTK list of stop words in English: “,nltk_eng_stopwords)
”’
Just for example: we see the word but in the STOP WORDS but
we want to include it, then we need to remove the word from the set
”’
# removing but from the NLTK stop words
nltk_eng_stopwords.remove(‘but’)

for w in text:
    if w in nltk_eng_stopwords:
        text.remove(w)
print("Text after NLTK stopwords:", text)

”’
4. Stemming: changing the word to its root
eg: {help: [help, helped, helping, helper]}

One of the method is Porter stemmer
”’
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
text = [stemmer.stem(w) 
for in text]
”’ above line is like below:
t_list=[]
for w in text:
    a = stemmer.stem(w)
    t_list.append(a)
”’
print(“Text after Stemming:”,text)
”’
5. Part of Speech Tagging (POS Tagging)
grammatical word which deals with the roles they place
like – 8 parts of speeches – noun, verb, …

Reference: https://www.educba.com/nltk-pos-tag/
POS Tagging will give Tags like

CC: It is the conjunction of coordinating
CD: It is a digit of cardinal
DT: It is the determiner
EX: Existential
FW: It is a foreign word
IN: Preposition and conjunction
JJ: Adjective
JJR and JJS: Adjective and superlative
LS: List marker
MD: Modal
NN: Singular noun
NNS, NNP, NNPS: Proper and plural noun
PDT: Predeterminer
WRB: Adverb of wh
WP$: Possessive wh
WP: Pronoun of wh
WDT: Determiner of wp
VBZ: Verb
VBP, VBN, VBG, VBD, VB: Forms of verbs
UH: Interjection
TO: To go
RP: Particle
RBS, RB, RBR: Adverb
PRP, PRP$: Pronoun personal and professional

But to perform this, we need to download any one tagger:
e.g. averaged_perceptron_tagger
nltk.download(‘averaged_perceptron_tagger’)
”’
nltk.download('averaged_perceptron_tagger')

import nltk
from nltk.tag import DefaultTagger
py_tag = DefaultTagger('NN')  # baseline tagger: tags every token as 'NN'
tag_eg1 = py_tag.tag(['Example', 'tag'])
print(tag_eg1)

#txt = “Example of nltk pos tag list”
#txt = [‘product’, ‘great’, ‘but’, “not”, ‘like’, ‘color’]
#txt = word_tokenize(txt)
#txt = [‘Example’,’of’,’nltk’,’pos’,’tag’,’list’]
pos_txt = nltk.pos_tag(text)
print("POS Tagging:", pos_txt)

”’
6. Lemmetising
takes a word to its core meaning
We need to download:  wordnet
”’
nltk.download(‘wordnet’)
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(“Very good = “,lemmatizer.lemmatize(“very good”))
print(“Halves = “,lemmatizer.lemmatize(“halves”))

text = "Product is great but I amn't liking the colors as they are worst"
text = word_tokenize(text)
text = [lemmatizer.lemmatize(w) for w in text]
print("Text after Lemmatizer: ", text)


# Sentiment analysis – read the sentiments of each sentence
”’
If you need more data for your analysis, this is a good source:
https://github.com/pycaret/pycaret/tree/master/datasets

We will use Amazon.csv for this program

”’
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

link = "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv"
df = pd.read_csv(link)
print(df)

# Let's create a function to perform all the preprocessing steps
# of an NLP analysis
def preprocess_nlp(text):
    text = text.lower()            # lowercase
    text = word_tokenize(text)     # tokenize into words
    text = [w for w in text if w not in stopwords.words("english")]  # remove stop words
    # lemmatize each word
    lemm = WordNetLemmatizer()
    text = [lemm.lemmatize(w) for w in text]
    # now join all the words back, as we are predicting on each line of text
    text_out = ' '.join(text)
    return text_out

# import Resource vader_lexicon
import nltk
nltk.download('vader_lexicon')

df['reviewText'] = df['reviewText'].apply(preprocess_nlp)
print(df)

# NLTK Sentiment Analyzer
# we will now define a function get_sentiment() which will return
# 1 for positive and 0 for non-positive
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    sentiment = 1 if score['pos'] > 0 else 0
    return sentiment
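'''
Note: get_sentiment() above calls a review positive whenever the 'pos' score is
non-zero. VADER also returns a 'compound' score in [-1, 1]; a common convention
(an alternative sketch, not what this course uses) is compound >= 0.05:
'''
def get_sentiment_compound(text, threshold=0.05):
    # 1 when the aggregate compound score crosses the threshold, else 0
    return 1 if analyzer.polarity_scores(text)['compound'] >= threshold else 0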

df[‘sentiment’] = df[‘reviewText’].apply(get_sentiment)

print("Dataframe after analyzing the sentiments: \n", df)

#confusion matrix
from sklearn.metrics import confusion_matrix
print("Confusion matrix:\n", confusion_matrix(df['Positive'], df['sentiment']))

''' RESULT

Confusion matrix:
[[ 1131  3636]
 [  576 14657]]
Accuracy: (1131 + 14657) / (1131 + 14657 + 576 + 3636) = 15788/20000 = 78.94%
'''
# Visualization
import matplotlib.pyplot as plt
import numpy as np
data = np.random.randn(1000)
plt.hist(data, bins=30, histtype='stepfilled', color="red")
plt.title("Histogram Display")
plt.xlabel("Marks")
plt.ylabel("Number of Students")
plt.show()
# Analyzing Hotel Bookings data
# https://github.com/swapnilsaurav/Dataset/blob/master/hotel_bookings.csv
link = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/hotel_bookings.csv"
import pandas as pd
df = pd.read_csv(link)
#print("Shape of the data: ", df.shape)
#print("Data types of the columns:", df.dtypes)
import numpy as np
df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values
print("Numeric column names: ", numeric_cols)
df_nonnumeric = df.select_dtypes(exclude=[np.number])
nonnumeric_cols = df_nonnumeric.columns.values
print("Non Numeric column names: ", nonnumeric_cols)

####
# preprocessing the data: visualize the missing values as a heatmap
import seaborn as sns
import matplotlib.pyplot as plt
colors = ["#091AEA", "#EA5E09"]
cols = df.columns
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colors))
plt.show()

cols_to_drop = []
for col in cols:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        #print(f"{col} -> {pct_miss}")
        cols_to_drop.append(col)  # column list to drop

# remove columns with more than 80% missing values
df = df.drop(cols_to_drop, axis=1)

for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 80:
        print(f"{col} -> {pct_miss}")
    # check the rows for missing values
    missing = df[col].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:
        df[f'{col}_ismissing'] = missing
        #print(f"Created Missing Indicator for {col}")

### keeping track of the missing values
ismissing_cols = [col for col in df.columns if '_ismissing' in col]
df['num_missing'] = df[ismissing_cols].sum(axis=1)
print(df['num_missing'])

# drop rows with > 12 missing values
ind_missing = df[df['num_missing'] > 12].index
df = df.drop(ind_missing, axis=0)  # ROWS DROPPED

# count the missing values that remain
for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        print(f"{col} -> {pct_miss}")

'''
Still we are left with the following missing values:
children -> 2.0498257606219004 # numeric
babies -> 11.311318858061922 # numeric
meal -> 11.467129071170085 # non-numeric
country -> 0.40879238707947996 # non-numeric
deposit_type -> 8.232810615199035 # non-numeric
agent -> 13.687005763302507 # numeric
'''
# HANDLING NUMERIC MISSING VALUES: replace with the median
df_numeric = df.select_dtypes(include=[np.number])
for col in df_numeric.columns.values:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        med = df[col].median()
        df[col] = df[col].fillna(med)

# HANDLING NON-NUMERIC MISSING VALUES: replace with the mode (most frequent value)
df_nonnumeric = df.select_dtypes(exclude=[np.number])
for col in df_nonnumeric.columns.values:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        mode = df[col].describe()['top']
        df[col] = df[col].fillna(mode)

print("#count for missing values")
for col in df.columns:
    pct_miss = np.mean(df[col].isnull()) * 100
    if pct_miss > 0:
        print(f"{col} -> {pct_miss}")

# drop duplicate rows ('id' is dropped first so otherwise-identical bookings match)
print("Shape before dropping duplicates: ", df.shape)
df = df.drop('id', axis=1).drop_duplicates()
print("Shape after dropping duplicates: ", df.shape)

JUNE 2023 Data Science Course

THIS IS THE 4th and the last part of the complete Data Science with Python course started 3 months ago!!

”’
NLP – Natural Language Processing – analysing review comment to understand
reasons for positive and negative ratings.
concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. decompose (non unicode to unicode)
3. removing accent: encode the content to ascii values
4. tokenization: will break sentence to words
5. Stop words: not important words for analysis
6. Lemmetization (done only on English words): convert the words into dictionary words
7. N-grams: set of one word (unigram), two words (bigram), three words (trigrams)
8. Plot the graph based on the number of occurrences and Evaluate
”’
”’
cardboard mousepad. Going worth price! Not bad
”’

link=“D:/datasets/OnlineRetail/order_reviews.csv”
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
”’
[‘review_id’, ‘order_id’, ‘review_score’, ‘review_comment_title’,
‘review_comment_message’, ‘review_creation_date’, ‘review_answer_timestamp’]
”’
#df[‘review_creation_date’] = pd.to_datetime(df[‘review_creation_date’])
#df[‘review_answer_timestamp’] = pd.to_datetime(df[‘review_answer_timestamp’])

# data preprocessing – making data ready for analysis
reviews_df = df[df[‘review_comment_message’].notnull()].copy()
#print(reviews_df)

# remove accents
def remove_accent(text):
return unicodedata.normalize(‘NFKD’,text).encode(‘ascii’,errors=‘ignore’).decode(‘utf-8’)
#STOP WORDS LIST:
STOP_WORDS = set(remove_accent(w) for w in nltk.corpus.stopwords.words(‘portuguese’))

”’
Write a function to perform basic preprocessing steps
”’
def basic_preprocessing(text):
#converting to lower case
txt_pp = text.lower()
#print(txt_pp)

#remove the accent
#txt_pp = unicodedata.normalize(‘NFKD’,txt_pp).encode(‘ascii’,errors=’ignore’).decode(‘utf-8’)
txt_pp =remove_accent(txt_pp)
#print(txt_pp)
#tokenize
txt_token = nltk.tokenize.word_tokenize(txt_pp)
#print(txt_token)

# removing stop words
txt_token = tuple(w for w in txt_token if w not in STOP_WORDS and w.isalpha())
return txt_token



## write a function to creaet unigram, bigram, trigram
def create_ngrams(words):
unigrams,bigrams,trigrams = [],[],[]
for comment in words:
unigrams.extend(comment)
bigrams.extend(‘ ‘.join(bigram) for bigram in nltk.bigrams(comment))
trigrams.extend(‘ ‘.join(trigram) for trigram in nltk.trigrams(comment))


return unigrams, bigrams, trigrams


# applying basic preprocessing:
reviews_df[‘review_comment_words’] = \
reviews_df[‘review_comment_message’].apply(basic_preprocessing)

#get positive reviews – all 5 ratings in review_score
reviews_5 = reviews_df[reviews_df[‘review_score’]==5]

#get negative reviews – all 1 ratings
reviews_1 = reviews_df[reviews_df[‘review_score’]==1]
#create ngrams for rating 5 and rating 1
uni_5, bi_5, tri_5 = create_ngrams(reviews_5[‘review_comment_words’])
print(uni_5)
print(bi_5)
print(tri_5)

# Assignment: perform similar tasks for reviews that are negative (review score = 1)
#uni_1, bi_1, tri_1 = create_ngrams(reviews_1[‘review_comment_words’])
#print(uni_5)

# distribution plot
def plot_dist(words, color):
nltk.FreqDist(words).plot(20,cumulative=False, color=color)

plot_dist(tri_5, “red”)

#NLP – Natural Language processing:
# sentiments: Positive, Neutral, Negative
#
”’
we will use nltk library for NLP:
pip install nltk
”’
import nltk
#1. Convert into lowercase
text = “Product is great but I amn’t liking the colors as they are worst”
text = text.lower()

”’
2. Tokenize the content: break it into words or sentences
”’
text1 = text.split()
#using nltk
from nltk.tokenize import sent_tokenize,word_tokenize
text = word_tokenize(text)
#print(“Text =\n”,text)
#print(“Text =\n”,text1)

”’
3. Removing Stop words: Words which are not significant
for your analysis. E.g. an, a, the, is, are
”’
my_stopwords = [‘is’,‘i’,‘the’]
text1 = text
for w in text1:
   
if w in my_stopwords:
        text.remove(w)
print(“Text after my stopwords:”,text1)

nltk.download(
“stopwords”)
from nltk.corpus import stopwords
nltk_eng_stopwords =
set(stopwords.words(“english”))
#print(“NLTK list of stop words in English: “,nltk_eng_stopwords)
”’
Just for example: we see the word but in the STOP WORDS but
we want to include it, then we need to remove the word from the set
”’
# removing but from the NLTK stop words
nltk_eng_stopwords.remove(‘but’)

for w in text:
   
if w in nltk_eng_stopwords:
        text.remove(w)
print(“Text after NLTK stopwords:”,text)

”’
4. Stemming: changing the word to its root
eg: {help: [help, helped, helping, helper]}

One of the method is Porter stemmer
”’
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
text = [stemmer.stem(w)
for w in text]
”’ above line is like below:
t_list=[]
for w in text:
    a = stemmer.stem(w)
    t_list.append(a)
”’
print(“Text after Stemming:”,text)
”’
5. Part of Speech Tagging (POS Tagging)
grammatical word which deals with the roles they place
like – 8 parts of speeches – noun, verb, …

Reference: https://www.educba.com/nltk-pos-tag/
POS Tagging will give Tags like

CC: It is the conjunction of coordinating
CD: It is a digit of cardinal
DT: It is the determiner
EX: Existential
FW: It is a foreign word
IN: Preposition and conjunction
JJ: Adjective
JJR and JJS: Adjective and superlative
LS: List marker
MD: Modal
NN: Singular noun
NNS, NNP, NNPS: Proper and plural noun
PDT: Predeterminer
WRB: Adverb of wh
WP$: Possessive wh
WP: Pronoun of wh
WDT: Determiner of wp
VBZ: Verb
VBP, VBN, VBG, VBD, VB: Forms of verbs
UH: Interjection
TO: To go
RP: Particle
RBS, RB, RBR: Adverb
PRP, PRP$: Pronoun personal and professional

But to perform this, we need to download any one tagger:
e.g. averaged_perceptron_tagger
nltk.download(‘averaged_perceptron_tagger’)
”’
nltk.download(‘averaged_perceptron_tagger’)

import nltk
from nltk.tag import DefaultTagger
py_tag = DefaultTagger (
‘NN’)
tag_eg1 = py_tag.tag ([
‘Example’, ‘tag’])
print(tag_eg1)

#txt = “Example of nltk pos tag list”
#txt = [‘product’, ‘great’, ‘but’, “not”, ‘like’, ‘color’]
#txt = word_tokenize(txt)
#txt = [‘Example’,’of’,’nltk’,’pos’,’tag’,’list’]
pos_txt = nltk.pos_tag(text)
print(“POS Tagging:”, pos_txt)

”’
6. Lemmetising
takes a word to its core meaning
We need to download:  wordnet
”’
nltk.download(‘wordnet’)
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(“Very good = “,lemmatizer.lemmatize(“very good”))
print(“Halves = “,lemmatizer.lemmatize(“halves”))

text =
“Product is great but I amn’t liking the colors as they are worst”
text = word_tokenize(text)
text = [lemmatizer.lemmatize(w)
for w in text]
print(“Text after Lemmatizer: “,text)


# Sentiment analysis – read the sentiments of each sentence
”’
If you need more data for your analysis, this is a good source:
https://github.com/pycaret/pycaret/tree/master/datasets

We will use Amazon.csv for this program

”’
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

link = “https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv”
df = pd.read_csv(link)
print(df)

#Let’s create a function to perform all the preprocessing steps
# of a nlp analysis
def preprocess_nlp(text):
#tokenise
#print(“0”)
text = text.lower() #lowercase
#print(“1”)
text = word_tokenize(text) #tokenize
#print(“2”)
text = [w for w in text if w not in stopwords.words(“english”)]
#lemmatize
#print(“3”)
lemm = WordNetLemmatizer()
#print(“4”)
text = [lemm.lemmatize(w) for w in text]
#print(“5”)
# now join all the words as we are predicting on each line of text
text_out = ‘ ‘.join(text)
#print(“6”)
return text_out

# import Resource vader_lexicon
import nltk
nltk.download(‘vader_lexicon’)


df[‘reviewText’] = df[‘reviewText’].apply(preprocess_nlp)
print(df)

# NLTK Sentiment Analyzer
# we will now define a function get_sentiment() which will return
# 1 for positive and 0 for non-positive
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text):
score = analyzer.polarity_scores(text)
sentiment = 1 if score[‘pos’] > 0 else 0
return sentiment

df[‘sentiment’] = df[‘reviewText’].apply(get_sentiment)

print(“Dataframe after analyzing the sentiments: \n,df)

#confusion matrix
from sklearn.metrics import confusion_matrix
print(“Confusion matrix:\n,confusion_matrix(df[‘Positive’],df[‘sentiment’]))

”’ RESULT

Confusion matrix:
[[ 1131 3636]
[ 576 14657]]
Accuracy: (1131 + 14657) / (1131 + 14657 + 576 + 3636) = 15788/20000 = 78.94%
”’

# Visualization
import matplotlib.pyplot as plt
import numpy as np
data = np.random.randn(1000)
plt.hist(data, bins=30, histtype=‘stepfilled’, color=“red”)
plt.title(“Histogram Display”)
plt.xlabel(“Marks”)
plt.ylabel(“Number of Students”)
plt.show()
# Analyzing Hotel Bookings data
# https://github.com/swapnilsaurav/Dataset/blob/master/hotel_bookings.csv
link=“https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/hotel_bookings.csv”
import pandas as pd
df = pd.read_csv(link)
#print(“Shape of the data: “,df.shape)
#print(“Data types of the columns:”,df.dtypes)
import numpy as np
df_numeric = df.select_dtypes(include=[np.number])
#print(df_numeric)
numeric_cols = df_numeric.columns.values
#print(“Numeric column names: “,numeric_cols)
df_nonnumeric = df.select_dtypes(exclude=[np.number])
#print(df_nonnumeric)
nonnumeric_cols = df_nonnumeric.columns.values
#print(“Non Numeric column names: “,nonnumeric_cols)

####
#preprocessing the data
import seaborn as sns
import matplotlib.pyplot as plt
colors = [“#091AEA”,“#EA5E09”]
cols = df.columns
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colors))
plt.show()

cols_to_drop = []
for col in cols:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss >80:
#print(f”{col} -> {pct_miss}”)
cols_to_drop.append(col) #column list to drop

# remove column since it has more than 80% missing value
df = df.drop(cols_to_drop, axis=1)

for col in df.columns:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss >80:
print(f”{col} -> {pct_miss})
# check for rows to see the missing values
missing = df[col].isnull()
num_missing = np.sum(missing)
if num_missing >0:
df[f’{col}_ismissing’] = missing
print(f”Created Missing Indicator for {cols})

### keeping track of the missing values
ismissing_cols = [col for col in df.columns if ‘_ismissing’ in col]
df[‘num_missing’] = df[ismissing_cols].sum(axis=1)
print(df[‘num_missing’])

# drop rows with > 12 missing values
ind_missing = df[df[‘num_missing’] > 12].index
df = df.drop(ind_missing,axis=0) # ROWS DROPPED

#count for missing values
for col in df.columns:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss >0:
print(f”{col} -> {pct_miss})

”’
Still we are left with following missing values:
children -> 2.0498257606219004
babies -> 11.311318858061922
meal -> 11.467129071170085
country -> 0.40879238707947996
deposit_type -> 8.232810615199035
agent -> 13.687005763302507
”’
# Analyzing Hotel Bookings data
# https://github.com/swapnilsaurav/Dataset/blob/master/hotel_bookings.csv
link=“https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/hotel_bookings.csv”
import pandas as pd
df = pd.read_csv(link)
#print(“Shape of the data: “,df.shape)
#print(“Data types of the columns:”,df.dtypes)
import numpy as np
df_numeric = df.select_dtypes(include=[np.number])
#print(df_numeric)
numeric_cols = df_numeric.columns.values
print(“Numeric column names: “,numeric_cols)
df_nonnumeric = df.select_dtypes(exclude=[np.number])
#print(df_nonnumeric)
nonnumeric_cols = df_nonnumeric.columns.values
print(“Non Numeric column names: “,nonnumeric_cols)

####
#preprocessing the data
import seaborn as sns
import matplotlib.pyplot as plt
colors = [“#091AEA”,“#EA5E09”]
cols = df.columns
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colors))
plt.show()

cols_to_drop = []
for col in cols:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss >80:
#print(f”{col} -> {pct_miss}”)
cols_to_drop.append(col) #column list to drop

# remove column since it has more than 80% missing value
df = df.drop(cols_to_drop, axis=1)

for col in df.columns:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss >80:
print(f”{col} -> {pct_miss})
# check for rows to see the missing values
missing = df[col].isnull()
num_missing = np.sum(missing)
if num_missing >0:
df[f’{col}_ismissing’] = missing
#print(f”Created Missing Indicator for {cols}”)

### keeping track of the missing values
ismissing_cols = [col for col in df.columns if ‘_ismissing’ in col]
df[‘num_missing’] = df[ismissing_cols].sum(axis=1)
print(df[‘num_missing’])

# drop rows with > 12 missing values
ind_missing = df[df[‘num_missing’] > 12].index
df = df.drop(ind_missing,axis=0) # ROWS DROPPED

#count for missing values
for col in df.columns:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss >0:
print(f”{col} -> {pct_miss})

”’
Still we are left with following missing values:
children -> 2.0498257606219004 # numeric
babies -> 11.311318858061922 #numeric
meal -> 11.467129071170085 # non-numeric
country -> 0.40879238707947996 # non-numeric
deposit_type -> 8.232810615199035 # non-numeric
agent -> 13.687005763302507 #numeric
”’
#HANDLING NUMERIC MISSING VALUES
df_numeric = df.select_dtypes(include=[np.number])
for col in df_numeric.columns.values:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss > 0:
med = df[col].median()
df[col] = df[col].fillna(med)

#HANDLING non-NUMERIC MISSING VALUES
df_nonnumeric = df.select_dtypes(exclude=[np.number])
for col in df_nonnumeric.columns.values:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss > 0:
mode = df[col].describe()[‘top’]
df[col] = df[col].fillna(mode)


print(“#count for missing values”)
for col in df.columns:
pct_miss = np.mean(df[col].isnull()) * 100
if pct_miss >0:
print(f”{col} -> {pct_miss})

#drop duplicate values
print(“Shape before dropping duplicates: “,df.shape)
df = df.drop(‘id’,axis=1).drop_duplicates()
print(“Shape after dropping duplicates: “,df.shape)

DAY 73: Power BI (Coming Soon)

DAY 74: Tableau (Coming soon)

That's the end of the course - the entire content is presented in 4 blog pages
Data Science Tutorial Aug 2023

MACHINE LEARNING TUTORIAL

## https://github.com/swapnilsaurav/MachineLearning

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv"
import numpy as np
import pandas as pd

df = pd.read_csv(link)
#print(df)

X =df.iloc[:,:-1].values
y =df.iloc[:,-1].values
#print(X)
#print(y)


## 1 Handling missing values:
# a) lot of values in the rows / column – drop/delete them
# b) replace with median / mean – numeric & mode – in case of categorical
## Scikit learn package: This package does everything for machine learning
## pip install scikit-learn
# class SimpleImputer belonging to sklearn (scikit-learn) we will use to replace missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
#print(X)

## 2 Handling categorical values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [0])], remainder='passthrough')
# remainder indicates what should happen to the columns that are not transformed;
# we are saying: let them be as they are - 'passthrough'
X = transform.fit_transform(X)
y = lc.fit_transform(y)  # y will not have a ColumnTransformer
X = X[:, 1:]  # drop one dummy column
#print(X)

## 3 Handling Outliers
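'''
The outliers step is left empty above; a common approach is the IQR rule: flag
values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. A minimal sketch on the last
column of X (Salary, assuming the column layout produced above):
'''
col = X[:, -1].astype(float)            # hypothetical choice of column
q1, q3 = np.percentile(col, [25, 75])
iqr = q3 - q1
outlier_mask = (col < q1 - 1.5 * iqr) | (col > q3 + 1.5 * iqr)
print("Rows flagged as outliers:", outlier_mask.sum())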

## 4 Creating Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)


## 5 Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # reuse the means/stds learnt from the training set
print(X_train, y_train, X_test, y_test)

# K-Fold Cross Validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
for train, test in kf.split(X):
    print("============")
    print(train, "\n", test)
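'''
A sketch of how the folds are typically used: fit a model on each training
split and score it on the held-out split; cross_val_score does this in one
call (LogisticRegression chosen here since y is the encoded class label):
'''
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
scores = cross_val_score(LogisticRegression(max_iter=1000), X.astype(float), y, cv=kf)
print("Accuracy per fold:", scores, "mean:", scores.mean())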
#### MODEL



import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("D:/datasets/2_Marks_Data.csv")
print("Columns are:", df.columns)
print("Shape: ", df.shape)
print("Top 3 rows are: \n", df.head(3))

print("Describe :\n", df.describe())
plt.scatter(df["Hours"], df["Marks"])
plt.show()

X = df.iloc[:,:1].values
y = df.iloc[:,1].values
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=1)

# running the algorithm- Simple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

#train the data
regressor.fit(X_train, y_train)
#
#above model will give the line in the form mx + c
#output values
print("Coefficient (m / slope): ", regressor.coef_)
print("Intercept (c / constant): ", regressor.intercept_)
”’
Coefficient (m / slope): [7.47852733]
Intercept (c / constant): 20.591719221277778
Y = 7.5 * X + 20.6
”’
y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# RMSE:
# Root (Square root)
# Mean (take the mean: sum of the square /n)
# Squared (square the difference)
# Error (difference)

from sklearn import metrics
mse = metrics.mean_squared_error(y_test, y_pred)  # like variance
rmse = mse**0.5  # like std dev
print("RMSE = ", rmse)
”’
Regression: Output (Marks) is a continous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that dataset is linear) Regression
X – independent variable(s)
Y – dependent variable
”’
import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv"
df = pd.read_csv(link)
X = df.iloc[:,:1].values
y = df.iloc[:,1].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)
”’

# EDA – Exploratory Data Analysis
plt.scatter(x=df['Hours'], y=df['Marks'])
plt.show()
'''
Scatter plots show the relationship between the X and Y variables. You can have:
1. Positive correlation
2. Negative correlation
3. No correlation
4. Correlation values range from 0 to +/- 1
5. Values from 0 to about +/- 0.5: weak or no correlation
6. A strong correlation value will be closer to +/- 1
7. Equation of a straight line => y = mx + c
'''
# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’

## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))
## Bias v Variance
import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:, 3] = lc.fit_transform(X[:, 3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:, 1:]  # dropped one dummy column
print(X)


# EDA – Exploratory Data Analysis
plt.scatter(x=df['Administration'], y=df['Profit'])
plt.show()
plt.scatter(x=df['R&D Spend'], y=df['Profit'])
plt.show()
plt.scatter(x=df['Marketing Spend'], y=df['Profit'])
plt.show()

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’


## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = -3791.2 x Florida -3090.1 x California + 0.82 R&D – 0.05 Admin + 0.022 Marketing+ 56650


y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

”’
Case 1: All the columns are taken into account:
Mean Absolute Error = 8696.887641252619
R Square is (Variance) 0.884599945166969
Root Mean Squared Error (Bias) = 7562.5657508560125
R Square is (Bias) 0.9624157828452926
”’
## Testing the significance of each X column with statsmodels OLS

import statsmodels.api as sm
import numpy as np
X = np.array(X, dtype=float)
print("Y:\n", y)
summ1 = sm.OLS(y, X).fit().summary()
print("Summary of All X \n----------------\n:", summ1)
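'''
Backward elimination, as a sketch: repeatedly drop the column with the highest
p-value and refit, until every remaining column is significant (p < 0.05).
Assumes the float matrix X and target y prepared above:
'''
cols = list(range(X.shape[1]))
while len(cols) > 1:
    pvals = sm.OLS(y, X[:, cols]).fit().pvalues
    worst = pvals.argmax()
    if pvals[worst] < 0.05:
        break
    cols.pop(worst)  # drop the least significant column and refit
print("Columns kept after backward elimination:", cols)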
import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:, 3] = lc.fit_transform(X[:, 3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:, 1:]  # dropped one dummy column
print(X)

”’
After doing Backward elemination method we realized that all the state columns
are not significantly impacting the analysis hence removing those 2 columns too.
”’
X = X[:,2:] # after backward elemination

# EDA – Exploratory Data Analysis
plt.scatter(x=df['Administration'], y=df['Profit'])
plt.show()
plt.scatter(x=df['R&D Spend'], y=df['Profit'])
plt.show()
plt.scatter(x=df['Marketing Spend'], y=df['Profit'])
plt.show()

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’


## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = -3791.2 x Florida -3090.1 x California + 0.82 R&D – 0.05 Admin + 0.022 Marketing+ 56650


y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
from sklearn import metrics
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

”’
Case 1: All the columns are taken into account:
Mean Absolute Error = 8696.887641252619
R Square is (Variance) 0.884599945166969
Root Mean Squared Error (Bias) = 7562.5657508560125
R Square is (Bias) 0.9624157828452926
”’
## Testing

import statsmodels.api as sm
import numpy as np
X = np.array(X, dtype=float)
#X = X[:,[2,3,4]]
print("Y:\n", y)
summ1 = sm.OLS(y, X).fit().summary()
print("Summary of All X \n----------------\n:", summ1)

## Tests for linearity
# 1. All features (X) should be correlated with Y
# 2. Multicollinearity: within X there should not be any correlation;
#    if there is, take any one of the correlated columns for the analysis
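'''
Multicollinearity (point 2) is often quantified with the Variance Inflation
Factor (VIF): a value above roughly 5-10 suggests the column is largely
explained by the other columns. A sketch with statsmodels, applied to the
float matrix X from the preceding script:
'''
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
X_f = np.array(X, dtype=float)
for i in range(X_f.shape[1]):
    print(f"VIF for column {i}: {variance_inflation_factor(X_f, i):.2f}")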
import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/4_Position_Salaries.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,1:2].values
y = df.iloc[:,2].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’
”’
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,3] = lc.fit_transform(X[:,3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[3])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
print(X)

”’
”’
After doing Backward elemination method we realized that all the state columns
are not significantly impacting the analysis hence removing those 2 columns too.

X = X[:,2:] # after backward elemination
”’
”’
# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.show()
”’

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=100)
print(X_train)

from sklearn.linear_model import LinearRegression
from sklearn import metrics
”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’
”’
#Since dataset is too small, lets take entire data for training
X_train, y_train = X,y
X_test, y_test = X,y
”’
”’
## RUN THE MODEL

regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}”)

# y =
y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output

mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print(“Root Mean Squared Error (Variance) = “,mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(“Mean Absolute Error = “,mae)
print(“R Square is (Variance)”,metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print(“Root Mean Squared Error (Bias) = “,mse**0.5)
print(“R Square is (Bias)”,metrics.r2_score(y_train, y_pred_tr))

# Plotting the data for output
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.plot(X,y_pred)
plt.xlabel(“Level”)
plt.ylabel(“Salary”)
plt.show()
”’

# 3. Model – Polynomial regression analysis
# y = c + m1*x + m2*x^2 + ... (degree controls the highest power)
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

for i in range(1, 10):
    # prepare the pipeline: polynomial feature expansion, then linear regression
    parameters = [('polynomial', PolynomialFeatures(degree=i)), ('modal', LinearRegression())]
    pipe = Pipeline(parameters)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X)

    ## Bias is based on training data
    y_pred_tr = pipe.predict(X_train)
    mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
    rmse_tr = mse ** 0.5
    print("Root Mean Squared Error (Bias) = ", rmse_tr)
    print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))

    ## Variance is based on validation data
    y_pred_tt = pipe.predict(X_test)
    mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred_tt)
    rmse_tt = mse ** 0.5
    print("Root Mean Squared Error (Variance) = ", rmse_tt)
    print("R Square is (Variance)", metrics.r2_score(y_test, y_pred_tt))
    print("Difference Between variance and bias = ", rmse_tt - rmse_tr)

    # Plotting the data for output
    plt.scatter(x=df['Level'], y=df['Salary'])
    plt.plot(X, y_pred)
    plt.title("Polynomial Analysis degree = " + str(i))
    plt.xlabel("Level")
    plt.ylabel("Salary")
    plt.show()
import pandas as pd
import matplotlib.pyplot as plt
#link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/4_Position_Salaries.csv"
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
df = pd.read_csv(link)
print(df.describe())
X = df.iloc[:,0:4].values
y = df.iloc[:,4].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
”’

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:, 3] = lc.fit_transform(X[:, 3])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:, 1:]  # dropped one dummy column
print(X)


”’
After doing Backward elemination method we realized that all the state columns
are not significantly impacting the analysis hence removing those 2 columns too.

X = X[:,2:] # after backward elemination
”’
”’
# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Level’],y=df[‘Salary’])
plt.show()
”’

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=100)
print(X_train)

from sklearn.linear_model import LinearRegression
from sklearn import metrics
”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’
”’
#Since dataset is too small, lets take entire data for training
X_train, y_train = X,y
X_test, y_test = X,y
”’

## RUN THE MODEL – Support Vector Machine Regressor (SVR)
from sklearn.svm import SVR
#regressor = SVR(kernel='linear')
#regressor = SVR(kernel='poly', degree=2, C=10)
# Assignment – find the best value for gamma between 0.01 and 1 (e.g. 0.05)
regressor = SVR(kernel="rbf", gamma=0.1, C=10)
# fit – train the model
regressor.fit(X_train, y_train)


y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error (Variance) = ", mse**0.5)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error = ", mae)
print("R Square is (Variance)", metrics.r2_score(y_test, y_pred))

## Bias is based on training data
y_pred_tr = regressor.predict(X_train)
mse = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_tr)
print("Root Mean Squared Error (Bias) = ", mse**0.5)
print("R Square is (Bias)", metrics.r2_score(y_train, y_pred_tr))


# Plotting the data for output
plt.scatter(X_train[:,2],y_pred_tr)
#plt.plot(X_train[:,2],y_pred_tr)
plt.show()
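'''
For the gamma assignment above, a grid search is the usual way to scan the
range 0.01 to 1 systematically; a sketch with 5-fold cross-validation on the
training split (values and step size are illustrative):
'''
from sklearn.model_selection import GridSearchCV
import numpy as np
params = {'gamma': np.arange(0.01, 1.01, 0.05)}
grid = GridSearchCV(SVR(kernel='rbf', C=10), params, cv=5, scoring='r2')
grid.fit(X_train.astype(float), y_train)
print("Best gamma:", grid.best_params_, "best CV R2:", grid.best_score_)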

#Decision Tree & Random Forest
import pandas as pd
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/3_Startups.csv"
link = "D:\\datasets\\3_Startups.csv"
df = pd.read_csv(link)
print(df)

#X = df.iloc[:,:4].values
X = df.iloc[:, :1].values   # using only the R&D Spend column here
y = df.iloc[:, -1].values   # Profit (the last column)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Bagging, Boosting, Ensemble
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

## Assignment: run these algorithms and check the RMSE and R square values for both of them
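'''
A sketch of that assignment: fit each regressor on the same split and compare
RMSE and R square on the test set (metrics used as in the earlier scripts):
'''
from sklearn import metrics
for name, reg in [("DecisionTree", DecisionTreeRegressor()),
                  ("RandomForest", RandomForestRegressor(n_estimators=10))]:
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)
    rmse = metrics.mean_squared_error(y_test, pred) ** 0.5
    print(name, "RMSE =", rmse, "R2 =", metrics.r2_score(y_test, pred))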
# Ridge Lasso ElasticNet
import pandas as pd
link = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/student_scores_multi.csv"
df = pd.read_csv(link)
print(df)
X = df.iloc[:, 0:3].values
y = df.iloc[:, 3].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.85, random_state=100)

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
lr_ridge = Ridge(alpha=0.8)
lr_ridge.fit(X_train, y_train)
y_ridge_pred = lr_ridge.predict(X_test)

from sklearn.metrics import r2_score
r2_ridge_test = r2_score(y_test, y_ridge_pred)

y_ridge_pred_tr = lr_ridge.predict(X_train)
r2_ridge_train = r2_score(y_train, y_ridge_pred_tr)
print(f"Ridge Regression: Train R2 = {r2_ridge_train} and Test R2 = {r2_ridge_test}")
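'''
Lasso and ElasticNet are imported above but never fitted; a sketch completing
the comparison on the same train/test split (alpha values are illustrative):
'''
for name, model in [("Lasso", Lasso(alpha=0.8)),
                    ("ElasticNet", ElasticNet(alpha=0.8, l1_ratio=0.5))]:
    model.fit(X_train, y_train)
    print(f"{name}: Train R2 = {r2_score(y_train, model.predict(X_train))}"
          f" and Test R2 = {r2_score(y_test, model.predict(X_test))}")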
# Classifications algorithm: supervised algo which predicts the class
”’
classifier: algorithm that we develop
model: training and predicting the outcome
features: the input data (columns)
target: class that we need to predict
classification: binary (2 class outcome) or multiclass (more than 2 classes)

Steps to run the model:
1. get the data
2. preprocess the data
3. eda
4. train the model
5. predict the model
6. evaluate the model

”’
#1. Logistic regression
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different ranges of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # apply the training scaler to the test set

## Build the model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome (keep only the two numeric features so we can plot in 2D)
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             cmap=ListedColormap(('red', 'green')))
plt.show()
# 2. Trying other classifiers on the same dataset: SVM (SVC) and KNN
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different ranges of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # apply the training scaler to the test set

## Build the model
'''
## LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
'''
from sklearn.svm import SVC
'''
## Support Vector Machine – Classifier
classifier = SVC(kernel='linear')

classifier = SVC(kernel='rbf', gamma=100, C=100)
'''
from sklearn.neighbors import KNeighborsClassifier
## Refer types of distances:
# https://designrr.page/?id=200944&token=2785938662&type=FP&h=7229

classifier = KNeighborsClassifier(n_neighbors=9, metric='minkowski')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome (again using only the two numeric features)
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             cmap=ListedColormap(('red', 'green')))

# Now we will plot the training data on top of the decision regions
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set == j, 0],
                x_set[y_set == j, 1], color=ListedColormap(("red", "green"))(i),
                label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)
import sklearn.tree

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling as Age and Salary are in different ranges of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # apply the training scaler to the test set

## Build the model
'''
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="gini")
'''
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=39, criterion="gini")
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome (again using only the two numeric features)
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             cmap=ListedColormap(('red', 'green')))

# Now we will plot the training data on top of the decision regions
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set == j, 0],
                x_set[y_set == j, 1], color=ListedColormap(("red", "green"))(i),
                label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)

”’
# Show decision tree created

output = sklearn.tree.export_text(classifier)
print(output)
# visualize the tree
fig = plt.figure(figsize=(40,60))
tree_plot = sklearn.tree.plot_tree(classifier)
plt.show()
”’

”’
In Ensemble Algorithms – we run multiple algorithms to improve the performance
of a given business objective:
1. Boosting: When you run same algorithm – Input varies based on weights
2. Bagging: When you run same algorithm – average of all
3. Stacking: Over different algorithms – average of all
”’
import sklearn.tree

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/5_Ads_Success.csv"
link = "D:\\datasets\\5_Ads_Success.csv"
import pandas as pd
df = pd.read_csv(link)
X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0] )

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=100)

# Scaling, as Age and Salary are in different ranges of values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # transform only; the scaler must be fit on training data alone

## Build the model
'''
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="gini")

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=39, criterion="gini")
'''
from sklearn.ensemble import AdaBoostClassifier
classifier = AdaBoostClassifier(n_estimators=7)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

# visualize the outcome
X_train = X_train[:,1:]
X_test = X_test[:,1:]
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=x_set[:,0].min()-1, stop=x_set[:,0].max()+1, step=0.01),
                     np.arange(start=x_set[:,1].min()-1, stop=x_set[:,1].max()+1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             cmap=ListedColormap(('red', 'green')))

#Now we will plot the training data
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set==j, 0],
                x_set[y_set==j, 1], color=ListedColormap(("red", "green"))(i),
                label=j)
plt.show()

## Model Evaluation using Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
cr = classification_report(y_test, y_pred)
accs = accuracy_score(y_test, y_pred)
print("classification_report: \n", cr)
print("accuracy_score: ", accs)

”’
# Show decision tree created

output = sklearn.tree.export_text(classifier)
print(output)
# visualize the tree
fig = plt.figure(figsize=(40,60))
tree_plot = sklearn.tree.plot_tree(classifier)
plt.show()
”’


from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

X,y = make_blobs(n_samples=300, n_features=3, centers=4)
plt.scatter(X[:,0], X[:,1])
plt.show()

from sklearn.cluster import KMeans
km = KMeans(n_clusters=5, init="random", max_iter=100)
y_cluster = km.fit_predict(X)

plt.scatter(X[y_cluster==0,0], X[y_cluster==0,1], c="blue", label="Cluster A")
plt.scatter(X[y_cluster==1,0], X[y_cluster==1,1], c="red", label="Cluster B")
plt.scatter(X[y_cluster==2,0], X[y_cluster==2,1], c="green", label="Cluster C")
plt.scatter(X[y_cluster==3,0], X[y_cluster==3,1], c="black", label="Cluster D")
plt.scatter(X[y_cluster==4,0], X[y_cluster==4,1], c="orange", label="Cluster E")
plt.legend()
plt.show()

# Elbow method: plot the distortion (inertia) for k = 1..29 and look for the bend
distortion = []
max_centers = 30
for i in range(1, max_centers):
    km = KMeans(n_clusters=i, init="random", max_iter=100)
    km.fit(X)
    distortion.append(km.inertia_)

print("Distortion:\n", distortion)
plt.plot(range(1, max_centers), distortion, marker="o")
plt.show()
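# Hedged alternative (not in the original notes): the elbow is read visually,
# but the silhouette score can pick k numerically; higher is better.
from sklearn.metrics import silhouette_score
for k in range(2, 8):
    labels = KMeans(n_clusters=k, init="random", max_iter=100).fit_predict(X)
    print(k, round(silhouette_score(X, labels), 3))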

import pandas as pd
import matplotlib.pyplot as plt
link = "D:\\Datasets\\USArrests.csv"
df = pd.read_csv(link)
#print(df)
X = df.iloc[:,1:]
from sklearn.preprocessing import normalize
data = normalize(X)
data = pd.DataFrame(data)
print(data)

## plotting the dendrogram
import scipy.cluster.hierarchy as sch
dendo = sch.dendrogram(sch.linkage(data, method='ward'))
plt.axhline(y=0.7, color="red")
plt.show()
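# Hedged follow-up (not in the original notes): once the dendrogram suggests a
# cut (assumption: the red line at 0.7 crosses about 4 branches), the actual
# cluster labels can be extracted with the same ward linkage.
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')
labels = hc.fit_predict(data)
print(labels[:10])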
link = "D:\\datasets\\Market_Basket_Optimisation.csv"
import pandas as pd
df = pd.read_csv(link)
print(df)
from apyori import apriori
transactions = []
for i in range(len(df)):
    if i % 100 == 0:
        print("I = ", i)
    transactions.append([str(df.values[i,j]) for j in range(20)])

## remove nan from the list (see the sketch below)
print("Transactions:\n", transactions)

association_algo = apriori(transactions, min_confidence=0.2, min_support=0.02, min_lift=2)
print("Association = ", list(association_algo))
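# The "remove nan" step is left unfinished in the notes; a minimal sketch
# (assumption: missing basket items were read in as the string 'nan', and
# apyori's result records expose .items and .support):
transactions = [[item for item in t if item != 'nan'] for t in transactions]
association_algo = apriori(transactions, min_confidence=0.2, min_support=0.02, min_lift=2)
for rule in association_algo:
    print(list(rule.items), "support:", round(rule.support, 4))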
”’
Time Series Forecasting – ARIMA method

1. Read and visualize the data
2. Stationary series
3. Optimal parameters
4. Build the model
5. Prediction
”’
import pandas as pd
#Step 1: read the data
link = "D:\\datasets\\gitdataset\\AirPassengers.csv"
air_passengers = pd.read_csv(link)

'''
#Step 2: visualize the data
import plotly.express as pe
fig = pe.line(air_passengers, x="Month", y="#Passengers")
fig.show()
'''
# Cleaning the data
from datetime import datetime
air_passengers['Month'] = pd.to_datetime(air_passengers['Month'])
air_passengers.set_index('Month', inplace=True)

# log-transform the series to stabilise the variance
import numpy as np
ts_log = np.log(air_passengers['#Passengers'])
# creating a rolling period - 12 months
import matplotlib.pyplot as plt
'''
moving_avg = ts_log.rolling(12).mean()
plt.plot(ts_log)
plt.plot(moving_avg)
plt.show()
'''
#Step 3: Decomposition into: trend, seasonality, error ( or residual or noise)
”’
Additive decomposition: linear combination of above 3 factors:
Y(t) =T(t) + S(t) + E(t)

Multiplicative decomposition: product of 3 factors:
Y(t) =T(t) * S(t) * E(t)
”’
from statsmodels.tsa.seasonal import seasonal_decompose
decomposed = seasonal_decompose(ts_log, model="multiplicative")
decomposed.plot()
plt.show()

# Step 4: Stationarity test
'''
For time series analysis, the series should be stationary.
A time series is said to be stationary if its statistical properties
(mean, variance, autocorrelation) do not change much over time.
Types of tests:
1. Augmented Dickey-Fuller (ADF) test
2. Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test
3. Phillips-Perron (PP) test

For the ADF test:
Null Hypothesis: the time series is not stationary
Alternate Hypothesis: the time series is stationary
If p < 0.05 we reject the Null Hypothesis
'''
from statsmodels.tsa.stattools import adfuller
result = adfuller(air_passengers['#Passengers'])
print("ADF Stats: \n", result[0])
print("p value = ", result[1])
'''
To reject the Null Hypothesis, result[0] must be below the 5% critical value
and p must be < 0.05; the raw series fails this, so it needs differencing.
'''
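# Hedged sketch (not in the original notes): difference the series once and
# re-run the ADF test; for AirPassengers the differenced series comes out far
# closer to stationary, which motivates d = 1 below.
diff_series = air_passengers['#Passengers'].diff().dropna()
result_diff = adfuller(diff_series)
print("ADF on differenced series:", result_diff[0], " p value =", result_diff[1])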

# Run the model
'''
ARIMA model: Auto-Regressive Integrated Moving Average
AR (p): auto-regressive terms - past values predict the current value
I (d): order of differencing that removes trend/seasonality from the series
MA (q): moving-average terms over past forecast errors

AIC - Akaike's Information Criterion - helps to find optimal p, d, q values
BIC - Bayesian Information Criterion - an alternative to AIC
'''
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(air_passengers['#Passengers'].diff().dropna())
plot_pacf(air_passengers['#Passengers'].diff().dropna())
plt.show()
'''
How to read the graphs above:
To find q (MA), look at the Autocorrelation plot for the first sharp drop:
here it is at lag 1, so q = 1 (or 2, since the value turns negative at lag 2)

To find p (AR), look for the sharp drop in the Partial Autocorrelation plot:
here it is at lag 1, so p = 1 (or 2, since the value turns negative at lag 2)

For d (I) we need to try multiple values; initially we take d = 1
'''
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(air_passengers['#Passengers'], order=(1,1,1))
result = model.fit()
plt.plot(air_passengers['#Passengers'])
plt.plot(result.fittedvalues)
plt.show()
print("ARIMA Model Summary")
print(result.summary())

model = ARIMA(air_passengers['#Passengers'], order=(4,1,4))
result = model.fit()
plt.plot(air_passengers['#Passengers'])
plt.plot(result.fittedvalues)
plt.show()
print("ARIMA Model Summary")
print(result.summary())

# Prediction using the ARIMA model
air_passengers['Forecasted'] = result.predict(start=120, end=246)
air_passengers[['#Passengers','Forecasted']].plot()
plt.show()

# predict using the SARIMAX model (adds the seasonal component)
import statsmodels.api as sm
model = sm.tsa.statespace.SARIMAX(air_passengers['#Passengers'], order=(7,1,1), seasonal_order=(1,1,1,12))
result = model.fit()
air_passengers['Forecast_SARIMAX'] = result.predict(start=120, end=246)
air_passengers[['#Passengers','Forecast_SARIMAX']].plot()
plt.show()
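# Hedged evaluation sketch (not from the course notes): compare the SARIMAX
# in-sample predictions with the actuals for the last 24 months (the 144-row
# dataset makes positions 120..143 the final two years).
import numpy as np
actual = air_passengers['#Passengers'][120:]
pred = result.predict(start=120, end=143)
rmse = np.sqrt(np.mean((actual - pred) ** 2))
print("SARIMAX RMSE on the last 24 months:", round(rmse, 2))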
'''
NLP - Natural Language Processing: analysing review comments to understand
the reasons for positive and negative ratings.
Concepts like: unigram, bigram, trigram

Steps we generally perform with NLP data:
1. Convert into lowercase
2. Decompose (non-unicode to unicode)
3. Remove accents: encode the content to ascii values
4. Tokenization: break sentences into words
5. Stop words: remove words that are not important for the analysis
6. Lemmatization (done only on English words): convert words into dictionary words
7. N-grams: sets of one word (unigram), two words (bigram), three words (trigram)
8. Plot the graph based on the number of occurrences and evaluate
'''
”’
cardboard mousepad. Going worth price! Not bad
”’

link = "https://raw.githubusercontent.com/swapnilsaurav/OnlineRetail/master/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
'''
['review_id', 'order_id', 'review_score', 'review_comment_title',
'review_comment_message', 'review_creation_date', 'review_answer_timestamp']
'''
df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])

df['review_answer_timestamp'] = pd.to_datetime(df['review_answer_timestamp'])

# data preprocessing: making the data ready for analysis
reviews_df = df[df['review_comment_message'].notnull()].copy()
#print(reviews_df)
'''
Write a function to perform basic preprocessing steps
'''
def basic_preprocessing(text):
    txt_pp = text.lower()
    print(txt_pp)
    # remove accent: completed in the next iteration below
    return txt_pp  # without a return, the column would be filled with None

# applying basic preprocessing:
reviews_df['review_comment_message'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)



link = "D:/datasets/OnlineRetail/order_reviews.csv"
import pandas as pd
import unicodedata
import nltk
import matplotlib.pyplot as plt
df = pd.read_csv(link)
print(list(df.columns))
'''
['review_id', 'order_id', 'review_score', 'review_comment_title',
'review_comment_message', 'review_creation_date', 'review_answer_timestamp']
'''
#df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])
#df['review_answer_timestamp'] = pd.to_datetime(df['review_answer_timestamp'])

# data preprocessing: making the data ready for analysis
reviews_df = df[df['review_comment_message'].notnull()].copy()
#print(reviews_df)

# remove accents
def remove_accent(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', errors='ignore').decode('utf-8')

# STOP WORDS LIST:
STOP_WORDS = set(remove_accent(w) for w in nltk.corpus.stopwords.words('portuguese'))

”’
Write a function to perform basic preprocessing steps
”’
def basic_preprocessing(text):
    # converting to lower case
    txt_pp = text.lower()

    # remove the accent
    txt_pp = remove_accent(txt_pp)

    # tokenize: break the sentence into words
    txt_token = nltk.tokenize.word_tokenize(txt_pp)

    # removing stop words, keeping alphabetic tokens only
    # (a list rather than a generator, so the result can be reused)
    txt_token = [w for w in txt_token if w not in STOP_WORDS and w.isalpha()]
    return txt_token

# applying basic preprocessing:
reviews_df['review_comment_words'] = \
    reviews_df['review_comment_message'].apply(basic_preprocessing)

# get positive reviews: all with review_score 5
reviews_5 = reviews_df[reviews_df['review_score'] == 5]

# get negative reviews: all with review_score 1
reviews_1 = reviews_df[reviews_df['review_score'] == 1]

## write a function to create unigram, bigram and trigram lists
def create_ngrams(words):
    unigram, bigrams, trigram = [], [], []
    for comment in words:
        unigram.extend(comment)
        bigrams.extend(' '.join(bigram) for bigram in nltk.bigrams(comment))
        trigram.extend(' '.join(trigram) for trigram in nltk.trigrams(comment))
    return unigram, bigrams, trigram

#create ngrams for rating 5 and rating 1
uni_5, bi_5, tri_5 = create_ngrams(reviews_5['review_comment_words'])
print(uni_5)
print("=========================================")
print(bi_5)
print("=========================================")
print(tri_5)

uni_1, bi_1, tri_1 = create_ngrams(reviews_1['review_comment_words'])
#print(uni_5)

# distribution plot of the most frequent tokens
def plot_dist(words, color):
    # plot the 20 most common tokens; extra kwargs such as color are passed
    # through to matplotlib (an assumption about the nltk version in use)
    nltk.FreqDist(words).plot(20, color=color)
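# Usage sketch (assumption: green for 5-star and red for 1-star unigrams):
plot_dist(uni_5, "green")
plot_dist(uni_1, "red")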
JUNE 2023 Data Science Tutorial PART II

use employees;

— create table, drop table, alter table

— to read data we use Select

Select * from departments;

 

insert into departments (DID, HOD, DNAME, DCODE) values (100, 'Sachin', 'CSE', 'AA00');

insert into departments values (101, 'MEC', 'Virat', 'AZ00');

insert into departments values (103, 'ECE', 'Rohit', 'KZ00');

insert into departments values (105, 'CIV', 'Dhoni', 'CZ00');

insert into departments values (106, 'TCE', 'Sunil', 'BZ00');

 

Select * from employees;

 

Select @@GLOBAL.secure_file_priv;

 

-- Import data from a CSV file
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/EmployeesData.csv'
INTO TABLE EMPLOYEES
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;

 

-- DELETE to remove a row
DELETE FROM EMPLOYEES WHERE EMPID=120;

-- UPDATE to modify the existing values in a row
Update Employees
Set Bonus_PCT = 0.45, Salary = 160000
where empid=122;

Select * from employees where empid=122;

 

-- SELECT
Select FNAME, EMAIL, PHONE, SALARY From Employees;

Select FNAME, EMAIL, PHONE, SALARY From Employees where salary >= 100000;

Select FNAME, EMAIL, PHONE, SALARY From Employees where EMAIL = 'ramakrishnavendra@wnbco.co';

-- Create an Index to make queries faster
Create Index idx_salary
on Employees (Salary);

-- Relational operators in MySQL: =  >  <  >=  <=  <>
Select FNAME, EMAIL, PHONE, SALARY From Employees where EMAIL <> 'ramakrishnavendra@wnbco.co';

 

-- BETWEEN  LIKE  IN

use employees;

select * from employees;

select FNAME, LNAME, DOB, Salary from employees where salary > 75000;

-- BETWEEN is used for ranges of numbers
select FNAME, LNAME, DOB, Salary from employees where salary BETWEEN 75000 AND 95000;

-- the statement below is the same as the BETWEEN above
select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 AND salary <= 95000;

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 AND Bonus_pct <= 0.4; -- 0 to 9 rows

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 OR Bonus_pct <= 0.4; -- 25 to 34 rows

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000; -- 25 rows

select FNAME, LNAME, DOB, Salary from employees where Bonus_pct <= 0.4; -- 9 rows

 

-- LIKE: for text pattern comparison
Select * from employees where FNAME like 'A%';

-- IN: checking against a given set of exact values
-- (note: wildcards such as 'O%' are NOT expanded inside IN; they match literally)
Select * from employees where FNAME IN ('O%', 'A%', 'B%');

Select * from employees where FNAME IN ('Anikeet Dey', 'Amol Jain', 'Suresh Narayan Singh Yadav');

Select * from employees where SALARY IN (36107, 110266, 107799, 198890);

-- Logical operators: AND OR NOT
select * from employees where not(deptid = 100 and salary > 100000);

-- ORDER BY
select * from employees order by salary DESC;

select * from employees order by salary ASC; -- ascending order is the default

select * from employees order by DEPTID ASC, Salary DESC;

use employees;

 

select * from employees where bonus_pct is not null;

 

select * from employees order by FNAME;

select * from employees order by FNAME DESC;

 

UPDATE employees SET BONUS_PCT = NULL WHERE BONUS_PCT > 0.88;

 

select * from employees where bonus_pct is not null limit 3;

 

select * from employees where bonus_pct is not null order by salary DESC limit 5;

 

select avg(salary) as Avg_Salary, deptid from employees group by deptid;

 

select avg(salary), count(salary), sum(salary) from employees;

 

select * from employees where salary > (select avg(salary) from employees);

 

-- Find the highest salary earner for each department
select deptid, MAX(salary) as max_salary FROM employees group by deptid;

select deptid, fname, salary from employees where (Deptid, salary) in
(select deptid, MAX(salary) as max_salary FROM employees group by deptid);
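-- Hedged alternative (assumes MySQL 8+, which has window functions): rank the
-- rows within each department and keep the top earner; equivalent in spirit
-- to the IN-subquery above.
select deptid, fname, salary
from (select deptid, fname, salary,
             row_number() over (partition by deptid order by salary desc) as rn
      from employees) ranked
where rn = 1;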

 

-- Questions for Bala:
-- 1. Top 5 run scorers
-- 2. Top 5 wicket takers
-- 3. Top 5 catch takers
-- 4. How many matches each team has won and lost
-- 5. How many teams won the toss but lost the match
-- 6. How many teams lost after batting first
-- 7. How many matches were interrupted by rain
-- 8. Top run scorer for each team
-- 9. Top wicket taker for each team
-- 10. Top catch taker for each team

-- --------------------

-- Questions for Vinay:
-- 1. How much is my daily sales?
-- 2. Top 5 contributing products by sales
-- 3. Top 5 contributing products by count
-- 4. Breakup of payment modes (card/cash/UPI/others)
-- 5. Top 5 highest priced products
-- 6. Which day of the week do more people tend to shop for groceries online?
-- 7. Top best-selling categories (number of transactions per category)
-- 8. Top best-selling categories (amount of sales)
-- 9. Top 20% of categories' contribution to % of sales
-- 10. What is the average sales for each of the 7 days?

-- --------------------

 

use employees;

 

select deptid, avg(salary) as "Average Salary" from employees group by deptid;

-- Aggregate functions: COUNT, AVG, SUM, MIN, MAX

-- LIKE patterns: %a, a%, %a%, %b__% (each _ matches exactly 1 character), 'a%z'

select fname, email, salary from employees;

 

select fname, email, salary from employees where fname like '%ol%';

-- J followed by exactly three characters at the end
select fname, email, salary from employees where fname like '%J___';

-- J followed by at least four characters
select fname, email, salary from employees where fname like '%J____%';

select * from employees where fname in ('Pankaj Prajapati', 'Manju Mishra', 'Arijeet Dasgupta');

select * from employees where fname in (select fname from employees where fname like '%J____%');

select * from employees where (fname, salary) in (select fname, salary from employees where fname like '%J____%');

select fname, email, salary from employees where fname like '%dey';

 

 

 

-- Questions for Karan
-- How many books are currently issued
-- Who has read the maximum number of books
-- Is there a member who has not issued a book so far
-- How many books have been issued for more than a month
-- List of books that cost more than 2000 rupees
-- List of books older than 20 years
-- The most read books of the library
-- The most read genre of the library
-- Average books issued per day
-- How many readers have not returned a book on time so far

-- ----------------

-- combining data from 2 tables
use employees;
select * from employees;
select employees.FNAME, DID, departments.DNAME from employees, departments; -- cross join (no join condition)

select employees.FNAME, DID, departments.DNAME from employees, departments where employees.deptid = departments.did;

insert into departments values (107, 'BIOT', 'Kapil', 'BB09');

insert into employees (EMPID, FNAME, LNAME, DOB, EMAIL, PHONE, DOJ, SALARY)
values (133, 'Sachin Tendulkar', 'T', '1990-05-04', 'sachin@wnbco.co', 9999000009, '2023-05-15', 231456);

select * from departments;

select employees.FNAME, DID, departments.DNAME from employees, departments where employees.deptid = departments.did;

 

 

-- JOINS

use employees;

select * from employees;

select * from departments;

select fname, hod from employees, departments where departments.did = employees.DEPTID; -- inner join (implicit syntax)

select fname, hod from employees, departments where employees.DEPTID = departments.did order by empid;

select fname, hod from employees INNER JOIN departments ON employees.DEPTID = departments.did order by empid;

-- chaining several inner joins:
SELECT T1.COL1, T2.COL2, T3.COL3
FROM (((T1 INNER JOIN T2 ON T1.K1 = T2.K1) INNER JOIN T3 ON T2.K2 = T3.K2) INNER JOIN T4 ON T3.K3 = T4.K3);

select fname, hod from employees LEFT JOIN departments ON employees.DEPTID = departments.did order by empid;

use employees;

-- FULL OUTER JOIN emulated in MySQL: a LEFT JOIN from each side, combined with UNION
select fname, hod from employees left join departments on employees.DEPTID = departments.did
UNION
select fname, hod from departments left join employees on employees.DEPTID = departments.did;

select fname, hod from departments inner join employees on employees.DEPTID = departments.did;

 

select * from employees;

select sum(EMPID) from employees;

select count(EMPID) from employees;

select count(EMPID) from employees where salary > 50000;

select count(EMPID), DEPTID from employees
group by deptid
having count(EMPID) > 5;

select count(fname), hod from employees left join departments on employees.DEPTID = departments.did
group by deptid
having count(fname) > 3;

 

 

select * from employees;
-- Salary_Grade: <50K: E, 50-100K: D, 100-150K: C, 150-200K: B, >200K: A

select fname, salary, bonus_pct,
case when bonus_pct is null then salary
     else salary + salary * bonus_pct
end as total_salary from employees;

select fname, salary,
CASE
    when salary < 50000 Then 'Grade: E'
    when salary > 200000 Then 'Grade: A'
    when salary > 150000 Then 'Grade: B'
    when salary > 100000 Then 'Grade: C'
    Else 'Grade: D'
End As Salary_Grade
from employees;

use employees;

 

select * from employees;

 

select FNAME, DOB, DOJ, datediff(doj, dob) from employees;

select fname, NOW() from employees;

select now(); -- current date and time (system)

select date(now());
select curdate();
-- the default date format in MySQL is YYYY-MM-DD
-- to display as DD-MM-YYYY:
select date_format(curdate(), '%d-%m-%Y') as Date;

 

select curdate() as todays_date, date_add(curdate(), interval 1 Day) as Tomorrow,
date_add(curdate(), interval 1 Week) as next_week,
date_add(curdate(), interval 1 Month) as Next_month,
date_add(curdate(), interval 1 Year) as Next_year;

-- date_sub: subtract an interval (day, week, month or year)
select curdate() as todays_date, date_sub(curdate(), interval 1 Day) as Yesterday,
date_sub(curdate(), interval 1 Week) as last_week,
date_sub(curdate(), interval 1 Month) as last_month,
date_sub(curdate(), interval 1 Year) as last_year;

select day(curdate()) as day, month(curdate()) as Month, quarter(curdate()) as Quarter, Year(curdate()) as Year,
Weekday(curdate()) as Weekday, week(curdate()) as week, weekofyear(curdate()) as WeekofYear;

 

 

select * from employees where salary > (select salary from employees where empid = 104);

 

-- create a stored procedure
DELIMITER $$

Create Procedure GetEmpInfo()
Begin
    select * from employees where salary > (select salary from employees where empid = 104);
End $$
DELIMITER ;

call GetEmpInfo();

drop Procedure GetEmpInfo;

-- Procedures also support exception handling, conditions and loops (WHILE, REPEAT);
-- a minimal loop sketch follows.
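-- Minimal illustrative sketch (not from the course notes) of a WHILE loop
-- inside a stored procedure:
DELIMITER $$
Create Procedure CountTo(IN n INT)
Begin
    DECLARE i INT DEFAULT 1;
    WHILE i <= n DO
        SELECT i;
        SET i = i + 1;
    END WHILE;
End $$
DELIMITER ;

call CountTo(3);
drop Procedure CountTo;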

 

-- Creating a View
create view EmpData
as
Select EMPID, FNAME, LNAME, DOB, EMAIL, PHONE, DOJ, DEPTID from Employees;

select * from empdata;

 

 

-- String functions: LTRIM, RTRIM, REPLACE, SUBSTRING, CONCAT
select concat(fname, '.', lname, '@wnbco.co') from employees;

select fname, substring(fname, 3), substring(fname, -3) from employees;

 
import pymysql
con_str = pymysql.connect(host="localhost", user="root", password="learnSQL", database="employees")
cursor = con_str.cursor()

q1 = '''Create Table PY_EMP (
EMPID INT PRIMARY KEY,
NAME VARCHAR(30),
CITY VARCHAR(15))
'''
#cursor.execute(q1)

q2 = '''INSERT INTO PY_EMP VALUES(1,'Laxman','Hyderabad')'''
cursor.execute(q2)
q2 = '''INSERT INTO PY_EMP VALUES(2,'Rahul','Bangalore')'''
cursor.execute(q2)

q3 = '''Select * from Py_Emp'''
cursor.execute(q3)
results = cursor.fetchall()
for row in results:
    print(row)

con_str.commit()
con_str.close()
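# Hedged sketch (not from the course notes): prefer %s placeholders over
# building SQL strings by hand, so pymysql escapes the values and protects
# against SQL injection.
con2 = pymysql.connect(host="localhost", user="root", password="learnSQL", database="employees")
cur2 = con2.cursor()
cur2.execute("INSERT INTO PY_EMP VALUES (%s, %s, %s)", (3, 'Rohit', 'Mumbai'))
con2.commit()
con2.close()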

### ### ### PANDAS
# Pandas – dataframe which resembles Table structure
# pip install pandas
import pandas as pd
df1 = pd.DataFrame()
print(df1)
print(type(df1))

# fruit production
data = [["Apple", 15000, 11000, 6000],
        ["Banana", 18000, 22000, 29000],
        ["Mango", 2, 900, 19000],
        ["Guava", 19000, 11000, 25000]]

fruit_production = pd.DataFrame(data)
print(fruit_production)
print("Slicing 1:\n")
print(fruit_production.iloc[1:3, 2:])  # based on index positions
print("Slicing 2:\n")
print(fruit_production.loc[1:3, 2:])  # based on labels (still integers here)

fruit_production = pd.DataFrame(data,
                                columns=["Fruits", "January", "February", "March"])
print(fruit_production)

fruit_production = pd.DataFrame(data,
                                columns=["Fruits", "January", "February", "March"],
                                index=["Fruit 1", "Fruit 2", "Fruit 3", "Fruit 4"])
print(fruit_production)

## dataframe.loc() and dataframe.iloc()

print("Slicing 1:\n")
print(fruit_production.iloc[1:3, 2:])  # based on index positions
print("Slicing 2:\n")
print(fruit_production.loc[["Fruit 2", "Fruit 3"], ["February", "March"]])  # based on titles (names)

### ###
# pandas
# pip install pandas
import pandas as pd
l1 = [10, 20, 30, 40, 50]
l1 = [["Sachin", 101, 20000, "BATSMAN"], ["Kapil", 501, 12000, "BOWLER"],
      ["Sunil", 12, 21000, "BATSMAN"], ["Zaheer", 725, 2000, "BOWLER"]]
df1 = pd.DataFrame(l1, columns=["Player", "Wickets", "Runs", "Type"],
                   index=["Player 1", "Player 2", "Player 3", "Player 4"])
print(df1)

d1 = {'Apple': [12000, 11000, 13000],
      'Banana': [17000, 18000, 19000],
      'Mango': [11000, 13000, 15000]}
df2 = pd.DataFrame(d1)
print(df2)

# creating a dataframe from a list of dictionaries
data1 = [{"Guava": 9000, "Oranges": 5000},
         {"Guava": 8000, "Oranges": 7000},
         {"Guava": 10000, "Oranges": 6000}]
df3 = pd.DataFrame(data1)
print(df3)

print(df3.iloc[0,:])  # first row and all column values
print(df3.iloc[:,0])

print(df2.iloc[:,0:2])
print(df2.iloc[[0,2],[0,2]])

# loc uses labels instead of positions
print(df2.loc[[0,2],["Apple","Mango"]])
print(df1.loc[["Player 1","Player 4"],["Player","Runs"]])

df2.iloc[2,0] = 14000
print(df2)
print("========= DF1 =============")
df1['Avg'] = df1['Runs'] / df1["Wickets"]
print(df1)
print("Reading data from DF1: ")
df4 = df1[df1.Player != 'Sachin']  # filter, like a WHERE clause
print("\n\n New dataset without Sachin: \n", df4)
df1 = df1.drop("Player", axis=1)  # axis default is 0
# unlike pop() and del, drop() returns a new dataframe
print(df1)

print("Average Wickets of all the players = ", df1['Wickets'].mean())
print("Average Wickets of players by type = \n\n", df1.groupby('Type').mean())
# axis = 0 refers to rows
# axis = 1 refers to columns

print("\n\nDropping columns from DF1: ")
del df1['Wickets']  # dropping the Wickets column using del
print(df1)

df1.pop('Runs')  # dropping a column using pop
print(df1)
#
import pandas as pd

ud_df = pd.read_csv("D:/datasets/gitdataset/user_device.csv")
print(ud_df)  # 272 rows x 6 columns
print("Rows: ", ud_df.shape[0])
print("Columns: ", ud_df.shape[1])

print(ud_df.tail(1))
print(ud_df.head(1))

use_df = pd.read_csv("D:/datasets/gitdataset/user_usage.csv")
print(use_df)  # 240 rows x 4 columns

result_df = pd.merge(use_df[['use_id', 'monthly_mb', 'outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id')
print(result_df)  # [159 rows x 9 columns]: the default inner join keeps only use_ids present in both

result_df = pd.merge(use_df[['use_id', 'monthly_mb', 'outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='outer')
print(result_df)

result_df = pd.merge(use_df[['use_id', 'monthly_mb', 'outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='left')
print(result_df)

result_df = pd.merge(use_df[['use_id', 'monthly_mb', 'outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='right')
print(result_df)

## Working with Pandas - Example ##
import pandas as pd
import numpy as np
df = pd.read_csv("D:/datasets/gitdataset/hotel_bookings.csv")
print(df.shape)
print(df.dtypes)
”’
numeric – int, float
categorical – 1) Nominal – there is no order 2) Ordinal – here order is imp
”’
df_numeric = df.select_dtypes(include=[np.number])
print(df_numeric)

df_object= df.select_dtypes(exclude=[np.number])
print(df_object) # categorical and date columns

print(df.columns)
for col in df.columns:
    missing = np.mean(df[col].isnull())
    if missing > 0:
        print(f"{col} - {missing}")
”’
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
a. Handle missing values
b. Feature scaling – scale the values in the column to similar range
c. Outliers / data correction
d. handling categorical data:
i. Encode the data to convert text to number
East = 0, North = 1, South = 2, West = 3
ii. Column-transform into multiple columns
iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING – Divide the train and test


”’
import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
print(df)
'''
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing - making data ready for use
    a. Handle missing values
    b. Feature scaling - scale the values in each column to a similar range
    c. Outliers / data correction
    d. Handling categorical data:
        i. Encode the data to convert text to number
           East = 0, North = 1, South = 2, West = 3
        ii. Column-transform into multiple columns
        iii. Delete any one column
4. EDA - Exploratory Data Analysis: to understand the data
5. MODEL BUILDING:
    a. Divide into train and test sets
    b. Run the model
6. EVALUATE THE MODEL:
    a. Measure the performance of each algorithm on the test data
    b. Metric to compare: based on Regression (MSE, RMSE, R square) or
       classification (confusion matrix: accuracy, sensitivity, ...)
    c. Select the best performing model
7. DEPLOY THE BEST PERFORMING MODEL

Hypothesis test:
1. Null Hypothesis (H0): the starting statement (objective)
   Alternate Hypothesis (H1): the alternative to H0

Z or T test: for comparing means of numeric data
Chi-square test: when both variables are categorical

e.g. North zone: 50 WIN 5 LOSS - p = 0.005

# simple (single value) v composite (specifies a range)
# two-tailed test v one-tailed test [H0: mean = 0,
  H1 left-tailed: mean < 0
  H1 right-tailed: mean > 0]
# level of significance:
  alpha value: confidence interval - 95%
  p value: if p < 0.05 we reject the Null Hypothesis
'''
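# Hedged illustration (not from the course notes): a one-sample t-test with
# scipy on synthetic data; H0: mean = 0.
from scipy import stats
import numpy as np
sample = np.random.normal(loc=0.3, scale=1.0, size=50)  # assumed toy data
t_stat, p_value = stats.ttest_1samp(sample, popmean=0)
print("t =", round(t_stat, 3), " p =", round(p_value, 3))
if p_value < 0.05:
    print("Reject H0: the mean differs from 0")
else:
    print("Fail to reject H0")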
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print("X: \n")
#print(X)
#print("Y: \n")
#print(y)

# scikit-learn package to perform ML
# install the package with: pip install scikit-learn
# but when you import it, the name is sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with the mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])
print(X)
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
X = df.iloc[:,:3].values
y = df.iloc[:,3].values

# 1. Replace the missing values with the mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:,1:]  # drop one dummy column to avoid the dummy-variable trap
#print(X)

# 3. splitting into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train)

# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.transform(X_test[:,3:])  # transform only; the scaler is fit on the training data
print(X_train)
'''
Regression: the output (Marks) is a continuous variable
Algorithm: Simple (only 1 X column) Linear (assuming the dataset is linear) Regression
X - independent variable(s)
Y - dependent variable
'''
import pandas as pd
import matplotlib.pyplot as plt
link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv"
df = pd.read_csv(link)
X = df.iloc[:,:1].values
y = df.iloc[:,1].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)
”’

# EDA - Exploratory Data Analysis
plt.scatter(x=df['Hours'], y=df['Marks'])
plt.show()
'''
Scatter plots show the relationship between the X and Y variables. You can have:
1. Positive correlation
2. Negative correlation
3. No correlation
4. Correlation ranges from 0 to +/- 1
5. Values between 0 and about +/- 0.5 indicate little to no correlation
6. A strong correlation value will be closer to +/- 1
7. Equation: straight line => y = mx + c
'''
# 3. splitting into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
print(X_train)

'''
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.transform(X_test[:,3:])
print(X_train)
'''

## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit - train the model
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)

# Analyze the output
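# Hedged sketch (the notes stop at "Analyze the output"): quantify the fit
# with the regression metrics mentioned in the phases above.
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
print("MSE =", mse, " RMSE =", mse ** 0.5, " R square =", r2_score(y_test, y_pred))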
Mastering Data Science Course - JUNE 2023

# comments – ignored by the compiler/interpreter
# Python is case sensitive language
print(5 + 5) # print is an inbuilt function: it prints whatever you give it on the screen
print("5 + 5") # quotes make print repeat the text literally
print("5 + 5 =", 5+5)
print('3 + 5 =', 5+5)

# variables
x = 777
y = 75
print(x+y)
print(x, "+", y)
print(x, "+", y, "=", x+y)
print(x, '+', y, '=', x+y)
# variable naming rule: it should start with an alphabet; it can contain numbers and _
num = 10
num1 = 101
n1m2 = 56
first_number = 56

# write a program to add 2 numbers
num1 = 45
num2 = 66
print("Sum of two numbers is", num1+num2)

x = 45
y = 66
print("Sum of two numbers is", x+y)

sdfdsagtarebgdvdzdczvdv = 45
dfsgdfsgbsysbsdfhnjdjshff = 66
print("Sum of two numbers is", sdfdsagtarebgdvdzdczvdv+dfsgdfsgbsysbsdfhnjdjshff)

# write a program to calculate the area and perimeter of a rectangle, when its sides are given
# input -> process -> output
# area = length * breadth
# perimeter = 2 * (length + breadth)
# input:
length = 52
breadth = 31
# process:
area = length * breadth;
perimeter = 2*(length + breadth);
# output
print("Area of the rectangle is", area); print("The perimeter is", perimeter)

########### Assignment ################
# WAP to calculate area and circunference of a circle when radius is given
# WAP to calculate perimeter of a triangle who 3 sides are given
##########################################
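# A possible solution sketch for the two assignments above (assumed formulas:
# area = pi*r*r, circumference = 2*pi*r, perimeter = a+b+c):
radius = 7
pi = 3.14159
print("Area of the circle is", pi * radius * radius)
print("Circumference of the circle is", 2 * pi * radius)

a, b, c = 3, 4, 5  # the three sides of the triangle
print("Perimeter of the triangle is", a + b + c)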

# Basic data types - 5 types
length = 30 # int = integer = numbers without decimal values
print("type of data = ", type(length)) # <class 'int'>
# type() gives the type of the data that is in the variable

length = 29.8 # float = decimal values
print("type of data = ", type(length)) # <class 'float'>

number1 = 5+2j
print("type of data = ", type(number1)) # <class 'complex'>
# imaginary numbers are the roots of -ve numbers = i
print("Doing a calculation = ", number1 * (5-2j)) # (a+b)(a-b) = a square - b square
# 25 - (-4) = 29 + 0j

# bool (boolean) - True (1) & False (0)
val1 = True # False
print("type of data = ", type(val1))

# text - str = string
name = "Sachin Tendulkar"
print("type of data = ", type(name))


# Output formatting: print with multiple arguments, f-strings and str.format()

cost_price = 45
selling_price = 67
quantity = 78
profit = (selling_price - cost_price) * quantity
print("Total profit after selling", quantity, "items bought at", cost_price,
      "and sold at", selling_price, "is", profit, end="\n")
print(f"Total profit after selling {quantity} items bought at {cost_price} "
      f"and sold at {selling_price} is {profit}", end="\n")  # f-string
print("Total profit after selling {} items bought at {} and sold at "
      "{} is {}".format(quantity, cost_price, selling_price, profit))
print("Total profit after selling {0} items bought at {2} and "
      "sold at {3} is {1}".format(quantity, profit, cost_price, selling_price))

quantity = int("14")
# int(), str(), bool(), float(), complex()
total_cost = 500
cost_per_quantity = total_cost / quantity
# implicit and explicit conversion
print(f"Cost per item for quantity {quantity} and total cost {total_cost} is {cost_per_quantity:.2f}")

player = "Kohli"
country = "India"
position = "Opener"
print(f"This is {player:<15} who plays for {country:>15} as {position:^20} in cricket.")

player, country, position = "Mubwangwa", "Zimbabwe", "Wicket-Keeper"
print(f"This is {player:<15} who plays for {country:_>15} as {position:.^20} in cricket.")

print("I am fine \nhow are \nyou")  # escape characters (work only inside quotes): \n, \t
# \n will create a newline
print("\\n will create newline")
# \\n prints \n literally
print("\\\\n will create newline")

print(f"{player} \n{country} \n{position}")
print("First line content", end=" - ")
print("Second line content", end="\n")

value = 50
print("Decimal number = ", int(value))
print("Hexa number = ", hex(value)) #0x
print("Octal number = ", oct(value)) #0o
print("Binary number = ", bin(value)) #0b

value = 0b110011001100
print("Decimal number = ", int(value))
print("Hexa number = ", hex(value)) #0x
print("Octal number = ", oct(value)) #0o
print("Binary number = ", bin(value)) #0b

#### Operations
## Arithmetic operations
val1, val2 = 54, 5
print(val1 + val2) # 59
print(val1 - val2) # 49
print(val1 * val2) # 270
print(val1 / val2) # 10.8

print(val1 // val2) # integer division - returns only the integer part
print(val1 ** val2) # power (raise to) - 54 to the power of 5
print(54 ** (1/2)) # square root of 54
print(val1 % val2) # modulo - remainder

# Relational (comparison) operators: the output is always a bool value
# == != < > <= >=
val1, val2 = 15, 15
print(val1 == val2) # is val1 equal to val2? - T
print(val1 != val2) # F
print(val1 < val2) # F
print(val1 > val2) # F
print(val1 <= val2) # T
print(val1 >= val2) # T

## Logical operations

#Arithematic: + – * / // ** %
#Comparison: > < >= <= == !=
#Logical: and or not: input and output both values are boolean
# Prediction 1: Sachin or Sehwag will open the batting
# Prediction 2: Sachin and Sourav will open the batting
# Actual: Sachin and Rahul opened the batting

print(True or False) # OR gives True even if only one value is True
print(True and False) # AND gives False even if only one value is False
print(5 > 6 and 8 < 10) # False and True
print(3 + 4 * 4)
val1, val2, val3 = 10, 15, 10
print(val1 > val2 and val2 == val3 or val3 == val1 and val1 != val3 and val2 >= val3 or val3 != val1)
print(3 + 5 * 2 - 6 * 3 + 5 - 3 * 4 / 3)  # operator precedence demo (minus signs restored; they were lost in formatting)

# BITWISE operators: and (&), or (|), left shift (<<) and right shift (>>)
print(50 & 30)
print(bin(50), bin(30))
print(int(0b10010))
print(50 | 30)
print(int(0b111110))

print(312 >> 3)
print(12 << 3)

########### Refer the document for assignment ########
# Conditions - if, elif, else
# Pass or Fail
print("PASS")
print("FAIL")

avg = 5
if avg >= 35:
    print("PASS")
    print("Congratulations on great result")
else:
    print("Result: FAIL")

# if avg > 90: A+, 80: A-, 70: B+, 60: B-, 50: C, >=35: D, <35: E
avg = float(input("Enter the average value of the student: "))
if avg >= 90:
    print("Super Duper Result: A+")
elif avg >= 80:
    print("Grade: A-")
elif avg >= 70:
    print("Grade: B+")
elif avg >= 60:
    print("Grade: B-")
elif avg >= 50:
    print("Grade: C")
elif avg >= 35:
    print("Grade: D")
else:
    print("Grade: E")

### input()
val = int(input("Enter a number: "))  # takes a value from the user

print(val)
print(type(val))
# by default, input() returns the value as a string

#### Take the marks of 5 subjects from the user, calculate the sum and average, then assign a grade
# based on the discussion above
'''
Write a program to input a value from the user and check if it is positive, negative or zero.
If it is positive, check if it is odd or even. For even numbers, check if they are a multiple of 4.
'''
# Nested if conditions
num = int(input("Enter a number: "))
if num > 0:
    print(num, "is positive", end="")
    if num % 2 == 0:
        print(" and also even")
        if num % 4 == 0:
            print(num, "is divisible by 4.")
    else:
        print(" and also odd")
elif num < 0:
    print(num, "is negative")
else:
    print(num, "is neither positive nor negative")

#Assignment: Write a program to input 3 sides of a triangle anc check
#if they are: right angled, isoceles, scalene or equilateral

# short form of a one-line if-else condition
a, b = 10, 5
if a > b:
    print(a, "is greater")
else:
    print(b, "is greater")

# short form (ternary)
result = a if a > b else b

print(result, "is greater")

## let's take the example of 3 values and check which number is highest
a, b, c = 500, 100, 500
if a > b:
    if a > c:
        print(a, "is highest")
    else:
        print(c, "is highest")
else:  # b is greater than a
    if b > c:
        print(b, "is highest")
    else:
        print(c, "is highest")

result = a if a > b else b
result = result if result > c else c
print(result, "is highest - II")

if a >= b and a >= c:
    print(a, "is highest - III")
elif b >= a and b >= c:
    print(b, "is highest - III")
else:
    print(c, "is highest - III")
# LOOPS:
#1. FOR – when you know exactly how many times to repeat
#2. WHILE – when you dont know exactly how many times but you know when to stop

# range() generates range of values from given =position to the <final position with given increment
range(1,49,6) #range(=start,<end,increment): 1,7,13,19,25,31,37,43
range(5,11) #range(start,end), increment is default = 1: 5,6,7,8,9,10
range(5) # range(end), start default = 0, increment=1: 0,1,2,3,4
# for loop using range
for variable in range(1, 49, 6):
    print("=================")
    print("HELLO : ", variable)

for variable in range(5, 11):
    print("xxxxxxxxxxxxxxxxx")
    print("HELLO : ", variable)

for variable in range(5):
    print("...........")
    print("HELLO : ", variable)

# for: a counter over a range of values
# while: repeats while a condition is true
final, start = 5, 0
while start < final:
    print("I am in while now, start = ", start)
    start += 1  # start = start + 1
print("Thats it for today")
n=10
“””
* * * * *
* * * * *
* * * * *
* * * * *
* * * * *
“””
for j in range(n):
    for i in range(n):
        print("*", end=" ")
    print()

'''
*
* *
* * *
* * * *
* * * * *
'''
#z=0
for j in range(n):
    #z+=1 # z=z+1
    for i in range(j+1):
        print("*", end=" ")
    print()

"""
* * * * *
* * * *
* * *
* *
*
"""
for j in range(n):
    for i in range(n-j):
        print("*", end=" ")
    print()

'''
        *
      * *
    * * *
  * * * *
* * * * *
'''
for j in range(n):
    for k in range(n-j-1):
        print(" ", end="")
    for i in range(j+1):
        print("*", end=" ")
    print()

##### Assignment ##########
#Print below pattern:
“””
* * * * *
* * * *
* * *
* *
*
“””
## assignment code
'''
A pattern
'''
n = 15
for j in range(n):
    for k in range(n-j-1):
        print(" ", end="")
    if j == 0 or j == int(round(n/2)):
        for i in range(j+1):
            print("* ", end="")
    else:
        print("*", end="")
        print(" " * j, end="")
        print("*", end="")
    print()

#### Assignment
# Practice patterns A to Z

for num in range(1, 11):
    for i in range(1, 11):
        print(f"{i:>2} * {num:>2} = {num*i:>3}", end=" ")
    print()

user_says = "y"
while user_says == "y":
    print("Hello")
    user_says = input("Enter y to print again or any other key to stop:")

check = True
while check:
    print("Hello Again")
    user_says = input("Enter y to print again or any other key to stop:")
    if user_says != "y":
        check = False

while True:
    print("Hello Again Again")
    user_says = input("Enter y to print again or any other key to stop:")
    if user_says != "y":
        break

user_says = input("Enter y to print again or any other key to stop:")
while user_says == "y":
    print("Hello")
    user_says = input("Enter y to print again or any other key to stop:")

st, end, co = 11, 100, 11
for i in range(st, end, co):
    if end % co == 0:
        if i == (end // co - 1) * st:
            print(i)
        else:
            print(i, end=", ")
    else:
        if i == end // co * st:
            print(i)
        else:
            print(i, end=", ")

# simpler version: the last value gets a newline, the others a comma
st, end, co = 11, 100, 11
for i in range(st, end, co):
    if i + co >= end:
        print(i)
    else:
        print(i, end=", ")

'''
Print 10 multiples of a number.
If the number is negative, do not execute.
If a multiple is also a multiple of 12, skip it.
If a multiple is also a multiple of 20, stop.
'''
while True:
    num = int(input("Enter a number: "))
    if num >= 0:
        for i in range(1, 11):
            if num * i % 12 == 0:
                continue
            if num * i % 20 == 0:
                break
            print(f"{num} * {i} = {num*i}")
    ch = input("Hit any key to stop: ")
    if len(ch) != 0:
        break
# ATM Transaction
while True:
    print("Enter your choice from the menu below:")
    print("1. Deposit \n2. Withdraw \n3. Account Transfer \n4. Statement \n5. Quit")
    ch = input("Enter your option here: ")
    if ch == "1":
        ch = input("Enter the amount: ")
        print("... continue with the logic")
    elif ch == "2":
        print("Logic is not ready")
    elif ch == "3":
        print("Logic is not ready")
    elif ch == "4":
        print("Logic is not ready")
    elif ch == "5":
        print("Thank you for Banking with us, Have a Good Day!")
        break
    else:
        print("Invalid Option, please try again!")
        continue


# Check for Prime (num comes from the user input above)
is_prime = True
for i in range(2, num // 2 + 1):
    if num % i == 0:
        is_prime = False
        break
if is_prime:
    print(f"{num} is a Prime number")
else:
    print(f"{num} is not a Prime number")

# generate the prime numbers between 5000 and 10000
for num in range(5000, 10001):
    is_prime = True
    for i in range(2, num // 2 + 1):
        if num % i == 0:
            is_prime = False
            break
    if is_prime:
        print(f"{num}", end=", ")



# Assignments:
## 1. WAP to generate first 5 fibonacci numbers: 0,1, 1, 2,3,5,8,13,21…
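# A possible solution sketch for the Fibonacci assignment (prints the first
# 8 terms; adjust the range for a different count):
a, b = 0, 1
for _ in range(8):
    print(a, end=", ")
    a, b = b, a + b
print()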
# guess the number: Computer v Human
import random

num = random.randint(1, 100)
#print("Number = ", num)
#print("===============")
counter = 0
while True:
    guess = int(input("Guess the number (1-100): "))
    if guess < 1 or guess > 100:
        print("Invalid number! Try again...")
        continue
    counter += 1  # counter = counter + 1
    if guess == num:
        print(f"Congratulations! You have successfully guessed the number in {counter} steps.")
        break
    elif guess < num:
        print("Sorry! Your guess is lower. Try again...")
    else:
        print("Sorry! Your guess is higher. Try again...")

############
# guess the number: Computer v Computer
import random

num = random.randint(1, 100)
print("Number = ", num)
print("===============")
counter = 0
st, en = 1, 100
while True:
    #guess = int(input("Guess the number (1-100): "))
    guess = random.randint(st, en)
    print("Guess number = ", guess)
    if guess < 1 or guess > 100:
        print("Invalid number! Try again...")
        continue
    counter += 1  # counter = counter + 1
    if guess == num:
        print(f"Congratulations! You have successfully guessed the number in {counter} steps.")
        break
    elif guess < num:
        print("Sorry! Your guess is lower. Try again...")
        st = guess + 1
    else:
        print("Sorry! Your guess is higher. Try again...")
        en = guess - 1

################

# guess the number: Computer v Human (automation)
import random

num = random.randint(1, 100)
print("Number = ", num)
print("===============")
counter = 0
st, en = 1, 100
while True:
    guess = (st + en) // 2  # binary search instead of a random guess
    #guess = random.randint(st, en)
    print("Guess number = ", guess)
    if guess < 1 or guess > 100:
        print("Invalid number! Try again...")
        continue
    counter += 1  # counter = counter + 1
    if guess == num:
        print(f"Congratulations! You have successfully guessed the number in {counter} steps.")
        break
    elif guess < num:
        print("Sorry! Your guess is lower. Try again...")
        st = guess + 1
    else:
        print("Sorry! Your guess is higher. Try again...")
        en = guess - 1

###########################
# guess the number: Computer v Human (automation) - measure the average steps
import random
import time
TOTAL = 0
max_runs = 100000  # renamed from max so the builtin is not shadowed
start_time = time.time()
for i in range(max_runs):
    num = random.randint(1, 100)
    counter = 0
    st, en = 1, 100
    while True:
        guess = (st + en) // 2
        #guess = random.randint(st, en)
        print("Guess number = ", guess)
        if guess < 1 or guess > 100:
            #print("Invalid number! Try again...")
            continue
        counter += 1  # counter = counter + 1
        if guess == num:
            print(f"Congratulations! You have successfully guessed the number in {counter} steps.")
            TOTAL += counter
            break
        elif guess < num:
            #print("Sorry! Your guess is lower. Try again...")
            st = guess + 1
        else:
            #print("Sorry! Your guess is higher. Try again...")
            en = guess - 1
end_time = time.time()
print("Total steps = ", TOTAL)
print("Average number of steps = ", TOTAL/max_runs, " total time taken to run", max_runs, "runs is", (end_time - start_time))
# Average number of steps = 5.80429; total time for 100000 runs was 36.42937159538269 s

####################################

# guess the number: Computer v Computer (measure steps)
import random
import time
TOTAL = 0
max_runs = 100000
start_time = time.time()
for i in range(max_runs):
    num = random.randint(1, 100)
    counter = 0
    st, en = 1, 100
    while True:
        #guess = (st + en) // 2
        guess = random.randint(st, en)
        print("Guess number = ", guess)
        if guess < 1 or guess > 100:
            #print("Invalid number! Try again...")
            continue
        counter += 1  # counter = counter + 1
        if guess == num:
            print(f"Congratulations! You have successfully guessed the number in {counter} steps.")
            TOTAL += counter
            break
        elif guess < num:
            #print("Sorry! Your guess is lower. Try again...")
            st = guess + 1
        else:
            #print("Sorry! Your guess is higher. Try again...")
            en = guess - 1
end_time = time.time()
print("Total steps = ", TOTAL)
print("Average number of steps = ", TOTAL/max_runs, " total time taken to run", max_runs, "runs is", (end_time - start_time))
# Average number of steps = 7.4783; total time for 100000 runs was 28.571797370910645 s
# STRINGS
txt1 = "HELLO"
txt2 = 'HI'
print(txt1, txt2)
txt3 = '''I am fine'''
txt4 = """I am here"""
print(txt3, txt4)

txt5 = '''I am fine here
Hope you are
doing well
too. So how are you'''
txt6 = """I am here
Where are you?
How are you?"""
print(txt5)
print(txt6)

# I'm fine
print("I'm fine")
# He asked, "How are you?"
print('He asked, "How are you?"')
# \ - escape sequence
print("I am \nfine")
print('I\'m fine')

txt7 = "HELLO"
print(len(txt7))
print(txt7[1])
print(txt7[1], txt7[4])
print(txt7[1:4])  # ELL
print(txt7[:4])   # HELL
print(txt7[2:])   # LLO
print(txt7[:])    # HELLO

txt8 = input("Enter a text: ")
length = len(txt8)
print(txt8[-1], txt8[length - 1])  # last character
print(txt8[0], txt8[-length])      # first character
print(txt8[1:4], txt8[-4:-1])      # ELL
print(txt8[:4], txt8[-5:-1], txt8[:-1])  # HELL
print(txt8[1:], txt8[1:5], txt8[-4:])    # ELLO

a = "HELLO"
b = "THERE"
print(a + " " + b)
c = (a + " ") * 4
print(c)
print(c[:-1])
#a[0] = "h"  # 'str' object does not support item assignment - IMMUTABLE
a = "h" + a[1:]
print(a)

for i in b:
    print(i)

for j in range(len(b)):
    print(j, b[j])

### String Methods #####
a = "heLLLLLLLLLlllo"
## Refer String Methods - https://docs.python.org/3.11/library/stdtypes.html#string-methods
print(a.islower())
print(a.lower())

ch = input("Enter Yes to continue: ")
if ch.upper() == "YES":
    print("YES")

num1 = input("Enter a number: ")
if num1.isdigit():
    num1 = int(num1)
else:
    print("Exiting because of invalid number")

# String methods
str1 = "Hello How Are You?"
print("Capitalize the content: ", str1.capitalize())
print(str1.isalpha())
print("".join(str1.split(" ")))
str1_1 = "".join(str1.split(" "))
print("is alpha for joined text: ", str1_1.isalpha())
if str1_1.isalpha():
    print("Your name has been saved into the database")
else:
    print("Invalid name, try again!")

str2 = " name "
print(str2.isspace())
print("Strip txt:", str2.strip())
print("Strip txt:" + str2.strip())

str3 = "Hello How Are You are you doing you are?"
print("count of are:", str3.lower().count("are", 0, 21))
print("Finding find:", str3.find("find"))  # returns -1 when the substring is not found
print("Finding you:", str3.find("you"))    # returns the position of the first occurrence
print("Finding you:", str3.find("you", 23, 35))

## Assignment – find the position of all the occurrences and print the output in below format:
# found 2 words and they are at positions 22 and 32
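One minimal sketch for this assignment, looping with str.find() (the exact output wording is an assumption):

# find every occurrence of "you" in str3 - a sketch
word = "you"
positions = []
pos = str3.find(word)
while pos != -1:
    positions.append(pos)
    pos = str3.find(word, pos + 1)
print(f"found {len(positions)} words and they are at positions",
      " and ".join(str(p) for p in positions))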

print(str3.startswith("Hello How"))
print(str3.endswith("?"))

# Replace
print(str3.replace("you", "YOUR"))
print(str3.replace("you", "YOUR", 1))
print(str3[:24] + str3[24:].replace("you", "YOUR", 1))
print(str3)  # original text will not change because of any method as strings are immutable
# List: linear ordered mutable collection
list1 = [20, 30.0, True, "Hello", [2, 3, 5]]
print("Datatype: ", type(list1))
print("Number of values = ", len(list1))
print(list1[0], type(list1[0]))
print(list1[-1], type(list1[-1]))

print([1, 2, 3] + [10, 20, 30])
print([1, 2, 3] * 3)

for i in list1:
    print(i)

for i in range(len(list1)):
    print(i, ": ", list1[i])

for i in list1[-1]:
    print(i)

list1[0] = "Twenty"
print(list1)

# Tuple: linear ordered immutable collection
t1 = (20, 30.0, True, "Hello", [2, 3, 5])
print(type(t1))
print(t1[0], t1[-1])
#t1[0] = 5  # 'tuple' object does not support item assignment

# you can convert list to tuple and tuple to list
l1 = [3, 5, 9]
print(type(l1))
l1 = tuple(l1)
print(type(l1))
l1 = list(l1)
print(type(l1))

t1 = (2, 100, 1000, 10000, 100)
t2 = (2, 100, 1000, 10000, 100000)
if t1 > t2:
    print("T1")
elif t2 > t1:
    print("T2")
else:
    print("They are equal")

print("Count = ", t1.count(100))
print("Index = ", t1.index(100))

l1 = list(t1)
print("Count = ", l1.count(100))
print("Index = ", l1.index(100))

l1.append(99)
l1.append(199)
l1.append(299)
print("L1 after append = ", l1)

# insert & append - both will add members to the list
l1.insert(2, 555)
print(l1)
l1.insert(200, 444)  # an index beyond the end simply appends
print(l1)
# FIFO - Queue
total = 0
marks = []
for i in range(5):
    m1 = int(input())
    marks.append(m1)
    total += m1
print("Marks obtained are:", marks)
print("Total = ", total)
# Marks obtained are: [50, 60, 70, 80, 90]

# LIFO - Stack
total = 0
marks = []
for i in range(5):
    m1 = int(input())
    marks.insert(0, m1)
    total += m1
print("Marks obtained are:", marks)
print("Total = ", total)
# Marks obtained are: [90, 80, 70, 60, 50]

# using sum() to do the total
marks = []
for i in range(5):
    m1 = int(input())
    marks.append(m1)
total = sum(marks)
print("Marks obtained are:", marks)
print("Total = ", total)

l1 = [90, 80, 70, 60, 50]
idx = 10
if len(l1) > idx:
    l1.pop(10)  # default is -1; pop works on index
print(l1)
if 90 in l1:
    l1.remove(90)  # remove works on value
print(l1)

if l1.count(90) > 0:
    l1.remove(90)
l2 = [90, 80, 70, 60, 50]
print(sum(l2))

l2 = [80, 90, 60, 50, 70]
l2.reverse()
print(l2)
l2.sort()
l2.sort(reverse=True)
print(l2)

# extend
l1 = [2, 4, 6]
l2 = [4, 5, 9]
l3 = l1 + l2
print(l3)
l1.extend(l2)  # l1 = l1 + l2
print(l1)

l3 = [2, 4, 6, 4, 5, 9]
l4 = l3         # assignment only creates a second name (alias) for the same list
l5 = l3.copy()  # shallow copy - an independent copy of the top-level list
print("l3 = ", l3)
print("l4 = ", l4)
print("l5 = ", l5)
l3.append(11)
l4.append(22)
l5.append(33)
print("After append:")
print("l3 = ", l3)  # l3 and l4 both show 11 and 22; l5 only shows 33
print("l4 = ", l4)
print("l5 = ", l5)

'''
Assignment: Write a program using List to read the marks of
5 subjects for 5 students and find the highest marks for:
i) Each Student
ii) Each Subject

e.g. [[44,77,66,55,88],[64,54,74,84,94],[99,66,44,77,55],[83,74,65,56,67],[91,81,71,61,51]]
Highest for Student 1: 88
2: 94
3: 99
4: 83
5: 91
Highest for Subject 1: 99
2: 81
3: 74
4: 84
5: 94
'''
all_marks = []
for i in range(5):
    students = []
    print("Marks for Student ", i + 1, ": ")
    for j in range(5):
        m = int(input("Marks obtained in Subject " + str(j + 1) + ": "))
        students.append(m)
    all_marks.append(students)
print("All marks = ", all_marks)
# [[44,77,66,55,88],[64,54,74,84,94],[99,66,44,77,55],[83,74,65,56,67],[91,81,71,61,51]]

#All marks = [[44, 77, 66, 55, 88], [64, 54, 74, 84, 94],
# [99, 66, 44, 77, 55], [83, 74, 65, 56, 67], [91, 81, 71, 61, 51]]
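The snippet above only collects the marks; a minimal sketch for the "highest" part of the assignment, using the sample data:

# highest per student (row max) and per subject (column max) - a sketch
all_marks = [[44,77,66,55,88],[64,54,74,84,94],[99,66,44,77,55],
             [83,74,65,56,67],[91,81,71,61,51]]
for i, student in enumerate(all_marks, start=1):
    print(f"Highest for Student {i}: {max(student)}")
for j in range(len(all_marks[0])):
    column = [row[j] for row in all_marks]
    print(f"Highest for Subject {j+1}: {max(column)}")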


l3 = [2, 4, 6, 4, 5, 9]
l3.clear()
print(l3)

# Tuple
t1 = ()
print(type(t1))
t1 = (100,)
print(type(t1))
t1 = (100, 200)
print(type(t1))

######### DICTIONARY ##########
# dictionary -> mutable collection of key:value pairs (insertion-ordered since Python 3.7)
# dict = {key: value}
dict1 = {}
print(type(dict1))
dict1 = {1: 500, "Two": "Sachin Tendulkar", "some index": 999, "City": "Mumbai", 10: 1000}
# keys must be unique - a duplicate key overwrites the earlier value
print(dict1[1])

# DICTIONARY
dict1 = {}
print(type(dict1))

all_info = {}
# all_info = {"": [], "": [], "": []}
for i in range(3):
    rno = input("Enter the Roll No. ")
    marks = []
    for j in range(3):
        m = float(input("Enter the marks: "))
        marks.append(m)
    t_dict = {rno: marks}
    all_info.update(t_dict)

print("All Info: \n", all_info)

t1 = (3, 6, 9)
a, b, c = t1
#a, b = t1        # ValueError: too many values to unpack (expected 2)
#a, b, c, d = t1  # ValueError: not enough values to unpack (expected 4, got 3)


all_info = {'101': [3.0, 4.0, 5.0], '102': [7.0, 8.0, 9.0], '103': [4.0, 5.0, 6.0],
            '104': [2.0, 4.0, 6.0], '105': [3.0, 6.0, 9.0]}
chk_rno = input("Enter the roll number for information: ")

print(all_info.keys())
print(all_info.values())
print(all_info.items())

for k in all_info.items():
    if k[0] == chk_rno:
        print(k[1])

found = False
for k, v in all_info.items():
    if k == chk_rno:
        print(v)
        found = True

if not found:
    print("Sorry, roll no is not in the list")

chk_rno = input("Enter the roll number to delete: ")
for k in all_info.items():
    if k[0] == chk_rno:
        all_info.pop(chk_rno)
        break  # break right away - you cannot keep iterating after changing the dict's size
print("After removal: \n", all_info)
all_info.popitem()  # removes the last inserted item
print("After popitem: \n", all_info)

all_info = {'101': [3.0, 4.0, 5.0], '102': [7.0, 8.0, 9.0], '103': [4.0, 5.0, 6.0],
            '104': [2.0, 4.0, 6.0], '105': [3.0, 6.0, 9.0]}
# copy
all_info1 = all_info         # alias - both names point to the same dict
all_info2 = all_info.copy()  # shallow copy - an independent top-level dict
print("First:")
print(all_info)
print(all_info1)
print(all_info2)
all_info.update({9: "Orange"})
all_info1.update({10: "Apple"})
all_info2.update({11: "Mango"})
print("Second:")
print(all_info)   # all_info and all_info1 both show Orange and Apple; all_info2 only shows Mango
print(all_info1)
print(all_info2)

### SETS: linear unordered collection of unique values

set1 = {3, 5, 6, 8}
print(type(set1))

list1 = ['Mango', 'Mango', 'Mango', 'Mango', 'Mango']
set1 = {'Mango', 'Mango', 'Mango', 'Mango', 'Mango'}
print(len(list1))
print(len(set1))  # a set will have unique values
list1 = ["A", "B", "C", "D", "E"]
list2 = ["C", "D", "A", "B", "E"]
set2 = {"C", "D", "A", "B", "E"}
print(set2)  # order doesn't matter in a set
# sets are used for membership tests and set algebra
set2 = {"C", "D", "A", "B", "E"}
set1 = {"C", "D", "A", "F", "G"}

# frozenset is the immutable version of set
# sets are mutable
set1.add("Z")
set3 = {"P", "Q"}
set2.update(set3)
print("Set1: ", set1)
print("Set2: ", set2)
set2.pop()  # removes an arbitrary element
print("Set2: ", set2)
set2.discard("Q")  # discard() does not raise an error if pop() already removed "Q"

# Operations on sets: union, intersection, difference, symmetric difference
print("Set1: ", set1)
print("Set2: ", set2)
# union - combining all the elements from the sets
print(set1.union(set2))
print(set1 | set2)

# intersection: only the common elements
print(set1.intersection(set2))
print(set1 & set2)

# difference
print(set1 - set2)
print(set2 - set1)
print(set1.difference(set2))

# symmetric difference
print(set1 ^ set2)
print(set2 ^ set1)
print(set1.symmetric_difference(set2))

set1.union(set2)                # returns a new set; set1 itself is unchanged
set1.intersection_update(set2)  # set1 is updated with the output values
set1.difference_update(set2)
set1.symmetric_difference_update(set2)
set1.issubset(set2)

# FUNCTIONS

# inbuilt functions: print(), input(), ...
# user defined functions

# just created a function called myquestions()
# required positional arguments
def myquestions(choice, a):
    print("Choice =", choice, "and A =", a)
    if choice == 1:
        print("Whats your name?")
        print("What are you doing here?")
        print("When will you be done?")
    else:
        print("Whats your Work here?")
        print("What are your interests?")
        print("Why dont you stand out?")
    return "Job Done"

myquestions(1, 5)  # passing while calling
#print("My Out = ", myout)
print("Another time let me repeat")
myout = myquestions(0, 4)
print("My Out = ", myout)

str1 = "HELLO"
output = str1.lower()  # returns a value
print(output)

list1 = [5, 1, 9, 6, 2, 7, 3]
output2 = list1.reverse()  # doesn't return a value (reverses in place, returns None)
print(output2)
print(list1)

out = print("heloo")
print("OUT=", out)  # print() doesn't return a value (None)
out = input()       # input() returns a value
print("OUT=", out)


# non-required = default argument
# non-positional = keyword argument
def myquestions(choice, a=-99):  # default value
    print("Choice =", choice, "and A =", a)
    if choice == 1:
        print("Whats your name?")
        print("What are you doing here?")
        print("When will you be done?")
    else:
        print("Whats your Work here?")
        print("What are your interests?")
        print("Why dont you stand out?")
    return "Job Done"


myquestions(1, 5)  # passing while calling
myquestions(1)
myquestions(a=1, choice=5)  # keyword arguments

def myfavoritefruits(f1, *fruits, **interests):  # * and ** for variable number of arguments
    # * packs extras into a tuple, ** packs extra keyword arguments into a dictionary
    if len(fruits) == 0:
        print("My favorite fruit is", f1)
    else:
        #tuple(list(fruits).append(f1))
        print("My favorite fruits are:", tuple(set(fruits)))
    if len(interests) != 0:
        print("My other interests are: ")
        for i, j in interests.items():
            print("Favorite", i, "is", j)


myfavoritefruits("Guava", "Orange")
myfavoritefruits("Orange")
myfavoritefruits("mango", "mango", "mango", "apple", color="red", place="hyderabad", food="Biryani")
# required, default, keyword, variable length arguments

def myfun1(x, y):
    print(f"x={x}, y={y} and sum is {x+y}")


myfun1(10, 20)
a, b = 5, 6
myfun1(a, b)
x, y = 2, 8
myfun1(x=y, y=x)
myfun1(y, x)

# Scope of a variable
PI = 3.14

def myfun1(x, y):
    print(f"x={x}, y={y} and sum is {x+y}")
    global g
    print("Print G =", g)
    h = 70  # local variable - not visible outside the function
    g = 20  # rebinding the global g
    print("Local: ", g)

g = 100  # global variable
myfun1(10, 20)
#print("H =", h)  # NameError: h is local to myfun1

def my_fun2(n):
    all_values = []
    for i in range(n):
        all_values.append(input("Enter values: "))
    return all_values

val = my_fun2(10)
print(val)
# class and objects
# functions and variables - some belong to the class, some to specific objects

class Library:
    total_books = 0  # class level variable
    var2 = -1
    name = "ABC International School Library"

    def __init__(self, title, author, copies=1):  # auto called when object is created
        print("Adding books to the library")
        Library.total_books += 1
        self.total_copies = copies  # object level variable
        self.title = title
        self.author = author

    def display_detail(self):
        print("================================")
        print("Title \t\t Author \t\t\t Copies")
        print("--------------------------------")
        print(f"{self.title} {self.author} {self.total_copies}")
        print("================================")

    @classmethod
    def print_total(cls):
        print("Total book count is: ", cls.total_books)

# variables - need to be marked (via self) if object level; by default they are class level
# methods - need to be marked (@classmethod) if class level; by default they are object level

lib1 = Library("Python Programming", "Sachin Tendulkar", 10)  # __init__ will be called
lib2 = Library("Machine Learning", "Virat Kohli")             # __init__ will be called
lib3 = Library("SQL Programming", "Dhoni", 12)                # __init__ will be called
print(Library.total_books)  # calling using class name
print(lib1.total_books)     # calling using object
print(lib2.total_books)

print(Library.total_books)
print(lib1.total_books)
print(lib2.total_books)

print(lib1.total_copies)
print(lib2.total_copies)
print(lib2.author)
print(lib3.author)
lib1.display_detail()
lib1.print_total()
lib3.print_total()
Library.print_total()


class MathsOps:
    def __init__(self, a: int, b=1):
        self.num1 = a
        self.num2 = b
        self.__mul = -1  # double underscore (__) makes it PRIVATE
        self.div = -1
        self.square = self.num1 ** 2
        self._power = self.num1 ** self.num2  # a single underscore (_) marks a protected member

    def add(self):
        return self.num1 + self.num2

    def subtract(self):
        return self.num1 - self.num2

    def multiply(self):
        self.__mul = self.num1 * self.num2
        return self.__mul

    def divide(self):
        self.div = self.num1 / self.num2
        return self.div  # without this return, the caller would get None

m1 = MathsOps(100)
print(m1.add())
print(m1.subtract())
#print(m1.__mul)  # AttributeError: private members are not accessible outside the class
print("Multiply: ", m1.multiply())
division = m1.divide()
print("Division = ", division)
print(m1._power)  # the protected concept exists in Python only as a convention - it is not enforced
print(m1.square)
class CountryInfo:
    def __display1(self):  # private
        print("Country = India")

    def _display2(self):  # protected
        print("Country is India")

class University(CountryInfo):
    def __init__(self, name, year_estb):
        self.uname = name
        self.year_estb = year_estb

    def display(self):  # public
        print(f"The university {self.uname} was established in the year {self.year_estb}")

    def generateInfo(self):
        print("Sorry this is not implemented")

    def makeitready(self):
        print("Sorry this is not implemented")

class CityInfo:
    def display(self):
        print("City = Hyderabad")

class Professors(University, CityInfo):
    def __init__(self, name, univ, salary, year_estb):
        University.__init__(self, univ, year_estb)
        self.name = name
        self.salary = salary

    def printinfo(self):
        print(self.name, ": ", self.salary)
        University.display(self)
        CityInfo.display(self)

    def generateInfo(self):
        print("generating payslip")

class Students(CityInfo, University):
    def __init__(self, name, univ, marks, year_estb):
        University.__init__(self, univ, year_estb)
        self.name = name
        self.marks = marks

    def printinfo(self):
        print(self.name, ":", self.marks)
        University.display(self)
        CityInfo.display(self)

    def generateInfo(self):
        print("Marks are being generated")

u1 = University("ABC University", 1968)
u2 = University("XYZ Management University", 1998)
u2.display()
p1 = Professors("Sachin", "ABC University", 167000, 1968)
p1.printinfo()
s1 = Students("Virat", "XYZ Management University", 75, 1998)
s1.printinfo()

s1.display()
p1.display()
#p1.__display1()  # private members are not accessible outside the class
p1._display2()    # protected members are accessible by derived classes

ct1 = CityInfo()
ct1.display()

s1.generateInfo()  # forces the derived class to implement the method
p1.generateInfo()  # forces the derived class to implement the method

# Please do the program from:
# https://designrr.page/?id=206786&token=2174026775&type=FP&h=8743
# Page no. 154

#Android: STORY MANTRA:
# categories -> Python -> Python book
class CityInfo:
    def cityinfoprint(self):
        print("City is Hyderabad")

class University:
    def __init__(self, univ):
        self.univ = univ

    def printInfo(self):
        print("University is ", self.univ)

    def sampleM(self, num1, num2):
        print("Sample M 3")

class Student(University):
    def __init__(self, name, age, univ):
        #University.__init__(self, univ)
        super().__init__(univ)
        self.name = name
        self.age = age

    def printInfo(self):
        print("Name, Age & University :", self.name, self.age, self.univ)
        super().printInfo()
        CityInfo.cityinfoprint(self)

    def sampleM(self):
        '''Overrides University.sampleM with a different signature.'''
        print("Sample M 1")

s1 = Student("Sachin", 49, "Mumbai University")
s1.printInfo()
s1.sampleM()

def myfunction1(a=10, b=20):
    '''
    This is my sample function to do nothing but to show off
    :param a: int - length
    :param b: int - breadth
    :return: their sum
    '''
    return a + b

num1 = 78
num2 = 88
print(num1 + num2)          # operator overloading: + adds numbers
print([2, 46] + [5, 9, 8])  # operator overloading: + concatenates lists

print(input.__doc__)
print(print.__doc__)
print(len.__doc__)
print(myfunction1.__doc__)
from abc import ABC, abstractmethod

class Shape(ABC):  # in Python 3, inherit from ABC (the old __metaclass__ attribute has no effect)
    def __init__(self, shapeType):
        self.shapeType = shapeType

    @abstractmethod
    def area(self):
        pass

    @abstractmethod
    def perimeter(self):
        pass

class Rectangle(Shape):
    def __init__(self, length, breadth):
        Shape.__init__(self, "Rectangle")
        self.length = length
        self.breadth = breadth

    def perimeter(self):
        return 2 * (self.length + self.breadth)

    def area(self):
        return self.length * self.breadth

class Circle(Shape):
    def __init__(self, radius):
        Shape.__init__(self, "Circle")
        self.rad = radius

    def perimeter(self):
        return 2 * 3.14 * self.rad

    def area(self):
        return 3.14 * self.rad ** 2


r1 = Rectangle(10, 5)
print("Perimeter is", r1.perimeter())
print("Area is", r1.area())

c1 = Circle(5)
print("Perimeter is", c1.perimeter())
print("Area is", c1.area())

class Books:
    count = 0

    def __init__(self, title):
        self.title = title
        Books.count += 1

    @classmethod
    def totalcount(cls):
        print("Total titles in the library = ", cls.count)

    def __del__(self):
        print(f"The object with title {self.title} is getting destroyed. You cant use it again!")

b1 = Books("Python Programming")
b2 = Books("SQL Programming")
b3 = Books("Machine Learning")
b4 = Books("Tableau book")
print(b1.title)
b1.totalcount()
print(b3.title)
b3.totalcount()
b4.totalcount()
print(b4.title)

del b4
input()

#b4.totalcount()  # NameError: b4 no longer exists after del

def myfun1(a, b):
    print(a + b)
    return a + b

def myfun2():
    print("I do nothing")

class Sample:
    def __init__(self):
        print("Object created")

    def printinfo(self):
        print("Some output here")

if __name__ == "__main__":
    myfun1(99, 87)
    myfun1(99, 7)
    s1 = Sample()


# ===================== (the code above is presumably saved as p11.py; below is a separate file importing it)


#import p11 as RocketScience
from p11 import myfun1

#RocketScience.myfun2()
print(myfun1(5, 10))

import random
random.random()

########### FILE HANDLING
# modes of file handling: r (read), w (write - delete old content and write new), a (append)
## r+ (read & write), w+ (write & read), a+ (append & read)

filename = "abc.txt"

fileobj = open(filename, "r+")
content = '''This is a sample content
story about a king
and a queen
who lived in a jungle
so I am talking about
Lion the king of the jungle'''

fileobj.write(content)
content2 = ['THis is sample line 1\n', 'line 2 content \n', 'line 3 content \n']
fileobj.writelines(content2)
fileobj.seek(20)
output = fileobj.read()
print("Content from the file:\n", output)
fileobj.seek(10)
output = fileobj.read()
fileobj.seek(10)
content3 = fileobj.read(15)
content4 = fileobj.readline()
print("Content from the file:\n", output)

fileobj.seek(0)
content5 = fileobj.readlines()
print("Content from the file:\n", content5)

fileobj.close()

## Exception handling
#SyntaxError: print("Hello)
#logical error: you make an error in the logic - very difficult to find
#runtime errors (exceptions):

num1 = int(input("Enter a number: "))
# entering text here raises a ValueError exception

# Exceptions
a = "k"
b = 10
c = -1
try:
    c = b / d  # d is not defined, so this raises NameError
except ZeroDivisionError:
    print("Denominator is zero hence stopping the program from executing")
except TypeError:
    print("Invalid numbers, hence exiting...")
except NameError:
    print("One of the values has not been defined. Try again")
except Exception:
    print("Not sure but some error has occurred, we need to stop")
else:
    print("Answer is", c)
finally:
    print("We have completed division process")
# custom exception
class InvalidLength(Exception):
    def __init__(self, value=0):
        self.value = value

length, breadth = -1, -1
while True:
    try:
        length = int(input("Enter length: "))
    except ValueError:
        print("Invalid number, try again...")
    else:
        #assert length > 0, "Rectangle with this dimension is not possible"
        if length <= 0:
            try:
                raise InvalidLength
            except InvalidLength:
                print("Invalid value for Length hence resetting the value to 1")
                length = 1
        break

while True:
    try:
        breadth = int(input("Enter breadth: "))
    except ValueError:
        print("Invalid number, try again...")
    else:
        assert breadth > 0, "Rectangle with this dimension is not possible"
        break

area = length * breadth
print("Area of the rectangle is", area)
## ### ##
# datetime, date, time

from datetime import datetime, timedelta
import time
from pytz import timezone

curr_time = datetime.now()
print("Current time is", curr_time)
print("Current time is", curr_time.strftime("%d / %m / %Y"))
print(curr_time.year, curr_time.day, curr_time.date())
for i in range(5):
    time.sleep(1)  # sleep for 1 second
    print("Time left:", 5 - i, "seconds")
print("Good Morning")
print("Current time is", datetime.now())
print("Date 2 days back was", (curr_time - timedelta(days=2)).strftime("%d/%m/%Y"))
print("UTC Time is", datetime.now(timezone('UTC')))
print("US Eastern Time is", datetime.now(timezone('US/Eastern')))
print("India Time is", datetime.now(timezone('Asia/Kolkata')))

Download link:

https://dev.mysql.com/downloads/installer/

-- Constraints: Primary Key, Foreign Key, Not Null, Unique, Check, Default
Create table employees.Employees(
    EMPID INT Primary Key auto_increment,
    FNAME VARCHAR(55) NOT NULL,
    LNAME VARCHAR(55),
    DOB DATE,
    EMAIL VARCHAR(35) unique,
    PHONE VARCHAR(11),
    DOJ DATE Default('2021-07-20'),  -- YYYY-MM-DD
    SALARY FLOAT(2),
    DEPTID INT,
    Foreign Key (DEPTID) References Departments(DID),
    Constraint U_UC_LN_DOB Unique(LNAME, DOB),
    CHECK(SALARY > 0.0)
);

-- Modifying a table - the table is already created and in use - ALTER TABLE
-- ADD or DELETE (DROP) or MODIFY or RENAME a column
-- DDL command to delete is DROP
-- DDL command to modify is ALTER

use employees;

ALTER table employees ADD BONUS Float(3);

ALTER table employees ADD dummy Float(3);

ALTER TABLE Employees DROP COLUMN dummy;

ALTER TABLE EMPLOYEES MODIFY COLUMN BONUS float(4);

ALTER TABLE EMPLOYEES RENAME COLUMN BONUS to BONUS_PCT;

Python and Machine Learning Course May 2023

print('5 + 3 =', 5 + 3)

pi = 3.1  # variable
g = 9.8
radius = 15
# area of a circle = pi * r squared
print("Area of a circle =", 3.1 * radius * radius)

# Write a program to find area and perimeter of a rectangle by taking length and breadth as input
length = 21
breadth = 38
area = length * breadth
perim = 2 * (length + breadth)
print("Area of rectangle =", area)
print("Perimeter of rectangle =", perim)

# Write a program to find area and perimeter of a square
side = 35
area = side * side
perim = 4 * side
print("Area of square =", area)
print("Perimeter of square =", perim)

# Find total and average of five numbers
num1 = 78
num2 = 82
num3 = 79
num4 = 91
num5 = 59
total = num1 + num2 + num3 + num4 + num5
avg = total / 5
print("Total = ", total, "Average is", avg)

# Find total and average of five numbers
num1, num2, num3, num4, num5 = 78, 82, 79, 91, 59
total = num1 + num2 + num3 + num4 + num5
avg = total / 5
print("Total = ", total, "Average is", avg)

# find the value of 2x^2 - 10x + 30, when x = 8
x = 8
y = 2 * x * x - (10 * x) + 30  # expression
print("Value of y =", y)

# different types
# basic data types - they can store only one value at a time
# integer (int) - numbers without decimal values
num1 = 5
num2 = 0
num3 = -99  # <class 'int'>
print("1. ", type(num1))  # type(num1) is evaluated first and then print() is called

# float (float) - decimal values
num1 = 5.0
num2 = 0.9
num3 = -99.1  # <class 'float'>
print("2. ", type(num1))

# string (str)
var1 = "Hello"
var2 = 'Good Evening'
var3 = '''Hi there'''
var4 = """Python learning"""
print(type(var1), type(var2), type(var3), type(var4))

# boolean (bool) - True & False
var1 = True
var2 = False
print(type(var1), type(var2))
# complex (complex): for working with square roots of negative numbers
# square root of -25 = 5i, written as 5j in Python
var1 = 5j
print(type(var1))

## Operations

# Arithmetic operations (maths op)

# Relational operations

# Logical operations (and/or/not)

DS-Weekend-022023

print("Hello", 5 + 4, end=" ")
print("6+3=", 6 + 3, end="\n")
print("How are you?", end="\n")

print("This line \n for new line")
# integer - int
a = 5
print(type(a))
# string - str

# boolean - bool

# float

# complex

a = 3 + 5j
print(a * a)  # -16 + 30j

#####
#1. WAP to find area and perimeter of a rectangle
# input: what you give to the computer
# process: what is the ask from the computer
# output: what you get back in return
# input: length & breadth
# process: area = length * breadth and perimeter = 2 * (length + breadth)
# output: print the answers (area and perimeter) on to the screen

length = 25
breadth = 15
area = length * breadth
perimeter = 2 * (length + breadth)
print("Area = ", area)
print("Perimeter = ", perimeter)

length, breadth, name = 25, 15, "Sachin"  # multiple assignment (unpacking)
area = length * breadth
perimeter = 2 * (length + breadth)
print("Area = ", area, "and Perimeter = ", perimeter)

# unpack: one value on the right goes to each variable on the left

# input() - to get an input value from the user
length = input("Enter the length value = ")  # input() always returns str
length = int(length)  # explicit conversion to int
print("Data type of length is ", type(length))
breadth = int(input("Enter the breadth value = "))
print("Data type of breadth is ", type(breadth))
area = length * breadth
perimeter = 2 * (length + breadth)
# f-string - format string
print(f"A rectangle with length {length} and breadth {breadth} has an area of {area} and perimeter of {perimeter}")
# f-string format specifiers for float and str
total_cost = 100
num_units = 33
print(f"Total cost came to Rs {total_cost} for {num_units} pens so the cost of each pen is Rs {total_cost/num_units:.2f}")
print(f"{3.69:.1f}")

player = "Virat"
position = "captain"
country = "India"
print(f"Player {player:.<12} is {position:X^15} of {country:->12} team")

player = "Mbanwaweba"
position = "wicket-keeper"
country = "Zimbabwe"
print(f"Player {player:<12} is {position:^15} of {country:>12} team")
# Arithmetic operations
val1 = 7
val2 = 3
print(val1 + val2)   # addition
print(val1 - val2)   # subtraction
print(val1 * val2)   # multiplication
print(val1 / val2)   # division
print(val1 % val2)   # remainder
print(val1 // val2)  # integer division
print(val1 ** val2)  # power
print(int(10 ** (1/2)))

# binary value
print(bin(10))    # 0b1010
print(hex(0o12))  # 0xa
print(oct(10))    # 0o12
print(int(0b1010))

# bitwise operators: << >> & |
# shift
print(10 << 3)  # 0b1010 shifted left 3 places = 0b1010000 = 80
print(int(0b101000))
print(10 >> 2)

# Relational or conditional: > < <= >= == !=
val1 = 10
val2 = 20
val3 = 10
print(val1 > val2)   # is val1 greater than val2? - False
print(val1 >= val2)  # is val1 greater than or equal to val2? - False
print(val1 >= val3)  # is val1 greater than or equal to val3? - True
print(val1 > val3)   # is val1 greater than val3? - False
print(val1 < val3)   # False
print(val1 <= val3)  # True
print("val1 == val3: ", val1 == val3)
print(val1 != val3)
print("A40" > "90")  # strings compare character by character ('A' > '9')

# Logical operators: and or not
# input and output are both bool values
# and - both have to be True to result in True (otherwise it's False)
print("T and T: ", True and True)
print("T and F: ", True and False)

# or (+)
print("F or T : ", False or True)
# just as * binds tighter than + in 2+5*3, `and` binds tighter than `or`
val1 = 10
val2 = 20
val3 = 10
print(val1 <= val3 or val1 > val3 and val1 > val2)  # True - the `and` part is evaluated first
# T

print(bin(15))
print("15 & 10 = ", 15 & 10)
print("15 | 10 = ", 15 | 10)

print(f"val1 <= val3 or val1 > val3 and val1 > val2 = {val1 <= val3 or val1 > val3 and val1 > val2}")
#print(f"True {and} False")  # keywords cannot appear bare inside {} in an f-string
l = int(input("Enter the value of length: "))
b = int(input("Enter the value of breadth: "))

# conditional statements check the value
if l > 0 and b > 0:
    area = l * b
    perimeter = 2 * (l + b)
    print(f"Rectangle with length {l} and breadth {b} has area of {area} and perimeter of {perimeter}")

# another set of if condition
if l > 0 and b > 0:
    area = l * b
    perimeter = 2 * (l + b)
    print(f"Rectangle with length {l} and breadth {b} has area of {area} and perimeter of {perimeter}")
else:
    print("Sides of the rectangle don't look valid")

#
# Check if a number is positive and, if positive, check if it is divisible by 5
value = 50
if value > 0:
    print(f"{value} is positive", end=" ")
    if value % 5 == 0:
        print("and it's divisible by 5")
elif value < 0:
    print(f"{value} is negative")
else:
    print(f"{value} is neither positive nor negative")

marks1, marks2, marks3, marks4, marks5 = 96, 96, 96, 96, 95

# assign grade on the basis of avg marks:
# avg >= 90: A, 75-89: B, 60-74: C, 50-59: D, <50: E
avg = (marks1 + marks2 + marks3 + marks4 + marks5) / 5
print("Average: ", avg)
if avg >= 90:
    print("Grade A")
    #print("Result: You have Passed")
elif avg >= 75:
    print("Grade B")
    #print("Result: Passed")
elif avg >= 60:
    print("Grade C")
elif avg >= 50:
    print("Grade D")
else:
    print("Grade E")
    print("Result: Failed")

#
print("===================")
flag = 0  # didn't win dean's award
if avg >= 50:
    #print("Result: You've passed!")
    if avg >= 90:
        print("Grade A")
        if avg >= 95:
            flag = 1
    elif avg >= 75:
        print("Grade B")
    elif avg >= 60:
        print("Grade C")
    else:
        print("Grade D")
    print("Result: You've passed!")
else:
    print("Result: Sorry You've failed!")
    print("Grade E")
if flag == 1:
    print("You win the special dean's award")

## checking the greater number between 2 values
val1, val2 = 40, 20

if val1 > val2:
    print(f"{val1} > {val2}")
elif val1 < val2:
    print(f"{val2} > {val1}")
else:
    print(f"{val2} = {val1}")

#########
val1, val2, val3 = 50, 90, 20

if val1 > val2:
    #print(f"{val1} >= {val2}")
    if val1 > val3:
        if val3 > val2:
            print(f"{val1} >= {val3} >= {val2}")
        else:
            print(f"{val1} >= {val2} >= {val3}")
    else:
        print(f"{val3} >= {val1} >= {val2}")
else:
    #print(f"{val2} >= {val1}")
    if val2 > val3:
        if val1 > val3:
            print(f"{val2} >= {val1} >= {val3}")
        else:
            print(f"{val2} >= {val3} >= {val1}")
    else:
        print(f"{val3} >= {val2} >= {val1}")


# LOOPS - to execute a block of code multiple times

# range(a,b,c): a = start (inclusive), b = end (exclusive), c = step
# range(5,30,5): 5,10,15,20,25
# range(a,b): c defaults to 1
# range(4,10): 4,5,6,7,8,9
# range(b): a defaults to 0, c defaults to 1
# range(4): 0,1,2,3
# FOR loop - when we know how many times to run

print("i" in "India")  # True
print("A" in "India")  # False because capital A is not in "India"

for counter in range(5, 10, 3):
    print("In For Loop:", counter)

for counter in "India":
    print("In For Loop:", counter)

# Run a loop 5 times:
for i in range(5):
    print("i = ", i)

# a way of printing even numbers up to 20
for i in range(0, 21, 2):
    print(i, end=", ")
print()

for i in range(1, 101):
    if i % 5 == 0:
        print(i, end=", ")
print()

# WHILE loop
ch = "n"
# entry controlled loop
while ch == "y":  # while executes only if the condition is True
    print("How are you?")
    ch = input("Enter y to continue, any other key to stop: ")

# exit controlled loop
while True:
    print("I am fine")
    ch = input("Enter n to stop, any other key to continue: ")
    if ch == "n":
        break
###
'''
* * * * *
* * * * *
* * * * *
* * * * *
* * * * *
'''

for j in range(5):
    for i in range(5):
        print("*", end=" ")
    print()

'''
*
* *
* * *
* * * *
* * * * *
'''

for j in range(5):
    for i in range(j + 1):
        print("*", end=" ")
    print()

'''
* * * * *
* * * *
* * *
* *
*
'''
num = 10
for j in range(num):
    for i in range(num - j):
        print("*", end=" ")
    print()

'''
        *
      * *
    * * *
  * * * *
* * * * *
'''
for j in range(5):
    for i in range(5 - j):
        print(" ", end="")
    for i in range(j + 1):
        print("*", end=" ")
    print()


# Multiplication table
for j in range(1, 11):
    for i in range(1, 11):
        print(f"{i:<2} * {j:<2} = {j*i:<2}", end=" ")
    print()

# match and case
ch = input("Enter your favorite programming language: ")
match ch:
    case "Python":
        print("You are on the Data Scientist track")
    case "Java":
        print("You are on the Mobile App Developer track")
    case "Javascript":
        print("You are on the Web Developer track")

# Program to take input marks and find the avg
ch = "y"
sum = 0
counter = 0
while ch == "y":
    marks = int(input("Enter marks: "))
    counter += 1
    sum += marks  # sum = sum + marks
    ch = input("Do you have more marks to add? y for yes: ")
avg = sum / counter
print(f"Total marks = {sum} and Average marks = {avg}")

#
# guessing number game - human v computer
import random
num = random.randint(1, 100)  # both start and end are inclusive
attempt = 0
while True:
    val = int(input("Guess the number (1-100): "))
    if val < 1 or val > 100:
        print("Invalid number!")
        continue  # takes you back to the beginning of the loop
    attempt += 1
    if val == num:
        print(f"You have guessed it correctly in {attempt} attempts")
        break  # throws you out of the loop
    elif val < num:
        print("Incorrect! Your guess is low")
    else:
        print("Incorrect! Your guess is high")

##
# guessing number game - computer v computer
import random
num = random.randint(1, 100)  # both start and end are inclusive
attempt = 0
start, end = 1, 100
while True:
    val = random.randint(start, end)  # int(input("Guess the number (1-100): "))
    if val < 1 or val > 100:
        print("Invalid number!")
        continue
    attempt += 1
    if val == num:
        print(f"You have guessed it correctly in {attempt} attempts")
        break
    elif val < num:
        print("Incorrect! Your guess is low")
        start = val + 1  # guess a higher number next
    else:
        print("Incorrect! Your guess is high")
        end = val - 1  # guess a lower number next

### Using IF condition in one single line - one line condition
# Ternary operator: the condition logic should not be more than 1 line

val1, val2 = 30, 40

var1 = val1 if val1 > val2 else val2
print("1. Higher number is ", var1)

var1 = "val1 is higher" if val1 > val2 else "val2 is higher"
print("2. Message: ", var1)

### One line for loop
# square the values greater than 5 and cube the values for the others
for i in range(10):
    if i > 5:
        val = i ** 2
    else:
        val = i ** 3
    print(val)

# The above code can be implemented in one line:
print("Using one line loop and condition:")
for i in range(10): print(i**2) if i > 5 else print(i**3)

Assignment Programs

Exercise 1: Write a program in Python to display the Factorial of a number.

Exercise 2: Write a Python program to find those numbers which are divisible by 7 and multiples of 5, between 1500 and 2700 (both included).

Exercise 3: Write a Python program to reverse a number.

Exercise 4: Write a program to print n natural number in descending order using a while loop.

Exercise 5: Write a program to display the first 7 multiples of 7.

Exercise 6: Write a Python program to convert temperatures to and from Celsius and Fahrenheit.

[ Formula: C/5 = (F-32)/9, where C = temperature in Celsius and F = temperature in Fahrenheit ]

Expected Output :

60°C is 140 in Fahrenheit

45°F is 7 in Celsius
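A minimal sketch for Exercise 6 (assuming the program simply reads one value for each direction):

# Celsius <-> Fahrenheit conversion - a sketch using C/5 = (F-32)/9
c = float(input("Enter temperature in Celsius: "))
print(f"{c}°C is {c * 9 / 5 + 32:.0f} in Fahrenheit")
f = float(input("Enter temperature in Fahrenheit: "))
print(f"{f}°F is {(f - 32) * 5 / 9:.0f} in Celsius")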


Exercise 7: Write a Python program that iterates the integers from 1 to 50. For multiples of three print “Fizz” instead of the number and for multiples of five print “Buzz”. For numbers that are multiples of three and five, print “FizzBuzz”.

Sample Output :

fizzbuzz

1

2

fizz

4

buzz
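A sketch for Exercise 7 (note the sample output above appears to start counting at 0, which is why "fizzbuzz" comes first; the sketch below follows the 1-to-50 wording):

# FizzBuzz - a sketch
for n in range(1, 51):
    if n % 15 == 0:
        print("fizzbuzz")
    elif n % 3 == 0:
        print("fizz")
    elif n % 5 == 0:
        print("buzz")
    else:
        print(n)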


Exercise 8: Write a Python program to print the alphabet pattern 'A'.

Expected Output:

  ***
 *   *
 *   *
 *****
 *   *
 *   *
 *   *


Exercise 9: Write a Python program to print the alphabet pattern 'G'.

Expected Output:

  ***
 *   *
 *
 * ***
 *   *
 *   *
  ***


# String
name = "Sachin" \
       "Cricket God"
name1 = 'Virat'
name2 = '''Rohit
Captain of Team India'''
name4 = """Dhoni
won the world
cup for India
in multiple formats of the
game"""

print(name)
print(name2)
print(name4)

# substring - subset of data - indexing
name = "Sachin Tendulkar"
print(name[0])
print(name[1])
print("SCI => ", name[0] + name[2] + name[4])
print("First 3 characters: ", name[0:3], name[:3])  # nothing on the left of : means start from zero
print("chin => ", name[2:6])
print("kar => ", name[13:16])
# len()
print("Total characters in name is", len(name))
print(name[15], name[len(name)-1], name[-1])
print("First character using backward indexing: ", name[-16], name[-len(name)])

print("kar using backward indexing => ", name[-3:])  # leaving the right side blank means go up to the end

fname = "Sachin"
age = 49
print("First name of the player is " + fname)
print("Age = " + str(age))

for i in range(5):
    print("* " * (5 - i))

# Data structures
# list, tuple, dictionary, sets

print("Print all characters: ", name, name[:])

# Strings are immutable: TypeError: 'str' object does not support item assignment
val = "HELLO"
val = "hELLO"  # this rebinds val to a new string; it does not modify the old one

txt = "abade"
for i in txt:
    print("* " * (5 - txt.index(i)))

## is...() methods - return either True or False
txt = "highway route number. I AM driving there. since one month"
print("txt.isupper: ", txt.isupper())
print("txt.islower: ", txt.islower())
print("Alphanumeric: ", txt.isalnum())
print("Alphabetic:   ", txt.isalpha())
print("Digits:       ", txt.isdigit())

print(txt.upper())
print(txt.lower())
print(txt.title())
print(txt.capitalize())

# split()
print(txt.split('o'))
out = ['highway', 'route', 'number.', 'I', 'AM', 'driving', 'there.', 'since', 'one', 'month']
print("====")
out2 = " ".join(out)
print(out2)

txt = "I am driving on highway route on number fifteen."
print(txt.lower().count("i"))
start_pos = 0
for i in range(txt.lower().count("i")):
    print(txt.lower().index("i", start_pos), end=", ")
    start_pos = txt.lower().index("i", start_pos) + 1
print()

print(txt.replace("on", "over", 1))
print(txt)
# strings are immutable

# LIST
l1 = [2, 3, 5.5, True, "Hello", [4, 8, 12]]
print(len(l1))

print(type(l1))
print(type(l1[0]))
print(type(l1[-1]))

print(l1[-2].upper())
l2 = [False, 5, 115]
l3 = l1 + l2
print(l3 * 3)

# lists are mutable
l2[1] = "How are you?"
print(l2)

for i in l2:
    print(i, end=", ")
print("\n\n### Methods")
### Methods
l1 = [2, 4, 6, 8, 10, 2, 14]
print(l1.index(2, 2))
print(l1.count(2))
print("1. Current list = ", l1)
print(l1.pop(2))      # removes the element at the index
print("2. Current List = ", l1)
print(l1.remove(14))  # removes by value (returns None)
print("3. Current List = ", l1)
l1.append(21)         # adds at the end of the list
print("4. Current List = ", l1)
l1.insert(3, 31)      # takes position and the value
print("5. Current List = ", l1)
# below, creating 2 new lists based on l1 values
l2 = l1         # alias - a second name for the same list
l3 = l1.copy()  # shallow copy
print("1. L1 = ", l1)
print("1. L2 = ", l2)
print("1. L3 = ", l3)
l1.append(42)
l2.append(52)
l3.append(62)
print("2. L1 = ", l1)
print("2. L2 = ", l2)
print("2. L3 = ", l3)
l1.extend(l3)
print(l1)
l1.reverse()
print(l1)
l1.sort()  # increasing order
print(l1)
l1.sort(reverse=True)  # decreasing order
print(l1)

# definition: List is an ordered linear mutable collection
subject_list = ['Maths', 'Science', 'English', 'Social Science', 'German']
marks_list = []
sum = 0
for i in range(5):
    #marks = int(input("Enter marks in Subject " + str(i+1) + ": "))
    marks = int(input("Enter marks in Subject " + subject_list[i] + ": "))
    sum += marks
    marks_list.append(marks)
print("Marks obtained in each subject = ", marks_list, "and total marks = ", sum)

sum = 0
for i in marks_list:
    sum += i
print("Total marks = ", sum)



# Reduce, Map, Filter => later, after functions
# TUPLE:
# definition: Tuple is an ordered linear immutable collection
t1 = ()
t1 = (5,)
t1 = (5, 4)
t1 = (5, 6, 7, 8.0, "9")
print(type(t1))
# tuples can be converted to lists and vice-versa
t1 = list(t1)
t1.append(45)
t1 = tuple(t1)
# tuples are faster than lists for reading

# packing
t1 = (3, 30, "Hello")  # packing
# unpacking
a, b, c = t1
print(a, b, c)

print((2, 3, 99) > (3, 1))  # compares member by member until it finds the greater value

# List - linear ordered mutable collection
# dictionary - insertion-ordered collection of key:value pairs

dict1 = {}
print(type(dict1))
dict1 = {"name": "Sachin", "city": "Mumbai", "Runs": 12345, "name": "Tendulkar"}
print(dict1)  # the duplicate "name" key keeps only the last value
print(dict1['name'])
dict2 = {("Team IPL", "Team Ranji"): "Mumbai Indians"}  # a tuple can be a key
dict1.update(dict2)
print(dict1)

for i in dict1.keys():
    print(i, dict1[i])
print("Iterating through values:")
for i in dict1.values():
    print(i)
print("Iterating through items (key, value):")
for i, j in dict1.items():
    print(i, j)
for i in dict1.items():
    print(list(i))

print("Printing the values: ")
print("Keys  :", dict1.keys())
print("Values:", dict1.values())
print("Items :", dict1.items())

dict1.pop('Runs')
print("After pop: ", dict1)
dict2 = dict1         # alias - a second name for the same dict
dict3 = dict1.copy()  # shallow copy
t_dict = {"Country": "India"}
dict1.update(t_dict)
print("Printing all 3 Dictionaries: ")
print("Dict1: ", dict1)
print("Dict2: ", dict2)  # shows Country too - dict2 is the same object as dict1
print("Dict3: ", dict3)
#dict1['name'] = "Tendulkar"
t_dict = {"name": "Tendulkar"}
dict1.update(t_dict)
for i in range(2):
    print("Dict1 before popitem: ", dict1)
    dict1.popitem()
    print("Dict1 after popitem: ", dict1)

'''
Write a program to input Roll no. and marks of 3 students in 3 subjects
{101: [], 102: []}
'''
dict_marks = {}

for i in range(3):
    t_dict = {}
    roll = int(input("Enter the Roll number: "))
    t_list = []
    for j in range(3):
        marks = int(input("Enter the marks for Roll no. " + str(roll) + " in Subject " + str(j + 1) + " :"))
        t_list.append(marks)
    t_dict = {roll: t_list}
    dict_marks.update(t_dict)
print("Final data:", dict_marks)

'''
Assignment program:
from the below dictionary find the topper for each subject:
{100: [55, 66, 78], 102: [90, 87, 54], 105: [67, 76, 87]}

e.g. Highest in Subject 1: 102 with 90 marks
Highest in Subject 2: 102 with 87 marks
Highest in Subject 3: 105 with 87 marks
'''
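A minimal sketch for this assignment (iterating subject by subject and using max over the dictionary keys):

# topper per subject - a sketch
data = {100: [55, 66, 78], 102: [90, 87, 54], 105: [67, 76, 87]}
for j in range(3):
    top_roll = max(data, key=lambda rno: data[rno][j])
    print(f"Highest in Subject {j+1}: {top_roll} with {data[top_roll][j]} marks")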
# Example 2:
master_dict = {}

for k in range(2):
    t_dict = {"Name": 0, 'Age': 0, 'Email': 0, "Address": 0}
    for i in t_dict.keys():
        j = input("Enter the client's " + i + " :")
        t_dict[i] = j
    master_dict.update(t_dict)  # note: the keys are the same each time, so each client overwrites the previous one
print("Master Dictionary information: \n", master_dict)

##################################################
# SETS
set1 = {}
print(type(set1))  # <class 'dict'> - {} creates an empty dict; use set() for an empty set
set1 = {"Apple", "Banana", "Mango", "Grapes", "Guava"}
print(type(set1))
set2 = {"Mango", "Grapes", "Guava", "Apple", "Banana"}

set1 = {1, 2, 3, 4, 5, 6}
set2 = {1, 3, 5, 7, 9}
print("Union:")
print(set1 | set2)
print(set1.union(set2))
print("Intersection:")
print(set1 & set2)
print(set1.intersection(set2))
print("Minus - difference")
print(set1 - set2)
print(set2.difference(set1))  # set2 - set1
print("Symmetric Difference")
print(set1 ^ set2)
print(set1.symmetric_difference(set2))
print("Set1 before pop:", set1)
set1.pop()
print("Set1 after pop:", set1)

list1 = [1, 2, 3, 4, 1, 2, 3, 1, 2, 1]
list1 = list(set(list1))  # a quick way to deduplicate a list
print(type(list1))
print(list1)
#################### FUNCTIONS ##############################
def myquestions():
    '''this is a sample function to demonstrate how a function works;
    it doesn't take any parameter nor does it return anything
    - written on 22nd April'''
    print("whats your name?")
    print("how are you?")
    print("Where are you going?")

def mycalc1(a, b, c):
    print("MY CALC 1")
    print(f"A, B and C values are {a},{b},{c}")
    total = a + b + c
    print(total)

def mycalc2(a, b=0, c=9):
    print("MY CALC 2")
    print(f"A, B and C values are {a},{b},{c}")
    total = a + b + c
    print(total)

def myfunc1(a, *b, **c):
    print("A = ", a)
    print("B = ", b)
    print("C = ", c)


myquestions()
print(myquestions.__doc__)
print("Doc for print")
print(input.__doc__)
# doc string - the first statement inside the function, written as a (multi-line) string
print("\n\n\n")
mycalc1(10, 50, 90)     # required positional arguments
mycalc2(3, 7, 19)       # overriding the default arguments
mycalc1(c=1, a=7, b=3)  # use keywords to avoid positional order

print("Calling variable length arguments:")
myfunc1(10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 4, 5, 66, 44, 333, 33, name="Sachin", age=43, runs=19890)

def myfun1(a, b, c):
    global x
    print("X = ", x)
    x = 5
    print("X = ", x)


x = 50
myfun1(5, 10, 15)
print("in Main x = ", x)
###
def isPrime(n):
    check = True
    for i in range(2, n // 2 + 1):
        if n % i == 0:
            check = False
            break
    return check

check = isPrime(51)
if check:
    print("51 is prime")
else:
    print("51 is not prime")

# generate the list of prime numbers between 1000 and 2000
for i in range(1000, 2001):
    out = isPrime(i)
    if out:
        print(i)
def myfunc1():
    """This is a sample function to see the working of a function"""
    print("What's your name?")
    print("How are you?")
    print("Where do you live?")

def myfunc2(a, b, c):  # required positional arguments
    print(f"Values of a, b and c are {a}, {b} and {c} respectively")
    total = a + b + c
    print("Total is ", total)

def myfunc3(a, b=0, c=0):  # a is required positional; b and c are not required (default)
    print(f"Values of a, b and c are {a}, {b} and {c} respectively")
    total = a + b + c
    print("Total is ", total)


def isPrime(n):
    '''isPrime is a function that takes a parameter n and
    checks whether it is a prime number or not'''
    prime = True
    for i in range(2, n // 2 + 1):
        if n % i == 0:
            prime = False
            break
    return prime

if __name__ == "__main__":
    myfunc1()
    print("-------------")
    print(myfunc1.__doc__)
    # doc string: a multi-line string and the first line in the function
    print(print.__doc__)
    print(int.__doc__)

    myfunc2(5, 10, 15)  # required positional
    print("Calling My Func3 below:")
    myfunc3(10, 20)
    myfunc3(10, 20, 30)

    n = 11
    out = isPrime(n)
    if out:
        print(n, "is a prime number")
    else:
        print(n, "is not a prime number")

    # I want to print a list of all the values that are prime between 100 and 500
    print("Printing list of prime numbers from 100 to 500:")
    for k in range(100, 501):
        if isPrime(k):
            print(k)

    # We are talking about non-positional (or KEYWORD) arguments
    print("Working on keyword arguments:")
    myfunc2(b=45, c=50, a=70)  # for keywords - use the same parameter names that are already there

# Module:

P5.py file:

from p4 import myfunc2

def mytask(n):
    print("Hello : ", n)
    if n == 0:
        return 100
    mytask(n - 1)

# 5! = 5 * 4!
def myfacto(n):
    if n == 1:
        return 1
    return n * myfacto(n - 1)


###### decorators - functions that take and wrap other functions

def outer():
    print("Line one of outer")
    def inner():
        print("Line 1 of inner")
    print("Line two of outer")
    inner()
    print("Line three of outer")


def myouter11():
    print("This is line 1 from myouter 11")

def myouter22():
    print("This is line 1 from myouter 22")

def myouter33():
    print("This is line 1 from myouter 33")

def myouter2(var1):  # a function can be passed as an argument
    print("This is line 1 from myouter 2")
    var1()
    print("This is line 2 from myouter 2")


if __name__ == '__main__':
    out = myfacto(50)
    print("Factorial of 50 is", out)
    outer()
    myouter2(myouter33)
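The heading above says "decorators", but the code stops at nested functions and passing functions around. A minimal sketch of an actual decorator built from those same two ideas (the names my_decorator and greet are illustrative):

# a decorator wraps a function with extra behaviour - a minimal sketch
def my_decorator(func):
    def wrapper():
        print("Before the call")
        func()
        print("After the call")
    return wrapper

@my_decorator  # same as: greet = my_decorator(greet)
def greet():
    print("Hello from greet")

greet()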

P6.py:

#import p5 as SuperFunctions
from p5 import mytask, myfacto, myfunc2
from MyPack1 import modul1

modul1.myfunc1()

#SuperFunctions.mytask(7)
mytask(50)
myfacto(10)

# one line for loop
print("Option 1")
for i in range(5):
    print("Hello")
print("Option 2")
for i in range(5): print("Hello")
print("Option 3")
list1 = [2, 4, 6, 8, 10]
for i in list1: print(i)
print("Option 4")
prod = 1
for i in range(1, 10): prod *= i
print(prod)
print("Option 5")
mylist1 = [i for i in range(1, 10)]
print("Mylist1 = ", mylist1)

# one line if condition
print("Condition Option 1")
num = -5
if num > 0:
    print("Positive")
else:
    print("Not positive")
output = "Positive" if num > 0 else "Not Positive"
print("Option 1 output = ", output)

print("Condition Option 1 with Loops")
### calculate the cube of values between 1 and 9
print([num**3 for num in range(1, 10)])
### calculate the cube of values between 1 and 9 if the value is odd
print([num**3 for num in range(1, 10) if num % 2 == 1])

# one line function
print("one line function Option 1")
myfun1 = lambda x, y: print("Total = ", x + y)
myfun1(10, 20)
print("one line function Option 2")
friends = ["Rohit", "Rahul", "Surya", "Kohli"]
batting = lambda team: [print("Now batting:", x) for x in team]
batting(friends)

# MAP, FILTER & REDUCE
values = [2000, 3000, 100, 200, 5000, 6000, 3000, 900, 600, 500, 230, 8000]  # renamed from `input` to avoid shadowing the built-in
# all three concepts work on a list - the input is a list and the output depends upon the task
## 1. map - apply a single logic (formula) on every value of the list
# example: convert these feet into metres: divide by 3.1 (the course's rough approximation)
some_func = lambda num: num / 3.1
out = list(map(some_func, values))
print("Output = ", out)

# 2. filter: select a subset based on a single condition
## values are kept when the condition returns True
out = list(filter(lambda x: (x // 100) >= 10, values))
print("Filtered values are: ", out)

# 3. reduce: takes the entire list and reduces it to 1 single value using the given formula
from functools import reduce
print("Sum of all the values is: ", reduce(lambda a, b: a + b, values))

PROJECT 1: Working with Dictionary

## Project 1: Create a menu option for billing
## Dictionary: {Item_code: ["Item_Description", price]}
## create bills for each individual: {item_code: [quantity, price, total_cost]}
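A minimal sketch of this project under the structures described above (the item codes, descriptions, and prices are made-up sample data):

# Project 1 sketch: menu-driven billing with dictionaries
menu = {1: ["Pen", 10.0], 2: ["Notebook", 45.0], 3: ["Eraser", 5.0]}  # sample data
bill = {}  # {item_code: [quantity, price, total_cost]}
while True:
    for code, (desc, price) in menu.items():
        print(code, desc, "Rs", price)
    code = int(input("Enter item code (0 to finish): "))
    if code == 0:
        break
    if code not in menu:
        print("Invalid code, try again")
        continue
    qty = int(input("Enter quantity: "))
    price = menu[code][1]
    bill[code] = [qty, price, qty * price]
print("Bill:", bill)
print("Grand total = Rs", sum(item[2] for item in bill.values()))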
from datetime import date, datetime, timedelta, timezone
import time

start = time.time()
for i in range(1000000):
    out = i**3 + 500 * i**2 + 9
time.sleep(1)
end = time.time()

print("Total time taken by the program to run: ", end - start)


from datetime import datetime
currenttime = datetime.now()
print("Current time: ", currenttime)
print("Date: ", currenttime.strftime("%y-%m-%d"))
print("Get: ", currenttime.year, currenttime.day, currenttime.minute)
print("out: ", currenttime.today())
print("Weekday: ", currenttime.weekday())

from datetime import timedelta, datetime
print("Yesterday: ", currenttime - timedelta(days=1))
print("Next week: ", currenttime + timedelta(days=7))
class BookMagazine:
    __publisher = "Eka Publishers"

    def __init__(self, title, page):
        print("Publisher is: ", BookMagazine.__publisher)
        self.title = title
        self.pages = page

class Books(BookMagazine):
    total_books = 0  # class level variable - all objects and the class return the same value
    # object level variables live inside object methods

    def __init__(self, title, author, page):
        BookMagazine.__init__(self, title, page)
        self.author = author
        Books.total_books += 1

    def display_book(self):
        print("Dummy Function: Book Created")
        print("Title = ", self.title)
        if self.author == "":
            print("There is no author name declared")
        else:
            print("Author = ", self.author)
        print("Pages = ", self.pages)

    @classmethod
    def display_count(cls):
        print("Total book count = ", cls.total_books)

##
b1 = Books("Python Programming", "Swapnil Saurav", 330)
print("B1 Display")
b1.display_book()

b2 = Books("Data Science", "Swapnil Saurav", 550)
print("B2 Display")
b2.display_book()

b3 = Books("Data Visualization", "Swapnil Saurav", 250)
b3.display_book()
print(b1.total_books)

print(Books.total_books)

# Today's assignment: Using class and objects, perform addition, subtraction, multiplication,
# and division. Each of these should have its own method. __init__ should take 2 input values
# from the user.
# Implement at least one of these: class variable and method, object variable and method
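One minimal sketch of this assignment (the class name Calculator and the counting class variable are choices, not requirements):

# assignment sketch: a calculator class with a class-level counter
class Calculator:
    created = 0  # class variable: counts how many calculators were made

    def __init__(self, a, b):
        self.a = a  # object variables
        self.b = b
        Calculator.created += 1

    def add(self):
        return self.a + self.b

    def subtract(self):
        return self.a - self.b

    def multiply(self):
        return self.a * self.b

    def divide(self):
        return self.a / self.b if self.b != 0 else None

    @classmethod
    def how_many(cls):  # class method
        print("Calculators created:", cls.created)

c = Calculator(int(input("First number: ")), int(input("Second number: ")))
print(c.add(), c.subtract(), c.multiply(), c.divide())
Calculator.how_many()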

class Magazines(BookMagazine):
    def __init__(self, title, pages, genre):
        BookMagazine.__init__(self, title, pages)
        self.genre = genre

class Library:
    def lib_fun1(self):
        print("Printing from Library class:")
        #print("Publishers = ", BookMagazine.__publisher)  # throws an error - private members can't be accessed
        print("Total Books = ", Books.total_books)

#print(Books.__publisher)  # throws an error - private members can't be accessed
l1 = Library()
l1.lib_fun1()

# Access Modifiers:
## private (__name): only members of the same class can access
## protected (_name, e.g. _pub): meant for the class and its derived classes only
### the concept of protected exists in Python but is not enforced by the language
## public: any code can call public members of any class
class Books:
    total_books = 0

    def __init__(self, title, author):
        self.title = title
        self.author = author
        Books.total_books += 1

    def display_data(self):
        print(f"Title = {self.title} and Author = {self.author}")

b1 = Books("Python Programming", "Saurav")
print(type(b1))
b1.display_data()
b2 = Books("Machine Learning", "Saurav")
b2.display_data()
l1 = []
print(type(l1))

##############
# Errors
# Syntax errors - when you don't follow Python rules
# e.g. print 5

# logical errors - wrong logic, e.g. writing a + b where you meant a * b

# Exception errors - runtime errors
num1 = 0
try:
    num1 = int(input("Enter a number: "))
    #10/0
except (ValueError, ZeroDivisionError):
    print("You have not entered a valid number, hence exiting...")
except Exception:
    print("Some error has occurred, please retry!")
else:
    print(num1)  # runs only when no exception was raised
finally:
    print("Error or no error, I will be called")

#10/0: ZeroDivisionError
num1 = 0
while True:
    try:
        num1 = int(input("Enter a number: "))
        break
    except (ValueError, ZeroDivisionError):
        print("You have not entered a valid number, hence retry...")
    except Exception:
        print("Some error has occurred, please retry!")
    else:
        print(num1)
    finally:
        print("Error or no error, I will be called")

# Assertion Error
def print_data(num):
    # perform this only when num > 100
    assert (num > 100), "Value entered is too small to process"
    return num ** num

try:
    out = print_data(100)
except AssertionError:
    print("Give a larger value and run again")
else:
    print("Output is ", out)

#########

### create my own exception
class TooSmallValue(Exception):
    def __init__(self, value=0, min=100):
        self.value = value
        self.min = min

# driving code
value = int(input("Enter a value > 100: "))
try:
    if value <= 100:
        raise TooSmallValue
except TooSmallValue:
    print("TooSmallValue: Give a larger value and run again")
else:
    print("Output is ", value * value)


######### WORKING WITH OPERATING SYSTEMS
import os

os_name = os.name
print(os.name)
if os_name == 'nt':
    print("You are using a Windows machine")
elif os_name == 'posix':
    print("This is a Mac or Linux or Unix machine")
else:
    print("not sure which OS you are using")

## Some OS specific commands
#os.rename("infy.py", "infy_apr.py")
#os.mkdir("TEST")
from pathlib import Path
import os

path_loc = Path("C:/Users/HP/PycharmProjects/pythonProject/")
for p in path_loc.iterdir():
    print(p, " : Is it a directory: ", p.is_dir())

# Text file processing
fileobj = open("file1.txt", "a+")  # read (r), write (w), append (a); r+ w+ a+
if fileobj.readable():
    print("read operations")
    content = fileobj.read()
    print("Entire content:")
    print(content)
    fileobj.seek(5)  # move to offset 5 (the sixth character)
    print("Next 10 characters:")
    print(fileobj.read(10))
    # readline reads at most one line at a time, starting at the current position
    fileobj.seek(0)
    line = fileobj.readline(1000)
    print("Line: \n", line)
    lines = fileobj.readlines()
    print("==========reading lines")
    print(lines)
    fileobj.seek(0)
    print("==========reading lines")
    print(lines)
else:
    print("Its not readable")

if fileobj.writable():
    lines = ['Twinkle Twinkle Little Star\n', 'How I wonder\n',
             'What you are\n', 'Up Above the World\n']
    fileobj.writelines(lines)
fileobj.close()

######### CSV Files
import csv

fileobj = open("D:/datasets/tcs_stocks.csv")  # default mode is r (read)
csv_file = csv.reader(fileobj, delimiter=",")
print(list(csv_file))
fileobj.seek(0)
for i in csv_file:
    for j in i[:2]:
        print(j, end=" ")
    print()

fileobj.close()

# create a csv file
header = ['Name', 'Team', 'Matches']
row1 = ['Sachin', 'Mumbai', 222]
row2 = ['Laxman', 'Hyderabad', 212]
row3 = ['Rahul', 'Bangalore', 333]
import csv
fileobj = open("sample1.csv", 'w', newline='')
row_writer = csv.writer(fileobj, delimiter='|')
row_writer.writerow(header)
row_writer.writerow(row1)
row_writer.writerow(row2)
row_writer.writerow(row3)
fileobj.close()

###### JSON #############
# json: load, loads, dump, dumps
import json

fileoj = open("json1.json", "r")
content = json.load(fileoj)
# print to check whether we got the content - this is not how to
# display the json content
print(type(content))
# we will use json.dumps to display it on the screen
print(json.dumps(content, indent=4, sort_keys=True))
fileoj.close()

fileoj = open("json2.json", "w")
content1 = '{"Name": "Virat", "Game": "Cricket"}'
print(type(content1))  # str
content1 = json.loads(content1)  # str -> dict
print(type(content1))
print(json.dumps(content1, indent=4, sort_keys=True))
json.dump(obj=content1, fp=fileoj, indent=4)
fileoj.close()

############ DATABASE ################

# structured v unstructured
# Name, Age, Country, Runs, Wickets

## Library Application
# Table 1: Books
## Columns: BookID (INTEGER), BookTitle (TEXT), Price (FLOAT), Copies (INTEGER)

# Table 2: Members
#Columns: MemberID (INTEGER), Name (TEXT), Email (TEXT), Phone (TEXT), Address (TEXT)

#Relationship:
# one to one:
# one to many / many to one:
# many to many:

# Table 3: BOOKSMEMBERS
# columns: TID (INTEGER), BOOKID(INTEGER) , MID(INTEGER),
# ISSUEDATE (DATE), RETURNDATE (DATE)

# OLTP - Online Transaction Processing (day-to-day operations) - frequent small reads and writes
# OLAP - Online Analytical Processing (Analytics) - mostly reading; data loaded/edited in bulk

# CRUD : Create (INSERT), Read (SELECT), Update (UPDATE), Delete (DELETE)

# Roles in DBMS:
## Admin (DBA)
## Database Design (ER diagram, create tables) – SQL
## Application Developers: SQL – CRUD
#### Working with Databases #########
'''
# Constraints: Keys (Primary Key, Foreign Key), NOT NULL, UNIQUE, CHECK, DEFAULT

Table 1: Publisher
Create Table Publisher(
    PUBID int Primary Key,
    Address varchar(100),
    Name varchar(25));

INSERT INTO Publisher Values (101, 'Hyderabad', 'Eka Publishers');
INSERT INTO Publisher Values (102, 'Mumbai', 'Best Publishers');

Table 2: Books:

Create Table Books(
    BookID int Primary Key,
    Author varchar(25) NOT NULL,
    Title varchar(25) NOT NULL,
    Price float(7,2),
    Available bool Default 1,
    PubID int,
    CHECK(Price>=0.0),
    Foreign Key (PubID) references Publisher(PubID)
);

INSERT INTO Books (BookID, Author, Title, PubID) Values (101,'Swapnil','Python Programming',101);

INSERT INTO Books (BookID, Author, Title, PubID) Values (102,'Saurav','Machine Learning',101);

Table 3: Member
Create Table Member(
    MEM_ID Int Primary key,
    Memb_date Date,
    Memb_Type varchar(1),
    Address varchar(100),
    Name varchar(30),
    Expiry_date date);

INSERT INTO MEMBER(MEM_ID, Name) Values(101,'%s')
INSERT INTO MEMBER(MEM_ID, Name) Values(102,'%s')
INSERT INTO MEMBER(MEM_ID, Name) Values(103,'%s')

Table 4: BooksMember

Create table BooksMember(
    BMID Int Primary Key,
    MEM_ID int,
    BOOKID int,
    BORROW_DATE Date,
    RETURN_DATE Date,
    Foreign Key (MEM_ID) References Member(MEM_ID),
    Foreign Key (BOOKID) References Books(BookID)
);

'''
import pymysql
db_connect = pymysql.connect(host='localhost', password='learnSQL', db='library', user='root')
cursor = db_connect.cursor()
#cursor.execute('Drop table Publisher;')
tab1 = '''
Create Table Publisher(
PUBID int Primary Key,
Address varchar(100),
Name varchar(25));
'''
#cursor.execute(tab1)
tab2 = '''
Create Table Books(
BookID int Primary Key,
Author varchar(25) NOT NULL,
Title varchar(25) NOT NULL,
Price float(7,2),
Available bool Default 1,
PubID int,
CHECK(Price>=0.0),
Foreign Key (PubID) references Publisher(PubID)
);
'''
#cursor.execute(tab2)

tab3 = '''
Create Table Member(
MEM_ID Int Primary key,
Memb_date Date,
Memb_Type varchar(1),
Address varchar(100),
Name varchar(30),
Expiry_date date);
'''
#cursor.execute(tab3)
tab4 = '''
Create table BooksMember(
BMID Int Primary Key,
BORROW_DATE Date,
RETURN_DATE Date,
MEM_ID int,
BOOKID int,
Foreign Key (MEM_ID) References MEMBER (MEM_ID),
Foreign Key (BOOKID) References BOOKS (BOOKID));
'''
#cursor.execute(tab4)

########### CRUD: Create (Insert), Read (Select), Update, Delete ###
## Performing Create - using Insert
list_insert = ["INSERT INTO Publisher Values (101, 'Hyderabad','Eka Publishers');",
               "INSERT INTO Publisher Values (102, 'Mumbai','Best Publishers');",
               "INSERT INTO Books (BookID, Author, Title, PubID) Values (101,'Swapnil','Python Programming',101);",
               "INSERT INTO Books (BookID, Author, Title, PubID) Values (102,'Saurav','Machine Learning',101);"]
list_insert = []  # emptied so re-running the script does not insert duplicate primary keys
for statement in list_insert:
    cursor.execute(statement)
db_connect.commit()  # to save the changes

# Insert by dynamic query

# Remove the multi-line comment to practice:
'''
name1 = input("Enter the Member 1 name: ")
insert1 = "INSERT INTO MEMBER(MEM_ID, Name) Values(101,'%s')" % (name1)
cursor.execute(insert1)
name2 = input("Enter the Member 2 name: ")
insert2 = "INSERT INTO MEMBER(MEM_ID, Name) Values(102,'%s')" % (name2)
cursor.execute(insert2)

name3 = input("Enter the Member 3 name: ")
insert3 = "INSERT INTO MEMBER(MEM_ID, Name) Values('%d','%s')" % (103, name3)
cursor.execute(insert3)
db_connect.commit()
'''
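A safer sketch for the same kind of insert (assuming the same open connection and MEMBER table): let the driver fill in the values instead of building the SQL string with % formatting. This avoids SQL injection and quoting mistakes.

name4 = input("Enter the Member 4 name: ")
cursor.execute("INSERT INTO MEMBER(MEM_ID, Name) VALUES (%s, %s)", (104, name4))
db_connect.commit()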
## Update existing value in Member table
update1 = "Update Member Set Name='Sachin Tendulkar' where mem_id=101"
cursor.execute(update1)
## Delete existing member from Member table
delete1 = "Delete from Member where mem_id=102"
cursor.execute(delete1)
db_connect.commit()
## Reading using Select
select1 = "Select * from Member"
cursor.execute(select1)
results = cursor.fetchall()
for r in results:
    print(r)
db_connect.close()

'''
To practice SELECT commands, please login to:
https://livesql.oracle.com/

create an account and start practicing
'''

-- Reading all the rows and columns

select * from HR.Employees;

-- All the rows but given columns only

Select Employee_ID, FIRST_NAME, EMAIL from HR.Employees;

-- Restricted columns and restricted rows

Select Employee_ID, FIRST_NAME, EMAIL from HR.Employees where Employee_ID = 120;

select first_name||' has a salary of $'|| salary "Salary Information", email from hr.employees order by email;

select first_name||' has a salary of $'|| salary "Salary Information", email, HIRE_DATE from hr.employees order by HIRE_DATE, email desc;

select first_name, last_name, email, salary from hr.employees where salary > 10000 and salary < 18000;

select first_name, last_name, email, salary from hr.employees where salary between 10000 and 18000;

select first_name, last_name, email, salary from hr.employees where salary in (17000, 11000, 13500);

select count(*) from hr.employees;

select avg(salary) from hr.employees;


select * from HR.Employees;

select * from hr.departments;

select first_name, last_name, email, to_char(hire_date, 'Month DD, YYYY'), round(months_between(sysdate, hire_date)) Tenure_in_months from hr.employees;

select * from dual;

select 3+3 from HR.Employees where rownum < 2;

select 3+3, abs(-80), to_date('May 14, 2023 08:01 P.M.', 'Month DD, YYYY HH:MI P.M.') from dual;

select Decode(3+5, 8, 'CORRECT', 'INCORRECT') from dual;

-- Aggregate functions

select JOB_ID, count(*), round(avg(salary)), round(sum(salary)) TOTAL_SALARY from hr.employees group by JOB_ID having count(*) >= 5;

 

-- JOINING TABLES

select FIRST_NAME, LAST_NAME, t1.DEPARTMENT_ID, t2.DEPARTMENT_ID, Department_name, HIRE_DATE from HR.Employees t1, hr.departments t2 where t1.department_id = t2.department_ID;

-- SUB QUERY

select * from HR.Employees where Employee_ID in (select employee_id from hr.employees);

select * from HR.Employees where Employee_ID = (select employee_id from hr.employees where rownum < 2);

-- SET OPERATIONS
-- Note: no salary can be both below 11000 and above 20000, so this INTERSECT returns zero rows.

select * from HR.Employees where salary < 11000
INTERSECT
select * from HR.Employees where salary > 20000;

import numpy as np
x = range(16)
print(type(x))
x = np.reshape(x, (8, 2))
print(type(x))
print(x)
print("Y:")
y = [[3,4,5],[2,1,2],[9,0,1],[2,2,2]]
y = np.array(y)
print(y)
print(y[2,0])
print(y[-2,-3])

print(y[1:3,1:])
z = [[3,4,1],[2,3,2],[2,7,1],[2,9,2]]
z = np.array(z)
print("=======================")
print(z)
print(y)
print(y + z)
print(np.add(y,z))

print(y - z)
print(np.subtract(y,z))

print(y * z)
print(np.multiply(y,z))

print(y / z)
print(np.divide(y,z))


y = [[3,4,5,3],[2,1,2,2],[9,0,1,1],[2,2,2,4]]  # 4 * 4
y = np.array(y)
z = [[3,4,1],[2,3,2],[2,7,1],[2,9,2]]  # 4 * 3
z = np.array(z)
print(y @ z)  # matrix multiplication: (4*4) @ (4*3) -> 4*3
print(np.matmul(y,z))

'''
x + y = 35
2x + 3y = 90
Find x and y?
'''
# coefficient matrix
coeff = np.array([[1,1],[2,3]])
# solution (constants) matrix
solution = np.array([[35],[90]])
## coeff * variable = solution
# variable = inv(coeff) * solution
det_coeff = np.linalg.det(coeff)
print("Determinant of Coefficient matrix = ", det_coeff)
if det_coeff != 0:
    variable = np.linalg.inv(coeff) @ solution
    print("Solution is: x = ", int(variable[0,0]), "and y = ", int(variable[1,0]))
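An equivalent one-step sketch: np.linalg.solve performs the inverse-and-multiply internally and raises LinAlgError if the coefficient matrix is singular.

variable = np.linalg.solve(coeff, solution)
print("Via solve: x =", int(variable[0,0]), "and y =", int(variable[1,0]))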


import pandas as pd

y = [[3,4,5,3],[2,1,2,2],[9,0,1,1],[2,2,2,4]]
y1 = np.array(y)
y2 = pd.DataFrame(y)
print(y1)
print(y2)
print("Y2: \n", y2.loc[0:2, 1:3])
y2 = pd.DataFrame(y, columns=["January","February","March","April"])
print(y2)
y2 = pd.DataFrame(y, columns=["January","February","March","April"],
                  index=["Banana","Apple","Guava","Mango"])
print(y2)
print("Y2: \n", y2.loc[["Guava","Mango"], ["January","February","March"]])
# loc (label based), iloc (integer position based)
data = [["January",1500,1900],["February",1900,1800],["March",1500,1800],["April",1000,1500],["May",2300,2500]]
import pandas as pd
data_df = pd.DataFrame(data, columns=["Month","Runs Scored","Runs Given Away"])
print(data_df)
print(data_df["Runs Scored"].mean())
print(data_df["Runs Given Away"].sum())
print(data_df[data_df['Month'] == "March"])
print(data_df[data_df['Month'].isin(["January","April","May"])])
print(data_df.iloc[0])
print(data_df.loc[[0,2,4], ["Month","Runs Given Away"]])

#pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/user_device.csv")
device_df = pd.read_csv("D:/datasets/gitdataset/user_device.csv")  # (272, 6)
print(device_df.shape)
usage_df = pd.read_csv("D:/datasets/gitdataset/user_usage.csv")  # (240, 4)
print(usage_df.shape)
new_df = pd.merge(device_df, usage_df, on="use_id")  # how="inner" is the default
print(new_df)

new_df = pd.merge(device_df, usage_df, on="use_id", how="left")
print(new_df)
new_df = pd.merge(device_df, usage_df, on="use_id", how="right")
print(new_df)
new_df = pd.merge(device_df, usage_df, on="use_id", how="outer")
print(new_df)
# outer join rows = left-only + matched + right-only: 159 + 81 + 113 = 353
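A quick sketch to verify that arithmetic: indicator=True labels every row of the outer merge as 'left_only', 'both' or 'right_only'.

check_df = pd.merge(device_df, usage_df, on="use_id", how="outer", indicator=True)
print(check_df["_merge"].value_counts())  # the three counts should sum to 353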
# Univariate: Histogram, Bar chart

# Bivariate: Scatter

import pandas as pd
data = pd.read_csv("D:\\datasets\\gitdataset\\hotel_bookings.csv")
print(data.shape)
print(data.dtypes)

import matplotlib.pyplot as plt
import seaborn as sns
data_30 = data.columns[:30]
#print(data_30)
color_list = ["#00FF00", "#FF0000"]
sns.heatmap(data[data_30].isnull(), cmap=sns.color_palette(color_list))
plt.show()
import numpy as np
'''
for i in data.columns:
    missing_cnt = np.mean(data[i].isnull())
    print(f"{i} has {missing_cnt*100}% of missing values")
'''
for i in data.columns:
    missing_cnt = np.mean(data[i].isnull())
    if missing_cnt > 0.8:
        print(f"{i} has {missing_cnt*100}% of missing values")

## Company has more than 94% missing data - so let's drop it
# axis = 0 - row & axis = 1 - column
data = data.drop(['company'], axis=1)  # column company will be dropped
print("Shape after dropping company: ", data.shape)
for i in data.columns:
    # check for missing values row-wise
    missing = data[i].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:
        data[f'{i}_ismissing'] = missing
    missing_cnt = np.mean(data[i].isnull())
    if missing_cnt > 0.8:
        print(f"{i} has {missing_cnt*100}% of missing values")  # company is not there now
print("Shape after adding _ismissing columns: ", data.shape)
# create a new column which will store the number of missing values for each row
is_missing_col = [col for col in data.columns if "ismissing" in col]
data["num_missing_cnt"] = data[is_missing_col].sum(axis=1)
print(data["num_missing_cnt"])
# selecting rows with more than 12 missing values
index_missing_col = data[data['num_missing_cnt'] > 12].index
data = data.drop(index_missing_col, axis=0)
print("Shape after removing missing rows: ", data.shape)

# fill the missing numeric values with the column median:
cols_num = data.select_dtypes(include=[np.number])
all_num_cols = cols_num.columns.values
for i in all_num_cols:  # list of columns which are numeric
    missing_cnt = np.mean(data[i].isnull())
    if missing_cnt > 0.00:
        #print(f"{i} has {missing_cnt*100}% of missing values")
        med = data[i].median()
        data[i] = data[i].fillna(med)


'''
children has 2.0498257606219004% of missing values - FLOAT
babies has 11.311318858061922% of missing values - FLOAT
agent has 13.687005763302507% of missing values - FLOAT

meal has 11.467129071170085% of missing values - CAT
country has 0.40879238707947996% of missing values - CAT
deposit_type has 8.232810615199035% of missing values - CAT
'''


# handle categorical values: convert each column to Categorical and fill with its mode
data['agent'] = pd.Categorical(data.agent)
mode = data['agent'].describe()['top']
data['agent'] = data['agent'].fillna(mode)

data['meal'] = pd.Categorical(data.meal)
mode = data['meal'].describe()['top']
data['meal'] = data['meal'].fillna(mode)

data['country'] = pd.Categorical(data.country)
mode = data['country'].describe()['top']
data['country'] = data['country'].fillna(mode)

data['deposit_type'] = pd.Categorical(data.deposit_type)
mode = data['deposit_type'].describe()['top']
data['deposit_type'] = data['deposit_type'].fillna(mode)

print("Final check for missing values:")
for i in all_num_cols:  # list of columns which are numeric
    missing_cnt = np.mean(data[i].isnull())
    if missing_cnt > 0.00:
        print(f"{i} has {missing_cnt*100}% of missing values")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

url = "https://www.hubertiming.com/results/2017GPTR10K"
from urllib.request import urlopen
html_code = urlopen(url)

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_code, "lxml")
print(soup.title)
all_a = soup.find_all('a')  # returns a list of all anchor tags
print(all_a)
for link in all_a:
    print(link.get("href"))

import re
rows = soup.find_all('tr')
list_rows = []
for row in rows:
    row_td = row.find_all('td')
    row_td = str(row_td)
    #row_td = BeautifulSoup(row_td,"lxml").get_text()
    #print(row_td)
    pattern = re.compile('<.*?>')  # strip the HTML tags
    row_td = re.sub(pattern, "", row_td)
    list_rows.append(row_td)

# Data cleaning
list_rows = list_rows[5:]  # removing rows that are not required
# 2. convert into dataframe
data_df = pd.DataFrame(list_rows)

# 3. split into different columns
data_df = data_df[0].str.split(',', expand=True)
print(data_df)

'''
<tr> - table row
<th> - header
<td> - data
'''
all_headers = []
headers = str(soup.find_all("th"))
headers = BeautifulSoup(headers, "lxml").get_text()
all_headers.append(headers)
header_df = pd.DataFrame(all_headers)
header_df = header_df[0].str.split(',', expand=True)
print(header_df)
main_df = [header_df, data_df]
main_df = pd.concat(main_df)
print("=============\n\n")
main_df = main_df.rename(columns=main_df.iloc[0])
main_df = main_df.drop(main_df.index[0])
main_df = main_df.dropna(axis=0, how="any")
# remove [ from the first column and ] from the last column
main_df.rename(columns={'[Place': 'Place'}, inplace=True)
main_df.rename(columns={' Team]': 'Team'}, inplace=True)
main_df['Place'] = main_df['Place'].str.strip('[')
main_df['Team'] = main_df['Team'].str.strip(']')
print(main_df)
print(main_df.info())
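A hedged shortcut sketch: for pages whose results already sit in an HTML table, pandas can usually parse them directly (lxml must be installed), making the manual regex cleanup above unnecessary.

tables = pd.read_html(url)  # returns a list of DataFrames, one per <table>
print(tables[0].head())     # assumption: the results table is the first one on the page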
import numpy as np
import pandas as pd
url = "D:\\datasets\\OnlineRetail\\order_reviews.csv"
reviews_df = pd.read_csv(url)
print(list(reviews_df.columns))

## 1. convert entire text to lowercase
## 2. compatibility decomposition
## 3. convert into utf8
## 4. removing accents
## 5. sentences into words
## 6. remove stop words

import unicodedata
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

## Function to perform steps 1 to 6
def basic_nlp_analysis(text):
    text = text.lower()
    # Below code will perform:
    ## 2. compatibility decomposition
    ## 3. convert into utf8
    ## 4. removing accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', errors='ignore').decode('utf-8')
    ## 5. sentences into words
    words = nltk.tokenize.word_tokenize(text)

    ## 6. remove stop words
    STOP_WORDS = set(w for w in nltk.corpus.stopwords.words('portuguese'))
    words = tuple(t for t in words if t not in STOP_WORDS and t.isalpha())

    return words

commented_reviews = reviews_df[reviews_df['review_comment_message'].notnull()].copy()
print(commented_reviews['review_comment_message'])
# apply the basic nlp operations on the column
commented_reviews['review_comment_words'] = commented_reviews['review_comment_message'].apply(basic_nlp_analysis)

print(commented_reviews['review_comment_words'])
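A small follow-up sketch: flatten the per-review word tuples and count the most frequent words across all comments.

from collections import Counter
word_counts = Counter(w for words in commented_reviews['review_comment_words'] for w in words)
print(word_counts.most_common(10))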

INFERENTIAL STATS:

Predicting for the population from the sample data.

Driven by probability = (outcomes of the event you are interested in) / (total possible outcomes),
e.g. rolling a given number on a die: 1/6; getting Head (or Tail) on a coin: 1/2.

Random variables can be Discrete or Continuous.

For success/failure experiments: p (probability of success) & q (probability of failure), with q = 1 - p. Bayes' theorem builds on such conditional probabilities.

Probability describes a one-time event; a probability distribution describes repeated events.

Toss a coin twice (one after another): Sample Space = {HH, HT, TH, TT}, so P(one head and one tail) = 2/4.

Toss 2 identical coins at the same time: the observable outcomes are {TT, HH, one of each}, since HT and TH cannot be told apart. A quick simulation follows below.
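A tiny simulation sketch of the two-coin experiment: the estimated probability of "one head and one tail" should approach 2/4 = 0.5.

import random
trials = 100_000
hits = sum(random.choice("HT") != random.choice("HT") for _ in range(trials))
print("P(one head, one tail) ~", hits / trials)  # close to 0.5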

 

Weekend DS Batch April 2023
#1. Python programming - Python interpreter (python.org)
#2. Data analytics - descriptive stats
#3. Python visualization
#4. Database & SQL (OLTP)
#5. Inferential stats
#6. Warehousing (OLAP)
#7. Machine learning (using Python)
#8. Visualization using Power BI
#9. Machine learning using R programming
#10. Intro to Tableau
# each topic will have a project
# assignments - they will not have code
# go through the job descriptions of the jobs you are interested in, put them in an Excel sheet
# and mark an X against the topics you already know

#how to build your resume/profile
###################
# 1. Software to install: Python interpreter from Python.org
# 2. IDE - Integrated Development Environment
## pycharm, vs code, jupyter, colab (online)
print("Hello Everyone",end=". "); #this is comment which will not be executed by Python.
print("wiefsisdkjnsjdosidviosdosdiv");
print("Hello","How are you?","I am fine");print("5+4=",5+4,"another equation: 6+4=",6+4)
# f-string (format string)
print(f"5+4={5+4} another equation: 10/3={10/3:.2f}",end="")
#data type-what kind of value is it? - integer(int), float, complex,string (str), boolean(bool)
print()
var1=5 #<class 'int'>
print(type(var1))
var1="HEllo"
print(type(var1)) #<class 'str'>
var1=5.0 #<class 'float'>
print(type(var1))

var1=5j #<class 'complex'>
print(type(var1))
print(var1 * var1)

#bool
var1 = True #False
print(type(var1))
var1= "true"
Learn R Programming

var1 = 5
var1 = 50
print(var1)
#[1] 50

#print(var1 + var4)   # error at this point: var4 is not defined yet

var = 55
var4 = 55

#print(var1, var4)
#Error in print.default(var1, var4) : invalid printing digits 55
# print() takes a single object; use cat() to print several values
cat(var1, var4)  #50 55

print('var1 + var4')   # quoted, so it prints the literal text
cat('var1 + var4=', var1 + var4)
# var1 + var4= 105

#class (var2 held a list at this point in the original session)
print(class(var2))  #[1] "list"

var2 <- 6
print(class(var2))   #[1] "numeric"

var2 <- 6.0
print(class(var2))  #[1] "numeric"

var2 <- 6L
print(class(var2))   #[1] "integer"

var2 <- "6L"
print(class(var2))   #[1] "character"

var2 = TRUE
print(class(var2))   ## "logical"

var1 = 10
var2 = 15
print(var1 %% var2)  # modulo - remainder

var1 = 100
print(var1 %% var2)  # modulo - remainder

var1 = 95
var2 = 15
print(var1 %% var2)  # modulo - remainder

var1 = 5
var2 = 15
print(var1 ^ var2)  # power


var1 <- 15
var2 <- 20
var3 <- 15

# Relational / comparison operators - output is logical
print(var1 > var2)   # is var1 greater than var2? - FALSE
print(var1 >= var3)
print(var1 <= var3)
print(var1 == var3)  # double = asks "is it equal?"
print(var1 != var3)

# Logical operators - input and output are both logical
# "I will do work 1 AND work 2 today"; actual - I did only work 1 => No
# "I will do work 1 OR work 2 today"; actual - I did only work 1 => Yes
print(var1 == var3 | var1 != var3)  # TRUE | FALSE -> TRUE
print(var1 == var3 & var1 != var3)  # TRUE & FALSE -> FALSE


#CONDITIONAL STATEMENTS
var1 <- 0

# is it positive or not?
if (var1 > 0) {
  print("Its positive")
}

if (var1 > 0) {
  print("Its positive")
} else {
  print("Its not positive")
}

if (var1 > 0) {
  print("Its positive")
} else if (var1 < 0) {
  print("Its negative")
} else {
  print("Its zero")
}



#Collections: Vectors, Lists, Matrices, Arrays, Factors & DataFrames

#Vectors: store multiple values of the same datatype
vec1 <- c(45,56,36)
print(vec1)

#List: multiple data types
list1 = list(45, 56, "Hello", c(2,4,6))
print(list1)

#Matrix
mat1 = matrix(c(2,2,4,4,6,6,8,8,10,10,11,11), nrow=3, ncol=4, byrow=FALSE)
print(mat1)

#Arrays - more than 2-D
arr1 = array(c(2,2,4,4,6,6,8,8,10,10,11,11), dim=c(2,4,2,2))
print(arr1)

#Factors: categorical values
gender = factor(c("M","M","M","F","F","F"))
print(class(gender))
print(nlevels(gender))

#DataFrame
players_stats <- data.frame(
  ID = c(10,20,30),
  Name = c("Sachin","Virat","Dhoni")
)
print(players_stats)



#membership: %in% : check if the left side value is in the right side collection
cities <- c("Delhii","New York","London")
print("Delhi" %in% cities)  # FALSE - note the misspelling "Delhii" in the vector


avg <- 98
## avg >= 80: Grade A, 70-80: B, 60-70: C, 50-60: D, 40-50: E, <40: Failed
if (avg >= 80) {
  print("Grade: A")
  if (avg >= 90) {
    print("You win special certificate!")
    if (avg >= 95) {
      print("You win medal")
    }
  }
} else if (avg >= 70) {
  print("Grade: B")
} else if (avg >= 60) {
  print("Grade: C")
} else if (avg >= 50) {
  print("Grade: D")
} else if (avg >= 40) {
  print("Grade: E")
} else {
  print("Failed")
}

result = 3
val1 <- switch(result,
               "Grade A","Grade B","Grade C","Grade D","Grade E","Grade F")
cat("Result - ", val1)

 

 

#Loops - to repeat:
#repeat: keeps repeating - breaks when a condition is met - EXIT controlled
#while: checks the condition first and then repeats - ENTRY controlled
#for: when you know exactly how many times to run

start = 1
repeat{
  print(start)
  if (start == 10) {
    break
  }
  start = start + 1
}

start = 11
while (start <= 20) {
  print(start)
  start = start + 1
}

#For loop
words <- LETTERS[1:5]
for (i in words) {
  print(i)
}

numbers <- seq(1, 10, by=3)
for (i in numbers) {
  print(i)
}

 

num = 30
start = 2
isPrime = TRUE
repeat{
  if (num %% start == 0) {
    isPrime = FALSE
    break
  }
  if (start == num - 1) {
    break
  }
  start = start + 1
}

if (isPrime) {
  print("Number is Prime")
} else {
  print("Number is not Prime")
}

## Assignment 1: Do the above with WHILE and FOR
## Assignment 2: Extend the same logic (one of the 3) to generate prime numbers
## between 1000 and 1500



for (num in 10:20) {
  num1 = num  # test the current value of the loop
  Isprime = TRUE
  for (a in 2:(num1-1)) {
    # cat("testing value a", a)
    if (num1 %% a == 0) {
      Isprime = FALSE
      break
    }
  }
  if (Isprime == TRUE) {
    print(num)
  }
}

########################

#Built-in function
#print()  # takes a parameter

myfunc.generatePrime <- function(num) {  # checks whether num is prime
  isPrime = TRUE
  for (i in 2:(num-1)) {
    if (num %% i == 0) {
      isPrime = FALSE
    }
  }
  if (isPrime) {
    print('num is prime')
  } else {
    print('num is not Prime')
  }
}

val <- mean(1:100)
print(val)

myfunc.generatePrime(30)


myfunc.checkPrime2 <- function(num) {
  isPrime = TRUE
  for (i in 2:(num-1)) {
    if (num %% i == 0) {
      isPrime = FALSE
    }
  }
  return(isPrime)
}

output <- myfunc.checkPrime2(53)
if (output) {
  print('num is prime')
} else {
  print('num is not Prime')
}

for (num in 1000:1300) {
  output <- myfunc.checkPrime2(num)
  if (output) {
    print(num)
  }
}

######   #####################  ################
#built-in functions
print(seq(10,90))
print(max(10:90))
print(mean(10:90))

#user defined functions
sum.func <- function(num1=1, num2=2, num3=4, num4=6) {
  cat("Number 1 = ", num1)
  cat("\n Number 2 = ", num2, "\n")
  cat("Number 3 = ", num3)
  cat("\n Number 4 = ", num4, "\n")
  result = num1 * num2
  print(result)
}

#calling the function positionally
sum.func(40,30)
#call by name
sum.func(num2=40, num4=30)

## Assignments: take the logic built using loops and convert it into functions

 

# #####################
a <- "Whats your name"
b <- 'What\'s your name?'

print(paste(a, b, sep=":"))
print(substring(a, 2, 6))
print(tolower(a))
print(toupper(a))

vector1 = c("Monday", TRUE, 5, "Thursday")  # mixed types are coerced to character
print(vector1)
print(vector1[2])
print(vector1[-2])   # everything except the 2nd element
print(vector1[c(2,4)])

list1 = list("Monday", TRUE, 5, "Thursday")
print(list1)

 

library(ggplot2)
dataset2 <- data.frame(city=c("City A","City B","City C"),
                       revenue=c(200,220,190))

ggplot(dataset2, aes(x=city, y=revenue)) +
  geom_bar(stat="identity")

 

##############################
# VECTORS
vec1 <- c(2,4,"HELLO",5,6)  # mixed types are coerced to character
print(vec1)

#built-in sequences
vec2 <- 5:50
print(vec2)

vec2 <- 5.4:30.8
print(vec2)

#start, end and increment by
vec3 <- seq(5, 30.2, by=0.9)
print(vec3)

vec1 <- c(2,4,"HELLO",5,6,9,11)
print(vec1[c(2,3,6)])

vec1 <- c(2,4,6,8,10)
vec2 <- c(1,2,1,2,0)
print(vec1 + vec2)

vec1 <- c(2,4,6,8,10,12)
vec2 <- c(1,2)  # the shorter vector is recycled
print(vec1 + vec2)

vec1 <- c(2,4,16,18,10,12)
vec3 <- sort(vec1)
print(vec3)
vec3 <- sort(vec1, decreasing=TRUE)
print(vec3)

 

## LIST
list1 <- list(55, "Hello", c(2,4,6), 5.4)
print(list1)
print(list1[c(1,3)])
list2 <- list(33,99)

mergedlist <- c(list1, list2)
print(mergedlist)

 

 

### MATRICES
mat1 <- matrix(c(2,4,6,8,10,12), nrow=3, byrow=FALSE)
print(mat1)
mat2 <- matrix(c(2,4,6,8,10,12), nrow=3, byrow=TRUE)
print(mat2)

print(mat1 + mat2)
print(mat1 - mat2)
print(mat1 * mat2)  # element-wise
print(mat1 / mat2)

 

## ARRAY
arr1 <- array(c(2:20), dim=c(2,2,2))
print(arr1)
print(arr1[1,2,1])
print(arr1[,2,1])
# c(1,2,1)

## Factors
regions <- factor(c("N","S","S","W","N","E","E","E"))
print(is.factor(regions))

 

 

dataset1 <- data.frame(
  quarter = c("Q1","Q2","Q3","Q4"),
  revenue = c(100,150,200,170),
  fruits = c("Apple","Banana","Mango","Oranges")
)
print(dataset1)
shorterrow <- dataset1[2:3,]
print(shorterrow)
print(dataset1[,c(2,3)])

 

setwd("D:\\dataset")
dataset <- read.csv("1_Data_PreProcessing.csv")
print(dataset)

# replace NA values with the column mean
dataset$Salesperson = ifelse(is.na(dataset$Salesperson),
                             ave(dataset$Salesperson, FUN=function(x) mean(x, na.rm=TRUE)),
                             dataset$Salesperson)
dataset$Quotation = ifelse(is.na(dataset$Quotation),
                           ave(dataset$Quotation, FUN=function(x) mean(x, na.rm=TRUE)),
                           dataset$Quotation)

#connecting to SQL Server
#needs: ip address, username, password, dbname

#install and load the library - RODBC
#sql_connection = odbcConnect("SQLSERVERODBC")
#sqlQuery(sql_connection, "Select * from table1")

#handling the categorical values
dataset$Region = factor(dataset$Region)

 

#step 3: breaking into training and test set
library(caTools)
split = sample.split(dataset$Win, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

#Step 4: Feature Scaling
# to bring the dataset columns into a similar range
### 1. divide the column with higher values (in this case Quotation) by 1000
### 2. Min-Max scaling - values range between 0 and 1
### 3. Z-score normalization - preferred
training_set[,2:3] = scale(training_set[,2:3])
test_set[,2:3] = scale(test_set[,2:3])
test_set

 

setwd('D:\\dataset')
dataset = read.csv("2_Marks_Data.csv")
scatter.smooth(x=dataset$Hours, y=dataset$Marks, main="Hours Studied v Marks Obtained")
#split the dataset into training set and test set
library(caTools)
split = sample.split(dataset$Marks, SplitRatio=0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

#create the regression object
regressor = lm(formula = Marks ~ Hours, data = training_set)
summary(regressor)
# y = 20.76 + 7.57x

 

# While solving a machine learning problem -
## 1. Is my data in a ready state to run the algorithm?
## 2. Run the algorithm and check the values
####  2.1. Is this the best performance of this model (can I improve this model)?
####  2.2. Is this the best model?
## 3. Evaluate the performance of the algorithm
## RMSE and R-squared (0 to 1) - closer to 1 means better performance

## training performance v test performance - over fitting and under fitting

setwd('D:\\dataset')
dataset = read.csv("2_Marks_Data.csv")
print(dataset)
scatter.smooth(x=dataset$Hours, y=dataset$Marks, main="Hours Studied v Marks Obtained")
#split the dataset into training set and test set
library(caTools)
split = sample.split(dataset$Marks, SplitRatio=0.75)
#training_set = subset(dataset, split == TRUE)
training_set = dataset[split,]
print(training_set)
test_set = dataset[!split,]
print(test_set)
#create the regression object
regressor = lm(formula = Marks ~ Hours, data = training_set)
summary(regressor)
# y = 20.76 + 7.57x

 


 

y_predict = predict(regressor, newdata = test_set)
comparison = cbind(test_set, y_predict)
print(comparison)

mse = mean((comparison$Marks - comparison$y_predict)^2)
print(mse)
library(MLmetrics)
mape.value = MAPE(comparison$y_predict, comparison$Marks)
print(mape.value)

 

 

# same evaluation on the training set (bind with training_set, not test_set)
y_predict = predict(regressor, newdata = training_set)
comparison = cbind(training_set, y_predict)
print(comparison)

mse = mean((comparison$Marks - comparison$y_predict)^2)
print(mse)
library(MLmetrics)
mape.value = MAPE(comparison$y_predict, comparison$Marks)
print(mape.value)

 

Machine Learning with Python – 01M0
print('5+3=',5+3)  #comment 1
#practice day 1
var1 = 45
print(type(var1)) #<class 'int'> integer
# we have 5 basic types of data (datatypes)

var1=55.0
print(type(var1)) #<class 'float'>

var1 = 3j
print(type(var1)) #<class 'complex'>
print(3j * 3j) #(-9+0j)

var1="Hello"
print(type(var1)) #<class 'str'> string

var1=True #False
print(type(var1))

quantity = 53
price = 119.77
total_cost = quantity * price
#The total cost of 53 pens costing 119.77 is total_cost
print("The total cost of",quantity,"pens costing",price,"is",total_cost)
#format string - f string
print(f"The total cost of {quantity} pens costing {price} is {total_cost:.2f}")

name1 ="Rohit";country="India"
position="Captain"
print(f"Player named {name1:.<12} plays for {country:X^15} and he is {position:>15} of the team")

name1 ="Mangwabate"
country="Zimbabwe"
position="Wicket-keeper"
print(f"Player named {name1:<12} plays for {country:^15} and he is {position:>15} of the team")

var1,var2 = 50, 60
print("I am here");print("Hello \\\n Ok \tfine",end=". ")
print("How are you?" );

#str, float, int, bool, complex
# \ is called escape character
num1 = 55
name1 = "Sachin"
#whenever you read as input, the data type will always be string
name2 = input("Enter your name: ")
print(name2)
num2 = input("Enter radius: ")
num2 = int(num2)
print(num2)
area = 3.14*num2*num2
print("Area = ",area)

# + / - *
# // (integer division - this will give you only integer part of the division)
# ** power
# %(modulo -remainder)
num1 = 13
num2 = 5
#Arithematic/math operations
print(num1 + num2)
print(num1 - num2)
print(num1 * num2)
print(num1 / num2) #output wll always be float
print(num1 // num2) #output will always be int
print(num1 ** num2) # 13 to the power 5
print(num1 % num2) #modulo -
#logical operators: Input is bool values and output is also bool: and or not
#and - even if one value is false it will give you false
#or - even if one value is True it will give you true
#not - opposite: not True = False
print(True and False)
print(False or False)
print(not False)
num1 = 9
num2 = 13
print("Logical operator")
print(num1 > num2 or num1 < num1 and num2 == num1 or num1 != num2 or num1!=num2)
#(T)


#comaprison operators: < > <= >= == != : True/False
num1 = 9
num2 = 13
print("is num1 equal to num2? ",num1==num2)
print("is num1 not equal to num2? ",num1!=num2)
print("is num1 greater than num2? ",num1>num2)
print("is num1 greater than or equal to num2? ",num1>=num2)
print("is num1 less than num2? ",num1<num2)
print("is num1 less than or equal to num2? ",num1<=num2)

r=5
pi=3.14
print("Radius = ",pi*r**2)
# is 71 divisible by 9 ?
print(71%9)

####################
#is x divisible by y or not?
#conditions - they are used to check if its condition 1 or 2
# if to check conditions followed by Conditional or logical- they only
# give you True or False as output
x = 72
y = 9
if x%y==0:
    print(f"{x} is perfectly divisible by {y}")
    print("PERFECT DIVISIBLE")
else:
    print(f"{x} is NOT divisible by {y}")
num1 = 0
if num1 >0:
    print("Its positive")
else:
    print("Its not positive")

num1 = 0
if num1 >0:
    print("Its positive")
elif num1<0:
    print("Its negative")
else:
    print("Its not positive")

sum=448
avg=sum/5
print("AVG = ",sum/5)
#if avg>90 - grade A
# 80-90 - grade B # 70-80 - grade C # 60-70 - grade D
# 50-60 - grade E # 40-50 - grade F # <40 - Fail
if avg>=90:
    print("PASS")
    print("Grade = A")
elif avg>=80:
    print("PASS")
    print("Grade = B")
elif avg >=70:
    print("PASS")
    print("Grade = C")
elif avg >= 60:
    print("PASS")
    print("Grade = D")
elif avg >=50:
    print("PASS")
    print("Grade = E")
elif avg>=40:
    print("PASS")
    print("Grade = F")
else:
    print("Grade = Failed")

#Nested conditions
if avg>=40:
    print("PASS")
    if avg >= 90:
        print("Grade = A")
        if avg>=95:
            print("You win President's medal!")
    elif avg >= 80:
        print("Grade = B")
    elif avg >= 70:
        print("Grade = C")
    elif avg >= 60:
        print("Grade = D")
    elif avg >= 50:
        print("Grade = E")
    else:
        print("Grade = F")
else:
    print("Grade = Failed")

num1,num2,num3=10,15,20
if num1>=num2:
    print(f"{num1} >= {num2}")
else:
    print(f"{num2} >= {num1}")
#Repeating - loops in programming language
# range(a,b,c): a-start number (including), b-end number (excluding), c-increment
range(3,15,4) # 3, 7, 11,
range(3,7) # c is default 1: 3,4,5,6
range(4) #a is default 0, c is default 1: 0,1,2,3

# there are 2 ways loops are implemented in Python: FOR / WHILE
#for - when you know exactly how many times to run
for counter in range(1,11):
    print(counter)
#odd numbers between 1 and 10
for i in range(1,11,2):
    print(i)
# even numbers between 1 and 10
for i in range(2, 11, 2):
    print(i)

for i in range(5):
    print("*")

for i in range(5):
    print("*",end=' ')
print("\n\n")
print("\n\n")
'''
* * * * *
* * * * *
* * * * *
* * * * *
* * * * *
'''
for j in range(5):
    for i in range(5):
        print("*",end=' ')
    print()
'''
*
* *
* * *
* * * *
* * * * *
'''
for j in range(5):
    for i in range(j+1):
        print("*",end=' ')
    print()

'''
* * * * *
* * * *
* * *
* *
*
'''
for j in range(5):
    for i in range(5-j):
        print("*",end=' ')
    print()

'''
    *
   * *
  * * *
 * * * *
* * * * *
'''
for j in range(5):
    for i in range(5-j-1):
        print(" ",end='')
    for k in range(j + 1):
        print("*", end=' ')
    print()

'''
* * * * *
 * * * *
  * * *
   * *
    *
'''
'''
1* 1=1   2* 1=2   ... 10* 1=10
...
1*10=10  2*10=20  ... 10*10=100
'''
for n in range(1,11):
    for m in range(1,11):
        print(f"{m:>2} * {n:>2} = {n*m:>3}",end=" ")
    print()

#while - when you don't know how many times to run, only the stopping condition
num = 51
isPrime = True
for i in range(2, num):
    if num % i == 0:
        isPrime = False
        break
if isPrime:
    print("Its a prime number ")
else:
    print("Its not a prime number ")

# print prime numbers between 1000 and 5000
start, end = 1000, 5000
for k in range(start, end + 1):
    isPrime = True
    for i in range(2, k):
        if k % i == 0:
            isPrime = False
            break
    if isPrime:
        print(k, end=" , ")

### WHILE LOOP
#print numbers from 1 to 10
i = 1
while i <=10:
    print(i)
    i+=1

choice = True
while choice:
    print("Hello")
    ch = input("Press n to stop")
    if ch=='n':
        choice = False


while True:
    print("Hello")
    ch = input("Press n to stop")
    if ch == 'n':
        break
#Strings
str1 = 'Hello'
str2 = "Hi there"
str3 = '''Hi there
how are you
Where are you?'''
str4 = """I am fine
I am here
How are you"""
print(str4)
print(str1 + str2)
print(str1 *3)

for i in str1:
    print(i)

print(str1[2])
#str1[1] = "B"  # strings are immutable, so this would raise an error
print(str1[4], str1[-1])
print(str1[0],str1[-5])
print(str1[:3])
print(str1[-3:])

str1= "Hello How ARE YOU"
print(str1.isalnum())
num1 = input("Enter a number: ")
if num1.isdigit():
    num1 = int(num1)
    print(num1)
else:
    print("Invalid number")

str2= ' '
print(str2.isspace())
print(str2.islower())
print(str2.isupper())

str3 = "Hello HI There"
print(str3.lower())
print(str3.upper())
print(str3.title())
str4 = str3.replace("h","ABC",1)
print(str4)
str4 = str3.split('e')
print(str4)
str5 = "e".join(str4)
print(str5)
print("HI" in str3)
print(str3.find("HI"))
print(str3.count('e'))
#Sets
#data structures - collections: String, List, Tuple, Dictionary
#SETS - A B M O C - there is no order
# doesnt allow duplicate

set1 = {2,4,6,8}
print(set1)
#union intersection minus
set2 = {6,8,10,12}
#union
print(set1.union(set2))
print(set1 | set2)
#intersection
print(set1.intersection(set2))
print(set1 & set2)

#difference
print(set1.difference(set2))
print(set1 - set2)
print(set2.difference(set1))
print(set2 - set1)

#symmetric difference
#union of 2 differences
print(set1.symmetric_difference(set2))
print(set1 ^ set2)
print(set1, set2)

#update doesnt give new set, it changes the main set
set1.update(set2)

#union -> update
# {intersection_update, difference_update, symm_diff_update}
print(set1, set2)

set3 = {2,4,10,12}

# sets, lists and tuples are inter-convertible
set1 = tuple(set1)   # set -> tuple
set1 = list(set1)    # tuple -> list
set1 = set(set1)     # list -> back to set
#List - linear ordered mutable
list1 = []
print(list1)
print(type(list1))
list1 = [2,4,6.5,"Hello",True,[2,8,12]]
print("Number of elements = ",len(list1))
print(list1[1])
print(type(list1[-2]))
print(type(list1[-1]))

#sum and avg of 5 marks
list_of_marks=[]
sum = 0
for i in range(5):  # collect 5 marks (set the range to 0 to skip the prompts)
    m=int(input("Enter marks: "))
    list_of_marks.append(m)
    sum+=m
print("Sum = ",sum)
print("List of marks = ",list_of_marks)

## 2 ways to add values to an existing list: append, insert
list_of_marks.insert(1,100)#index, value
print("List of marks = ",list_of_marks)
list_of_marks.insert(1,80)
print("List of marks = ",list_of_marks)
list_of_marks.insert(1,90)
print("List of marks = ",list_of_marks)
list_of_marks.insert(1,30)
print("List of marks = ",list_of_marks)
list_of_marks.insert(1,40)
print("List of marks = ",list_of_marks)

list_of_marks.sort(reverse=True)
print("(Sort)List of marks = ",list_of_marks)
list_of_marks.reverse()
print("(Reverse)List of marks = ",list_of_marks)

num_to_delete = 80
if num_to_delete in list_of_marks:
    list_of_marks.remove(num_to_delete)
    print("(remove)List of marks = ",list_of_marks)
list_of_marks.pop(3)
print("(pop)List of marks = ",list_of_marks)
num_to_delete = 80
if list_of_marks.count(num_to_delete) >0:
    list_of_marks.remove(num_to_delete)
    print("(remove)List of marks = ", list_of_marks)
list1 = [10,3,4,5,3,4,6,3,7,8,3,6]
print(list1.count(3))
print(list1.index(3)) #index(element,start,end)

#index of all the values in the list:
element_search = 3
inx_found=0
for i in range(list1.count(element_search)):
    print(list1.index(element_search, inx_found), end=" ,")
    inx_found = list1.index(element_search, inx_found) + 1
print()
list1 = [1,3,5,7]
list2 = list1 #they are same , just have 2 names
list3 = list1.copy() #copy - creates a different copy
print("1. List 1 = ",list1)
print("1. List 2 = ",list2)
print("1. List 3 = ",list3)
list1.append(15)
list2.remove(5)
list3.append(19)

print("2. List 1 = ",list1)
print("2. List 2 = ",list2)
print("2. List 3 = ",list3)

# TUPLE - immutable form of List
t1 = ()
print(type(t1))

t1 = (1,)
print(type(t1))

t1=list(t1)
t1.append(40)
t1 = tuple(t1)
#list, tuple and sets are all 3 inter convertible
#advantage of tuple -its fast to access
t2 = (1,2,3) #packing
a,b,c = t2 #unpacking
print(a,b,c)

#Dictionary - uses its own key to track values
#list, tuple, sets - linear

list1 = [2,3,4,5,2,3,4,2,3,2]
list1=list(set(list1))
print(list1)

#Dictionary: key:value pairs
d1 = {}
print(type(d1))
d1 = {4,5}
print(type(d1))

d1 = {4:"Sachin","Matches":5}
print(type(d1))
print(d1)
print("Keys: ",d1.keys())
print("Values: ",d1.values())
print("(Key, Value): ",d1.items())

print(d1['Matches'])
d2={'City':'Mumbai'}
d1.update(d2)
print(d1)

d2={'City':'Hyderabad'}
d1.update(d2)
print(d1)

# reference v copy
d3 = d1         # not a copy - just another name (reference) for d1
d4 = d1.copy()  # shallow copy - a separate dictionary
print("D1 = ",d1)
print("D3 = ",d3)
print("D4 = ",d4)
d1.update({'Sports':'Cricket'})
print("D1 = ",d1)
print("D3 = ",d3)
print("D4 = ",d4)
#remove a member: pop(), popitem()
d1.pop(4) #pop takes key as input
print("D1 after pop: ", d1)
d1.popitem() #last added item is removed - remember, last added is not the same as last updated
print("D1 after popitem: ", d1)

print(" iterating through keys: ")
for i in d1.keys():
print(i)
print(" iterating through values: ")
for i in d1.values():
print(i)

print(" iterating through items")
for i in d1.items():
print(i)
for i,j in d1.items():
print(f"Keys = {i} and value = {j}")

women = {101:"Renuka", 103:"Smriti",105:"Harmanpreet",107:"Deepti"}
men = {102:"Sachin",104:"Virat",106:"Rohit"}
all ={211:'Steve',222:'Warner'}
all.update(women)
all.update(men)
print(all)
for key,val in all.items():
if key in women.keys():
print(f"{val} plays in women's team")
elif key in men.keys():
print(f"{val} plays in men's team")
else:
print(f"{val} neither part of mens or womens team")
# SET - sets - linear unordered mutable collection - doesnt allow duplicate
set1 = {'Apple','Grapes','Banana','Orange'}
print(type(set1))
set1.add('Cherry')
set2 = {"Pineapple","Mango","Apple","Orange"}
# two ways to remove
set1.remove("Banana")
set1.discard("Apple")
#set1.remove("Rose") - if value isnt there throws error
set1.discard("Rose") #doesnt throw error
print("1. Set1: ",set1)
set1.pop()
set1.update(set2) #union
print("2. Set1: ",set1)
set1.clear()
print("3. Set1: ",set1)
### SET FUNCTIONS ####
set1 = {'Apple','Grapes','Banana','Orange'}
set2 = {"Pineapple","Mango","Apple","Orange"}
#UNION
print("UNION")
print(set1 | set2)
print(set1.union(set2))
print("INTERSECTION")
print(set1 & set2)
print(set1.intersection(set2))
print("DIFFERENCE")
print(set1 - set2)
print(set1.difference(set2))
print(set2 - set1)
print(set2.difference(set1))

print("SYMMETRIC DIFFERENCE")
print(set1 ^ set2)
print(set2 ^ set1)
print(set1.symmetric_difference(set2))
#update() will update the values of main set
# set1.union(set2) - this gives a new set as output
# set1.update(set2) - set1 is updated with the values
# union - update()
set1.update(set2)
print(set1)
# intersection: intersection_update()
set1.intersection_update(set2)
print(set1)
# difference_update()
set1.difference_update(set2)
print(set1)
#symmetric_difference_update()
set1.symmetric_difference_update(set2)
print(set1)

# set, list, tuple => they are inter-convertible
list1 = [3,6,9,12,3,6,9,3,6,3]
list1 = list(set(list1))
print(list1)
set1 = {'Apple','Grapes','Banana','Orange'}
set1 = list(set1)
set1.index("Grapes")
set1 = set(set1)
set1 = tuple(set1)
set1 = set(set1)
print(set1.issubset(set2))

#
list1 = [3,6,9,12,3,6,9,3,6,3]
list2 = [3,6,9,12,15]
#do all the elements of list2 appear in list1?
t_list1 =set(list1)
if set(list1).issuperset(set(list2)):
    print("yes, list2 values exist in list1")
else:
    print("No, list2 has additional elements")
 
# Basic data types (stores only 1 value) - int, float, str,bool and complex
# Collections (stores multiple values - 1D) - list, tuple,dictionary, set
# functions - own functions - user defined functions
# print(), input(), type(), int(), str(), len() : in-built functions (developers of python have already written for us)
# we will learn to write our own functions

#first part of writting function is to Define the meaning- function
def mytext(val1, val2,val3): #required positional arguments
    print("Hello How are you today?")
    print("Where are you going?")
    print("I am fine.",val1)

def mytext2(val1=0, val2=0,val3=9): #default arguments
    print("Hello How are you today?")
    print("Where are you going?")
    print("I am fine.",val1)
    print("Values are: ",val1,val2,val3)
#demo keyword (non-positional) arguments
def mytext3(val1, val2,val3):
    print("Hello How are you today?")
    print("Where are you going?")
    print("I am fine.",val1)
    print("Values are: ",val1,val2,val3)


#default argument (non-required) & keyword argument (non-positional)
mytext(5,10,0)
print("Done with one time calling now calling second time")
mytext2(20,4,10)
mytext2(20,4)
mytext2(10,5,1)
mytext3(val3=10,val1=9,val2=8)
mytext3(100, val3=9,val2=8)
#print()
# numpy, pandas (Multi-D)

def isPositive(val1):
    #result = "Positive" # "+ve" / 1
    if val1 >0:
        return 1
    else:
        return 0 #print("Its not Positive")

res = isPositive(100)
if res==1:
    print("Its positive, now lets go ahead building our logic")
else:
    print("STOP! STOP! STOP!")

isPositive(-100)
isPositive(90)
def myfun1(val1):  #Required positional argument
    print(val1)

myfun1(10)

def myfun2(val1,val2,val3): #Required positional arguments
    print(val1, val2,val3)

myfun2(10,30,20)

# Default arguments
def myfun3(val1,val2=100,val3="New York"):
    print(val1, val2,val3)
    return val1+val2

myfun3(10,30)

## keyword (non-positional) arguments
result = myfun3(val2=99,val3=77,val1=44)
print("Result = ",result)

## single function to compute the perimeter of a triangle, square, pentagon, hexagon
def calcPerimeter(s1=0,s2=0,s3=0,s4=0,s5=0,s6=0):
    if s1==0:
        return "You have not provided any value!"
    elif s2==0:
        return "Perimeter of a line is the same value which is "+str(s1)
    elif s3==0:
        print("We cant have a closed shape with only 2 sides!")
    elif s4==0:
        print("Its a Triangle! Perimeter is",s1+s2+s3)
    elif s5==0:
        if s1==s2 and s2==s3 and s3==s4:
            print("its a square with perimeter",s1*4)
        elif s1==s2 and s4==s3:
            print("Its a rectangle with Perimeter",2*(s1+s3))
        else:
            print("Its an irregular 4 sided shape with perimeter",s1+s2+s3+s4)
    elif s6==0:
        print("Its a pentagon with perimeter",s1+s2+s3+s4+s5)
    else:
        print("Its a hexagon with perimeter",s1+s2+s3+s4+s5+s6)

result = calcPerimeter()
print(result)
result = calcPerimeter(5)
print(result)
calcPerimeter(6,8)
calcPerimeter(8,7,5,4,3,3)

def checkNum(val1):
    if val1 <0:
        return -1 #for negative
    elif val1==0:
        return 0 #zero value
    else:
        return 1 #positive

res = checkNum(100)
if res==-1:
    print("Negative")
elif res==0:
    print("Zero")
else:
    print("Positive")

res = checkNum(-100)
if res==-1:
    print("Negative")
elif res==0:
    print("Zero")
else:
    print("Positive")
list1 = []

class Book:

    num_books = 0  # class level variable

    # object level method
    def say_hi(self,n):
        self.name = n  # self. indicates name is specific to the object
        print("Hi...")
        Book.num_books += 1

    # class level variable
    # class level method
    # object level variable

b1 = Book()
b2 = Book()
b1.say_hi("Python")
b2.say_hi("Django")
print(b1.num_books)
print(b2.num_books)
print(Book.num_books)
print("b1.name = ", b1.name)
print("b2.name = ", b2.name)
#Book.name
class Library:
    def __init__(self, library):
        self.library = library
    def _method2(self):  # declared as protected
        return "I am in Library!!!"
    def __method3(self):  # private declaration - cant be accessed outside of this class
        return "I am in Library by method 3!!!"
    def get_libname(self):
        return self.library

class Books(Library):  # Books is a derived class of Library class (base class)
    title_count = 0
    # __init__ - this is automatically called when the object is created
    def __init__(self, title, author, libname="XYZ Library"):
        Library.__init__(self, libname)
        self.title = title
        self.author = author
        Books.title_count += 1

    # get author - object level
    def get_author(self):
        return self.author
    def get_title(self):
        return self.title

    # count of the books
    @classmethod
    def get_bookcount(cls):
        return cls.title_count

class Sample:
    def sample_method(self):
        Library._method2(self)  # protected will not show up but is still callable
        # the protected concept exists but is not strictly enforced

    def sample_method3(self):
        Library.__method3(self)  # private members are not accessible

b1 = Books("Python Programming", "Swapnil")
b2 = Books("Data Science Programming", "Snehil", "PQR Library")
b3 = Books("Blockchain", "Ojass")
print("Number of books in the library = ", b1.get_bookcount())
print("Number of books in the library = ", b3.get_bookcount())
print("Title of the book = ", b1.get_title())
print("Title of the book = ", b2.get_title())
l1 = Library("ABC Local Library")
print("Library name = ", l1.get_libname())
print("B1 Library is: ", b1.get_libname())
print("LIBNAME = ", b1.get_libname())
print("Method2 by Books: ", b1._method2())
s1 = Sample()
print("Call by Sample: ", s1.sample_method())
#print("Call by Sample: ", s1.sample_method3())  # throws error
# as private members cant be accessed

#print("Method2 by Books: ", b1.__method3())  # throws error
# as private members cant be accessed

MASTERLIST = [{"ItemCode": 101, "Item": "Shirt", "Price": 28.2},
              {"ItemCode": 102, "Item": "Bag", "Price": 18.2},
              {"ItemCode": 103, "Item": "Book1", "Price": 38.2},
              {"ItemCode": 104, "Item": "Watch", "Price": 58.2},
              {"ItemCode": 105, "Item": "Shoes", "Price": 128.2},
              {"ItemCode": 106, "Item": "Laptop", "Price": 1028.2}]

class ShoppingCart:
    def __init__(self):
        self.myshoppingcart = []

    # add product to the cart
    def add_prod(self):
        each_item = {}
        item_name = input("Enter the product Name: ")
        not_in_list = True
        for items in MASTERLIST:
            if item_name == items["Item"]:
                not_in_list = False
        if not_in_list:
            print("Sorry, That Item is Out of Stock!")
        else:
            item_size = input("Enter the product Size: ")
            item_color = input("Enter the product Color: ")
            item_quantity = int(input("Enter the product Quantity: "))
            each_item = {"Item": item_name, "Size": item_size, "Color": item_color, "Quantity": item_quantity}
            self.myshoppingcart.append(each_item)
            #print(self.myshoppingcart)

    def display_prod(self):
        print("Item       Size       Color      Quantity")
        for i in self.myshoppingcart:
            for k, j in i.items():
                print(f"{j:<10}", end=" ")
            print()

    def remove_prod(self):
        item_name = input("Enter the product name to remove: ")
        not_in_list = True
        for items in self.myshoppingcart:
            if item_name == items["Item"]:
                self.myshoppingcart.remove(items)
                not_in_list = False

        if not_in_list:
            print("Sorry, That Item is not in your shopping cart!")
        else:
            print("Item is now removed from your shopping cart!")

    def generate_receipt(self):
        print("Item       Size       Color      Quantity   Price")
        print("=======================================================")
        item_cost = 0
        price = 0
        grand_total = 0
        for i in self.myshoppingcart:
            for k, j in i.items():
                for master_list in MASTERLIST:
                    if j == master_list["Item"]:
                        price = master_list["Price"]

                print(f"{j:<10}", end=" ")
                if k == "Quantity":
                    item_cost = j * price
                    grand_total += item_cost
                    print(f"{round(item_cost):<10}", end=" ")
            print()
        print("-------------------------------------------------------")
        print(" TOTAL: $" + str(round(grand_total)))
        print("=======================================================")

if __name__ == "__main__":
    sc1 = ShoppingCart()
    sc2 = ShoppingCart()
    while True:
        print("1. View my cart \n2. Add to my cart\n3. Remove from my cart\n4. Generate My Receipt\n5. Exit")
        ch = input("Enter your option: ")
        if ch == "1":
            if len(sc1.myshoppingcart) == 0:
                print("Your shopping cart is empty!")
            else:
                sc1.display_prod()
        elif ch == "2":
            sc1.add_prod()
        elif ch == "3":
            sc1.remove_prod()
        elif ch == "4":
            if len(sc1.myshoppingcart) == 0:
                print("Your shopping cart is empty!")
            else:
                sc1.generate_receipt()
        elif ch == "5":
            break
        else:
            print("Invalid option")

DATABASE TUTORIAL

DESCRIPTIVE STATISTICS

Descriptive Statistics - Types of Data

Quantitative (Numeric) Data:
- Continuous (Weight, Temperature, etc.) - measured on Ratio or Interval scales
- Discrete

Qualitative (Categorical) Data:
- Nominal (there is no order in the values): {M, F}, {N, S, E, W}
- Ordinal (there is an order): {1, 2, 3, 4, 5}, {Good, Average, Bad}

Other data types: Text, Audio, Video

UNIVARIATE - One Variable (a sketch for the first item follows the list)

- WEIGHT: FREQUENCY TABLE & HISTOGRAM
- MARKS: FREQUENCY TABLE & HISTOGRAM
- GENDER: BAR GRAPH / PIE CHART
- REGIONS: BAR GRAPH / PIE CHART
- COURSE RATING: BAR GRAPH / PIE CHART / LINE GRAPH (IN A PARTICULAR ORDER)
- TEMPERATURE: FREQUENCY TABLE & HISTOGRAM
- SALES: LINE GRAPH
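A minimal sketch of the WEIGHT item (the weights below are made-up sample values): a frequency table via pd.cut + value_counts, followed by a histogram.

import pandas as pd
import matplotlib.pyplot as plt

weights = pd.Series([52, 61, 58, 70, 66, 74, 59, 63, 68, 55])
print(pd.cut(weights, bins=3).value_counts().sort_index())  # frequency table
weights.plot(kind="hist", bins=3, title="Weight distribution")
plt.show()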