JUNE 2023 DataScience Tutorial PART- II

ACCESS THE PREVIOUS CONTENT HERE

use employees;

— create table, drop table, alter table

— to read data we use Select

Select * from departments;

 

insert into departments (DID, HOD, DNAME, DCODE) values (100,’Sachin’,’CSE’,’AA00′);

insert into departments  values (101,’MEC’,’Virat’,’AZ00′);

 

insert into departments  values (103,’ECE’,’Rohit’,’KZ00′);

insert into departments  values (105,’CIV’,’Dhoni’,’CZ00′);

insert into departments  values (106,’TCE’,’Sunil’,’BZ00′);

 

Select * from employees;

 

Select @@GLOBAL.secure_file_priv;

 

— Import data into CSV

LOAD DATA INFILE ‘C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/EmployeesData.csv’

INTO TABLE EMPLOYEES

FIELDS TERMINATED BY ‘,’

ENCLOSED BY ‘”‘

LINES TERMINATED BY ‘\n’

IGNORE 1 ROWS;

 

— DELETE to remove a row

DELETE FROM EMPLOYEES WHERE EMPID=120;

 

— Update to modify the existing values in a row

Update Employees

Set Bonus_PCT = 0.45, Salary = 160000

where empid=122;

 

Select * from employees where empid=122

 

—  SELECT

Select FNAME, EMAIL, PHONE, SALARY From Employees;

 

Select FNAME, EMAIL, PHONE, SALARY From Employees where salary >=100000;

 

Select FNAME, EMAIL, PHONE, SALARY From Employees where EMAIL =’ramakrishnavendra@wnbco.co’;

 

— Create Index to make query faster

Create Index idx_salary

on Employees (Salary)

 

—  Relational operators in MYSQL:  =   >   <   >=   <=   <>

Select FNAME, EMAIL, PHONE, SALARY From Employees where EMAIL <>’ramakrishnavendra@wnbco.co’;

 

— BETWEEN  LIKE  IN

 

use employees;

 

select * from employees;

 

select FNAME, LNAME, DOB, Salary from employees where salary > 75000;

— BETWEEN is used for numbers

select FNAME, LNAME, DOB, Salary from employees where salary BETWEEN 75000 AND  95000;

— below statement is same as above (BETWEEN)

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 AND  salary <= 95000;

 

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 AND  Bonus_pct <= 0.4; — 0 to 9 rows

 

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 OR  Bonus_pct <= 0.4; — 25 to 34 rows

 

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000   — 25 rows

 

select FNAME, LNAME, DOB, Salary from employees where  Bonus_pct <= 0.4;   — 9 rows

 

— LIKE – for text data comparison

Select * from employees where FNAME like ‘A%’

 

— IN – checking for given set of values

Select * from employees where FNAME IN (‘O%’, ‘A%’,’B%’);

 

Select * from employees where FNAME IN (‘Anikeet Dey’,’Amol Jain’,’Suresh Narayan Singh Yadav’ );

 

Select * from employees where SALARY IN (36107,  110266, 107799, 198890);

—  Logical operator – AND OR NOT

select * from employees where not( deptid = 100 and salary >100000)

 

—  ORDER BY

select * from employees order by salary  DESC;

 

select * from employees order by salary  ASC;  — Ascending order is by default

 

select * from employees order by DEPTID  ASC, Salary DESC;

use employees;

 

select * from employees where bonus_pct is not null;

 

select * from employees order by FNAME;

select * from employees order by FNAME DESC;

 

UPDATE employees SET BONUS_PCT = NULL WHERE BONUS_PCT > 0.88;

 

select * from employees where bonus_pct is not null limit 3;

 

select * from employees where bonus_pct is not null order by salary DESC limit 5;

 

select avg(salary) as Max_Salary, deptid from employees group by deptid;

 

select avg(salary), count(salary), sum(salary) from employees;

 

select * from employees where salary > (select avg(salary) from employees);

 

— Find highest salary earner for each department

select deptid, MAX(salary) as max_salary FROM employees group by deptid;

 

select deptid, fname, salary from employees where (Deptid, salary)  in 

(select deptid, MAX(salary) as max_salary FROM employees group by deptid);

 

— Questions for Bala:

— 1. Top 5 run scorer

— 2. Top 5 wickets taker

— 3. Top 5 catch takers

— 4. How many matches each team has won and lost

— 5. How many teams won and toss but lost the matches

— 6. How many teams lost by batting first

— 7. How many matches were interrupted by rain

— 8. Top run scorer for each team

— 9. Top wicket takers for each team

— 10. Top catch takers for each team

 

— ——————– ——–

— Questions for Vinay:

— 1. How much is my daily sales?

— 2. Top 5 contributing product by sales

— 3. Top 5 contributing product by count

— 4. Breakup of Payment mode (card/cash/upi/others)

— 5. Top 5 highest priced products

— 6. Which day of the week do more people tend to partake in online grocery shopping

— 7. Top best selling categories (number of transactions per category)

— 8. Top best selling categories (amount of sales)

— 9. Top 20% categories contribution to % of sales

— 10. What is the average sales for each of the 7 days.

— ——————– ——–

 

use employees;

 

select deptid, avg(salary) as “Average Salary” from employees group by deptid;

— COUNT, AVG, SUM, MIN, MAX

 

— Like:  %a, a%,  %a% ,  %b__% (exactly 1 value for _),   ‘a%z’

select fname, email, salary from employees;

 

select fname, email, salary from employees where fname like ‘%ol%’;

 

— J followed by exactly three characters

select fname, email, salary from employees where fname like ‘%J___’;

 

— J followed by atleast four characters

select fname, email, salary from employees where fname like ‘%J____%’;

 

select * from employees where fname in (‘Pankaj Prajapati’,’Manju Mishra’ ,’Arijeet Dasgupta’);

 

select * from employees where fname in (select fname from employees where fname like ‘%J____%’);

select * from employees where (fname,salary) in (select fname,salary from employees where fname like ‘%J____%’);

 

select fname, email, salary from employees where fname like ‘%dey’;

 

 

 

— Questions for Karan

— How many books are currently issued

— Who has read the maximum number of books

— Is there a member who has not issued a book so far

— How many books have been issued for more than a month

— List of books with more than 2000 rupees cost

— List of Books older than 20 years

— The most read books of the library

— The most read genre of library

— Average books that are issued per day

— How many readers have not returned the book on time so far

 

— —————- —- 

— combining data from 2 tables

use employees;

select * from employees;

select employees.FNAME, DID, departments.DNAME from employees, departments

 

 

select employees.FNAME, DID, departments.DNAME from employees, departments where employees.deptid = departments.did;

 

insert into departments values (107, ‘BIOT’,’Kapil’,’BB09′)

 

insert into employees (EMPID, FNAME, LNAME, DOB, EMAIL,PHONE, DOJ,SALARY) 

values (133,’Sachin Tendulkar’,’T’,’1990-05-04′,’sachin@wnbco.co’,9999000009,’2023-05-15′,231456);

 

select * from departments;

 

select employees.FNAME, DID, departments.DNAME from employees, departments where employees.deptid = departments.did;

 

 

— JOINS

 

use employees;

 

select * from employees;

 

select * from departments;

 

select fname, hod from employees, departments where departments.did = employees.DEPTID;  — Inner

 

select fname, hod from employees, departments where employees.DEPTID = departments.did order by empid;

 

select fname, hod from employees INNER JOIN departments ON employees.DEPTID = departments.did order by empid;

 

SELECT T1.COL1, T2.COL2, T3.COL3

FROM (((T1 INNER JOIN T2 ON T1.K1 = T2.K1) INNER JOIN T3 ON T2.K2 = T3.K2) INNER JOIN T4 ON T3.K3 = T4.K3);

 

 

select fname, hod from employees LEFT JOIN departments ON employees.DEPTID = departments.did order by empid;

 

use employees;

 

select fname, hod from employees left join departments on employees.DEPTID = departments.did

UNION

select fname, hod from departments left join employees  on employees.DEPTID = departments.did;

 

 

select fname, hod from departments inner join employees  on employees.DEPTID = departments.did;

 

select * from employees;

 

select sum(EMPID) from employees;

 

select count(EMPID) from employees;

 

select count(EMPID) from employees where salary > 50000;

 

select count(EMPID) , DEPTID from employees

group by deptid

having count(EMPID) > 5;

 

select count(fname), hod from employees left join departments on employees.DEPTID = departments.did

group by deptid

having count(fname) > 3;

 

 

select * from employees;

— Salary_Grade:  <50K: E 50-100K: D    100-150K – C  150-200-B  >200K: A

 

select fname, salary, bonus_pct, 

case when bonus_pct is null then salary

else salary+salary*bonus_pct 

end as total_salary from employees;

 

select fname, salary, 

CASE

when salary < 50000 Then ‘Grade: E’

    when salary > 200000 Then ‘Grade: A’

    when salary > 150000 Then ‘Grade: B’

    when salary > 100000 Then ‘Grade: C’

    Else ‘Grade: D’

End As Salary_Grade

from employees;

use employees;

 

select * from employees;

 

select FNAME, DOB, DOJ, datediff(doj,dob) from employees;

 

 

select fname, NOW() from employees;

 

select now(); — current date and time (system)

 

select date(now());

select curdate();

— default date format in MYSQL is  YYYY-MM-DD

— DD-MM-YYYY

select date_format(curdate(), ‘%d-%m-%Y’) as Date;

 

select curdate() as todays_date, date_add(curdate(),interval 1 Day) as Tomorrow,

date_add(curdate(),interval 1 Week) as next_week,

date_add(curdate(),interval 1 Month) as Next_month,

date_add(curdate(),interval 1 Year) as Next_year;

 

— Add date: Interval, day, week or month or year;

select curdate() as todays_date, date_sub(curdate(),interval 1 Day) as Yesterday,

date_sub(curdate(),interval 1 Week) as last_week,

date_sub(curdate(),interval 1 Month) as last_month,

date_sub(curdate(),interval 1 Year) as last_year;

 

 

select day(curdate()) as day, month(curdate()) as Month, quarter(curdate()) as Quarter, Year(curdate()) as Year,

Weekday(curdate()) as Weekday, week(curdate()) as week, weekofyear(curdate()) as WeekofYear;

 

 

select * from employees where salary > (select salary from employees where empid = 104);

 

—  create a stored procedure

DELIMITER $$

 

Create Procedure GetEmpInfo()

Begin

select * from employees where salary > (select salary from employees where empid = 104);

End $$

DELIMITER ;

 

call GetEmpInfo();

 

drop Procedure GetEmpInfo;

 

— Features: exception handling, conditions and loops (While, Repeat), 

 

— Creating a View

create view EmpData

as

Select EMPID, FNAME, LNAME, DOB, EMAIL, PHONE, DOJ, DEPTID from Employees;

 

select * from empdata;

 

 

—  String functions:  LTRIM, RTRIM, Replace, substring, COncat

select concat(fname,’.’,lname,’@wnbco.co’) from employees;

 

select fname, substring(fname, 3), substring(fname, -3) from employees;

 
import pymysql
con_str = pymysql.connect(host=“localhost”, user=“root”, password=“learnSQL”,database=“employees”)
cursor = con_str.cursor()

q1 = ”’Create Table PY_EMP (
EMPID INT PRIMARY KEY,
NAME VARCHAR(30),
CITY VARCHAR(15))
”’
#cursor.execute(q1)

q2 = ”’INSERT INTO PY_EMP VALUES(1,’Laxman’,’Hyderabad’)”’
cursor.execute(q2)
q2 = ”’INSERT INTO PY_EMP VALUES(2,’Rahul’,’Bangalore’)”’
cursor.execute(q2)


q3 = ”’Select * from Py_Emp”’
cursor.execute(q3)
results = cursor.fetchall()
for row in results:
print(row)

con_str.commit()

con_str.close()
# NUMPY
# pip install numpy
import numpy as np
nums = range(16)
nums = np.reshape(nums,(8,2))
print(nums)
nums = np.reshape(nums,(4,4))
print(nums)
print(“Shape: Rows = “,nums.shape[0], “and columns = “,nums.shape[1])
# indexing
print(nums[1,2], nums[-3,-2])
print(nums[1]) # 2nd row
print(nums[:,1]) # : rows from 0th to (n-1)th
print(nums[-1], nums[:,-2], nums[-1,-2])

# to give your own set of values, you need to provide in terms of list
l1 = [[1,5,7],[2,4,9],[1,1,3],[3,3,2]]
# array is a function to convert list into numpy
mat1 = np.array(l1)
print(mat1)

print(np.zeros((3,3)))
print(np.ones((3,3)))
print(np.full((5,7),2.0))
print(np.full((5,7),9))

# eye – identity matrix: square matrix with 1 on its main diagonal
mat1 = np.eye(5)
print(mat1)

# NUMPY
import numpy as np
# to give your own set of values, you need to provide in terms of list
l1 = [[1,5,7],[2,4,9],[1,1,3],[3,3,2]]
# array is a function to convert list into numpy
mat1 = np.array(l1) # 4 * 3 – shape
print(mat1)
l2 = [[2,3,4],[2,1,2],[5,2,3],[3,2,2]]
# array is a function to convert list into numpy
mat2 = np.array(l2)
print(mat2)

# Matrices operations
print(mat1 + mat2)
print(np.add(mat1, mat2))

print(mat1 – mat2)
print(np.subtract(mat1, mat2))

print(mat1 * mat2)
print(np.multiply(mat1, mat2))

print(mat1 / mat2)
print(np.divide(mat1, mat2))

# actual matrix multiplication is done using matmul()
l3 = [[2,3,4],[2,1,2],[5,2,3]]
# array is a function to convert list into numpy
mat3 = np.array(l3)
print(mat3)
print(“Matrix Multiplication”)
print(np.matmul(mat1, mat3))
print(mat1 @ mat3)
## calculating determinant

l4 = [[1,3,5],[1,3,1],[2,3,4]]
mat5 = np.array(l4)
det_mat5 = np.linalg.det(mat5)
print(“Determinant of matrix 5 is”,det_mat5)
print(“Inverse of matrix 5 is: \n,np.linalg.inv(mat5))

”’
Linear Algebra Equation:
x1 + 5×2 = 7
-2×1 – 7×2 = -5

x1 = -8, x2= 3,
”’
coeff_mat = np.array([[1,5],[-2,-7]])
#var_mat = np.array([[x1],[x2]])
result_mat = np.array([[7],[-5]])
# equation here is coeff_mat * var_mat = result_mat [eg: 5 * x = 10]
# which is, var_mat = coeff_mat inv * result_mat
det_coeff_mat = np.linalg.det(coeff_mat)
if det_coeff_mat !=0:
var_mat = np.linalg.inv(coeff_mat) @ result_mat
print(“X1 = “,var_mat[0,0])
print(“X2 = “,var_mat[1,0])
else:
print(“Solution is not possible”)

# # scipy = scientific python
# pip install scipy
”’
#Inequality = OPTIMIZATION or MAXIMIZATION / MINIMIZATION PROBLEM
Computer Parts Assembly:
Laptops & Desktops
profit: 1000, 600
objective: either maximize profit or minimize cost

constraints:
1. Demand: 500, 600
2. Parts: Memory card: 5000 cards available
3. Manpower: 25000 minutes


”’
”’
Optimization using Scipy
let’s assume d = desktop, n = notebooks

Constraints:
1. d + n <= 10000
2. 2d + n <= 15000
3. 3d + 4n <= 25000

profit: 1000 d + 750 n => maximize
-1000d – 750 n =>minimize

”’
import numpy as np
from scipy.optimize import minimize, linprog
d = 1
n = 1
profit_d = 1000
profit_n = 750
profit = d * profit_d + n * profit_n
obj = [-profit_d, -profit_n]
lhs_con = [[1,1],[2,1],[3,4]]
rhs_con = [10000, 15000, 25000]

boundary = [(0, float(“inf”)), # boundary condition for # of desktops
(10, 200000)] # we just added some limit for notebooks
opt = linprog(c=obj, A_ub=lhs_con, b_ub=rhs_con, bounds=boundary, method=“revised simplex”)
print(opt)
if opt.success:
print(f”Number of desktops = {opt.x[0]} and number of laptops = {opt.x[1]})
print(“Maximum profit that can be generated = “,-1 * opt.fun)
else:
print(“Solution can not be generated”)

### ### ### PANDAS
# Pandas – dataframe which resembles Table structure
# pip install pandas
import pandas as pd
df1 = pd.DataFrame()
print(df1)
print(type(df1))

# fruit production
data = [[“Apple”, 15000, 11000,6000],
[“Banana”, 18000,22000,29000],
[“Mango”, 2, 900, 19000],
[“Guava”, 19000,11000,25000]]

fruit_production = pd.DataFrame(data)
print(fruit_production)
print(“Slicing 1:\n)
print(fruit_production.iloc[1:3,2:]) #based on index
print(“Slicing 2:\n)
print(fruit_production.loc[1:3,2:]) #based on title(names)

fruit_production = pd.DataFrame(data,
columns=[“Fruits”,“January”,“February”,“March”])
print(fruit_production)

fruit_production = pd.DataFrame(data,
columns=[“Fruits”,“January”,“February”,“March”],
index=[“Fruit 1”,“Fruit 2”,“Fruit 3”,“Fruit 4”])
print(fruit_production)

## dataframe.loc() dataframe.iloc()

print(“Slicing 1:\n)
print(fruit_production.iloc[1:3,2:]) #based on index
print(“Slicing 2:\n)
print(fruit_production.loc[[“Fruit 2”, “Fruit 3”],[“February”,“March”]]) #based on title(names)

### ###
# pandas
# pip install pandas
import pandas as pd
l1 = [10,20,30,40,50]
l1 = [[“Sachin”,101,20000,“BATSMAN”],[“Kapil”,501,12000,“BOWLER”],
[“Sunil”,12,21000,“BATSMAN”],[“Zaheer”,725,2000,“BOWLER”]]
df1 = pd.DataFrame(l1,columns=[“Player”,“Wickets”,“Runs”,“Type”],
index=[“Player 1”,“Player 2”,“Player 3”,“Player 4”])
print(df1)

d1 = {‘Apple’:[12000,11000,13000],
‘Banana’: [17000,18000,19000],
‘Mango’:[11000,13000,15000]}
df2 = pd.DataFrame(d1)
print(df2)

# creating dataframe from list of dictionary
data1 = [{“Guava”:9000, “Oranges”: 5000},
{“Guava”:8000, “Oranges”: 7000},
{“Guava”:10000, “Oranges”: 6000}]
df3 = pd.DataFrame(data1)
print(df3)

print(df3.iloc[0,:]) #first row and all column values
print(df3.iloc[:,0])

print(df2.iloc[:,0:2])
print(df2.iloc[[0,2],[0,2]])

#
print(df2.loc[[0,2],[“Apple”,“Mango”]])
print(df1.loc[[“Player 1”,“Player 4”],[“Player”,“Runs”]])

df2.iloc[2,0] = 14000
print(df2)
print(“========= DF1 =============”)
df1[‘Avg’] = df1[‘Runs’] / df1[“Wickets”]
print(df1)
print(“Reading data from DF1: “)
df4 = df1[df1.Player !=‘Sachin’] #filter where clause
print(\n\n New dataset without Sachin: \n, df4)
df1 = df1.drop(“Player”,axis=1) # axis default is 0
# unlike pop() and del – drop() returns a new dataframe
print(df1)


print(“Average Wickets of all the players = “,df1[‘Wickets’].mean())
print(“Average Wickets of players by type = \n\n,df1.groupby(‘Type’).mean())
# axis = 0 refers to rows
# axis = 1 refers to columns

print(\n\nDropping columns from DF1: “)
del df1[‘Wickets’] #dropping column Wickets using del
print(df1)

df1.pop(‘Runs’) #dropping column using pop
print(df1)
#
import pandas as pd

ud_df = pd.read_csv(“D:/datasets/gitdataset/user_device.csv”)
print(ud_df) # 272 rows x 6 columns
print(“Rows: “,ud_df.shape[0])
print(“Columns: “,ud_df.shape[1])

print(ud_df.tail(1))
print(ud_df.head(1))

use_df = pd.read_csv(“D:/datasets/gitdataset/user_usage.csv”)
print(use_df) # 240 rows x 4 columns

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
‘outgoing_mins_per_month’]], ud_df,
on=‘use_id’)
print(result_df) # [159 rows x 9 columns] = ud_df: 159 + 113, use_df = 159 + 81

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
‘outgoing_mins_per_month’]], ud_df,
on=‘use_id’, how=‘outer’)
print(result_df)

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
‘outgoing_mins_per_month’]], ud_df,
on=‘use_id’, how=‘left’)
print(result_df)

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
‘outgoing_mins_per_month’]], ud_df,
on=‘use_id’, how=‘right’)
print(result_df)

## Working with Pandas – Example ##
import pandas as pd
import numpy as np
df = pd.read_csv(“D:/datasets/gitdataset/hotel_bookings.csv”)
print(df.shape)
print(df.dtypes)
”’
numeric – int, float
categorical – 1) Nominal – there is no order 2) Ordinal – here order is imp
”’
df_numeric = df.select_dtypes(include=[np.number])
print(df_numeric)

df_object= df.select_dtypes(exclude=[np.number])
print(df_object) # categorical and date columns

print(df.columns)
for col in df.columns:
missing = np.mean(df[col].isnull())
if missing >0:
print(f”{col}{missing})
”’
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
a. Handle missing values
b. Feature scaling – scale the values in the column to similar range
c. Outliers / data correction
d. handling categorical data:
i. Encode the data to convert text to number
East = 0, North = 1, South = 2, West = 3
ii. Column Transform into multple columns
iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING – Divide the train and test


”’
import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
print(df)
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
a. Handle missing values
b. Feature scaling – scale the values in the column to similar range
c. Outliers / data correction
d. handling categorical data:
i. Encode the data to convert text to number
East = 0, North = 1, South = 2, West = 3
ii. Column Transform into multple columns
iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING –
a. Divide the train and test
b. Run the model
6. EVALUATE THE MODEL:
a. Measure the performance of each algorithm on the test data
b. Metric to compare: based on Regression (MSE, RMSE, R square) or
classification (confusion matrix -accuracy, sensitivity..)
c. select the best performing model
7. DEPLOY THE BEST PERFORMING MODEL

Hypothesis test:
1. Null Hypothesis (H0): starting statement (objective)
Alternate Hypethesis (H1): Alternate of H0

Z or T test:
Chi square test: both are categorical

e.g. North zone: 50 WIN 5 LOSS – p = 0.005

# simple (single value) v composite (specifies range)
# two tailed test v one tailed test [H0: mean = 0,
H1 Left Tailed: mean <0
H1 Right Tailed: mean >0
# level of significance:
alpha value: confidence interval – 95%
p value: p value <0.05 – we reject Null Hypothesis
import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print(“X: \n”)
#print(X)
#print(“Y: \n”)
#print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, its sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=‘mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])
print(X)
import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print(“X: \n”)
#print(X)
#print(“Y: \n”)
#print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, its sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=‘mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=‘passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
print(X_train)
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’
Regression: Output (Marks) is a continous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that dataset is linear) Regression
X – independent variable(s)
Y – dependent variable
”’
import pandas as pd
import matplotlib.pyplot as plt
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv”
df = pd.read_csv(link)
X = df.iloc[:,:1].values
y = df.iloc[:,1].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)
”’

# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Hours’],y=df[‘Marks’])
plt.show()
”’
Scatter plots – shows relationship between X and Y variables. You can have:
1. Positive correlation:
2. Negative correlation:
3. No Correlation
4. Correlation: 0 to +/- 1
5. Correlation value: 0 to +/- 0.5 : no correlation
6. Strong correlation value will be closer to +/- 1
7. Equation: straight line => y = mx + c
”’
# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’

## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_})

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output