USE employees;
-- create table, drop table, alter table
-- to read data we use SELECT
SELECT * FROM departments;
-- NOTE(review): this column list pairs HOD='Sachin' with DNAME='CSE'
INSERT INTO departments (DID, HOD, DNAME, DCODE) VALUES (100, 'Sachin', 'CSE', 'AA00');
-- explicit column lists added so these inserts do not silently depend on the
-- table's physical column order (values read as dept name first, then HOD)
INSERT INTO departments (DID, DNAME, HOD, DCODE) VALUES (101, 'MEC', 'Virat', 'AZ00');
INSERT INTO departments (DID, DNAME, HOD, DCODE) VALUES (103, 'ECE', 'Rohit', 'KZ00');
INSERT INTO departments (DID, DNAME, HOD, DCODE) VALUES (105, 'CIV', 'Dhoni', 'CZ00');
INSERT INTO departments (DID, DNAME, HOD, DCODE) VALUES (106, 'TCE', 'Sunil', 'BZ00');
SELECT * FROM employees;
SELECT @@GLOBAL.secure_file_priv;
-- Import data FROM a CSV file into the employees table
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/EmployeesData.csv'
INTO TABLE employees
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
-- DELETE to remove a row
DELETE FROM employees WHERE EMPID = 120;
-- UPDATE to modify the existing values in a row
UPDATE employees
SET Bonus_PCT = 0.45, Salary = 160000
WHERE empid = 122;
SELECT * FROM employees WHERE empid = 122;
-- SELECT specific columns
SELECT FNAME, EMAIL, PHONE, SALARY FROM employees;
SELECT FNAME, EMAIL, PHONE, SALARY FROM employees WHERE salary >= 100000;
SELECT FNAME, EMAIL, PHONE, SALARY FROM employees WHERE EMAIL = 'ramakrishnavendra@wnbco.co';
-- Create an index to speed up queries that filter/sort on Salary
CREATE INDEX idx_salary
ON employees (Salary);
-- Relational operators in MySQL: = > < >= <= <>
SELECT FNAME, EMAIL, PHONE, SALARY FROM employees WHERE EMAIL <> 'ramakrishnavendra@wnbco.co';
-- BETWEEN, LIKE, IN
USE employees;
SELECT * FROM employees;
SELECT FNAME, LNAME, DOB, Salary FROM employees WHERE salary > 75000;
-- BETWEEN is inclusive on both ends (used for ranges of numbers/dates)
SELECT FNAME, LNAME, DOB, Salary FROM employees WHERE salary BETWEEN 75000 AND 95000;
-- below statement is the same as the BETWEEN above
SELECT FNAME, LNAME, DOB, Salary FROM employees WHERE salary >= 75000 AND salary <= 95000;
SELECT FNAME, LNAME, DOB, Salary FROM employees WHERE salary >= 75000 AND Bonus_pct <= 0.4; -- 0 to 9 rows
SELECT FNAME, LNAME, DOB, Salary FROM employees WHERE salary >= 75000 OR Bonus_pct <= 0.4;  -- 25 to 34 rows
SELECT FNAME, LNAME, DOB, Salary FROM employees WHERE salary >= 75000;                      -- 25 rows
SELECT FNAME, LNAME, DOB, Salary FROM employees WHERE Bonus_pct <= 0.4;                     -- 9 rows
-- LIKE: pattern comparison for text data
SELECT * FROM employees WHERE FNAME LIKE 'A%';
-- IN checks membership in a literal set; wildcards are NOT expanded inside IN
-- (the original IN ('O%','A%','B%') compared literal strings), so patterns
-- must be written as LIKE conditions combined with OR:
SELECT * FROM employees WHERE FNAME LIKE 'O%' OR FNAME LIKE 'A%' OR FNAME LIKE 'B%';
SELECT * FROM employees WHERE FNAME IN ('Anikeet Dey', 'Amol Jain', 'Suresh Narayan Singh Yadav');
SELECT * FROM employees WHERE SALARY IN (36107, 110266, 107799, 198890);
-- Logical operators: AND OR NOT
SELECT * FROM employees WHERE NOT (deptid = 100 AND salary > 100000);
-- ORDER BY
SELECT * FROM employees ORDER BY salary DESC;
SELECT * FROM employees ORDER BY salary ASC; -- ascending order is the default
SELECT * FROM employees ORDER BY DEPTID ASC, Salary DESC;
USE employees;
SELECT * FROM employees WHERE bonus_pct IS NOT NULL;
SELECT * FROM employees ORDER BY FNAME;
SELECT * FROM employees ORDER BY FNAME DESC;
UPDATE employees SET BONUS_PCT = NULL WHERE BONUS_PCT > 0.88;
SELECT * FROM employees WHERE bonus_pct IS NOT NULL LIMIT 3;
SELECT * FROM employees WHERE bonus_pct IS NOT NULL ORDER BY salary DESC LIMIT 5;
-- alias corrected: this is the AVERAGE (not max) salary per department
SELECT AVG(salary) AS Avg_Salary, deptid FROM employees GROUP BY deptid;
SELECT AVG(salary), COUNT(salary), SUM(salary) FROM employees;
SELECT * FROM employees WHERE salary > (SELECT AVG(salary) FROM employees);
-- Find the highest salary earner for each department:
-- match (deptid, salary) pairs against each department's maximum
SELECT deptid, MAX(salary) AS max_salary FROM employees GROUP BY deptid;
SELECT deptid, fname, salary FROM employees WHERE (Deptid, salary) IN
(SELECT deptid, MAX(salary) FROM employees GROUP BY deptid);
-- Questions for Bala:
-- 1. Top 5 run scorers
-- 2. Top 5 wicket takers
-- 3. Top 5 catch takers
-- 4. How many matches each team has won and lost
-- 5. How many teams won the toss but lost the match
-- 6. How many teams lost after batting first
-- 7. How many matches were interrupted by rain
-- 8. Top run scorer for each team
-- 9. Top wicket takers for each team
-- 10. Top catch takers for each team
-- -------------------- --------
-- Questions for Vinay:
-- 1. How much is my daily sales?
-- 2. Top 5 contributing products by sales amount
-- 3. Top 5 contributing products by transaction count
-- 4. Breakup of payment modes (card/cash/UPI/others)
-- 5. Which day of the week do more people tend to partake in online grocery shopping?
-- 6. Top 5 highest priced products
-- 7. Top best-selling categories (number of transactions per category)
-- 8. Top best-selling categories (amount of sales)
-- 9. Top 20% of categories' contribution to % of sales
-- 10. What is the average sales for each of the 7 days?
-- -------------------- --------
USE employees;
SELECT deptid, AVG(salary) AS "Average Salary" FROM employees GROUP BY deptid;
-- Aggregate functions: COUNT, AVG, SUM, MIN, MAX
-- LIKE patterns: %a, a%, %a%, %b__% (each _ matches exactly 1 character), 'a%z'
SELECT fname, email, salary FROM employees;
SELECT fname, email, salary FROM employees WHERE fname LIKE '%ol%';
-- names ending in J followed by exactly three characters
SELECT fname, email, salary FROM employees WHERE fname LIKE '%J___';
-- names containing J followed by at least four characters
SELECT fname, email, salary FROM employees WHERE fname LIKE '%J____%';
SELECT * FROM employees WHERE fname IN ('Pankaj Prajapati', 'Manju Mishra', 'Arijeet Dasgupta');
SELECT * FROM employees WHERE fname IN (SELECT fname FROM employees WHERE fname LIKE '%J____%');
SELECT * FROM employees WHERE (fname, salary) IN (SELECT fname, salary FROM employees WHERE fname LIKE '%J____%');
SELECT fname, email, salary FROM employees WHERE fname LIKE '%dey';
-- Questions for Karan
-- How many books are currently issued?
-- Who has read the maximum number of books?
-- Is there a member who has not issued a book so far?
-- How many books have been issued for more than a month?
-- List of books that cost more than 2000 rupees
-- List of books older than 20 years
-- The most read books of the library
-- The most read genre of the library
-- Average number of books issued per day
-- How many readers have not returned a book on time so far?
-- ---------------- ----
-- combining data from 2 tables
USE employees;
SELECT * FROM employees;
-- without a join condition this is a cartesian product (every employee x every department)
SELECT employees.FNAME, DID, departments.DNAME FROM employees, departments;
SELECT employees.FNAME, DID, departments.DNAME FROM employees, departments WHERE employees.deptid = departments.did;
-- explicit column list so the insert does not depend on physical column order
INSERT INTO departments (DID, DNAME, HOD, DCODE) VALUES (107, 'BIOT', 'Kapil', 'BB09');
INSERT INTO employees (EMPID, FNAME, LNAME, DOB, EMAIL, PHONE, DOJ, SALARY)
VALUES (133, 'Sachin Tendulkar', 'T', '1990-05-04', 'sachin@wnbco.co', 9999000009, '2023-05-15', 231456);
SELECT * FROM departments;
SELECT employees.FNAME, DID, departments.DNAME FROM employees, departments WHERE employees.deptid = departments.did;
-- JOINS
USE employees;
SELECT * FROM employees;
SELECT * FROM departments;
SELECT fname, hod FROM employees, departments WHERE departments.did = employees.DEPTID; -- implicit inner join
SELECT fname, hod FROM employees, departments WHERE employees.DEPTID = departments.did ORDER BY empid;
-- preferred ANSI syntax for the same inner join
SELECT fname, hod FROM employees INNER JOIN departments ON employees.DEPTID = departments.did ORDER BY empid;
-- template for chaining multiple inner joins (T1..T4 are placeholder tables)
SELECT T1.COL1, T2.COL2, T3.COL3
FROM (((T1 INNER JOIN T2 ON T1.K1 = T2.K1) INNER JOIN T3 ON T2.K2 = T3.K2) INNER JOIN T4 ON T3.K3 = T4.K3);
SELECT fname, hod FROM employees LEFT JOIN departments ON employees.DEPTID = departments.did ORDER BY empid;
USE employees;
-- emulating FULL OUTER JOIN in MySQL: LEFT JOIN each way, combined with UNION
-- (UNION also removes duplicate rows here)
SELECT fname, hod FROM employees LEFT JOIN departments ON employees.DEPTID = departments.did
UNION
SELECT fname, hod FROM departments LEFT JOIN employees ON employees.DEPTID = departments.did;
SELECT fname, hod FROM departments INNER JOIN employees ON employees.DEPTID = departments.did;
SELECT * FROM employees;
SELECT SUM(EMPID) FROM employees;
SELECT COUNT(EMPID) FROM employees;
SELECT COUNT(EMPID) FROM employees WHERE salary > 50000;
-- WHERE filters rows before grouping; HAVING filters groups after aggregation
SELECT COUNT(EMPID), DEPTID FROM employees
GROUP BY deptid
HAVING COUNT(EMPID) > 5;
SELECT COUNT(fname), hod FROM employees LEFT JOIN departments ON employees.DEPTID = departments.did
GROUP BY deptid
HAVING COUNT(fname) > 3;
SELECT * FROM employees;
-- Salary_Grade: <50K: E, 50-100K: D, 100-150K: C, 150-200K: B, >200K: A
-- total salary = salary plus bonus; a NULL bonus is treated as no bonus
SELECT fname, salary, bonus_pct,
CASE WHEN bonus_pct IS NULL THEN salary
     ELSE salary + salary * bonus_pct
END AS total_salary FROM employees;
-- CASE evaluates branches top-down; the first matching WHEN wins,
-- so the descending salary bands act as ELSE-IF branches
SELECT fname, salary,
CASE
    WHEN salary < 50000 THEN 'Grade: E'
    WHEN salary > 200000 THEN 'Grade: A'
    WHEN salary > 150000 THEN 'Grade: B'
    WHEN salary > 100000 THEN 'Grade: C'
    ELSE 'Grade: D'
END AS Salary_Grade
FROM employees;
USE employees;
SELECT * FROM employees;
-- DATEDIFF(a, b) returns a - b in days
SELECT FNAME, DOB, DOJ, DATEDIFF(doj, dob) FROM employees;
SELECT fname, NOW() FROM employees;
SELECT NOW();       -- current date and time (system)
SELECT DATE(NOW()); -- date part only
SELECT CURDATE();
-- default date format in MySQL is YYYY-MM-DD
-- format as DD-MM-YYYY
SELECT DATE_FORMAT(CURDATE(), '%d-%m-%Y') AS Date;
-- DATE_ADD: add an INTERVAL of day, week, month or year
SELECT CURDATE() AS todays_date, DATE_ADD(CURDATE(), INTERVAL 1 DAY) AS Tomorrow,
DATE_ADD(CURDATE(), INTERVAL 1 WEEK) AS next_week,
DATE_ADD(CURDATE(), INTERVAL 1 MONTH) AS Next_month,
DATE_ADD(CURDATE(), INTERVAL 1 YEAR) AS Next_year;
-- DATE_SUB: same intervals, going backwards
SELECT CURDATE() AS todays_date, DATE_SUB(CURDATE(), INTERVAL 1 DAY) AS Yesterday,
DATE_SUB(CURDATE(), INTERVAL 1 WEEK) AS last_week,
DATE_SUB(CURDATE(), INTERVAL 1 MONTH) AS last_month,
DATE_SUB(CURDATE(), INTERVAL 1 YEAR) AS last_year;
-- extracting calendar parts from a date
SELECT DAY(CURDATE()) AS day, MONTH(CURDATE()) AS Month, QUARTER(CURDATE()) AS Quarter, YEAR(CURDATE()) AS Year,
WEEKDAY(CURDATE()) AS Weekday, WEEK(CURDATE()) AS week, WEEKOFYEAR(CURDATE()) AS WeekofYear;
-- employees earning more than employee 104
SELECT * FROM employees WHERE salary > (SELECT salary FROM employees WHERE empid = 104);
-- create a stored procedure (DELIMITER lets the body contain semicolons)
DELIMITER $$
CREATE PROCEDURE GetEmpInfo()
BEGIN
    SELECT * FROM employees WHERE salary > (SELECT salary FROM employees WHERE empid = 104);
END $$
DELIMITER ;
CALL GetEmpInfo();
DROP PROCEDURE GetEmpInfo;
-- Procedure features: exception handling, conditions and loops (WHILE, REPEAT)
-- Creating a view: exposes employees WITHOUT the salary/bonus columns
CREATE VIEW EmpData
AS
SELECT EMPID, FNAME, LNAME, DOB, EMAIL, PHONE, DOJ, DEPTID FROM Employees;
SELECT * FROM empdata;
-- String functions: LTRIM, RTRIM, REPLACE, SUBSTRING, CONCAT
SELECT CONCAT(fname, '.', lname, '@wnbco.co') FROM employees;
-- SUBSTRING(s, 3) = from the 3rd character; SUBSTRING(s, -3) = last 3 characters
SELECT fname, SUBSTRING(fname, 3), SUBSTRING(fname, -3) FROM employees;
# MySQL access from Python via pymysql (assumes `import pymysql` earlier in
# the file; requires a local MySQL server with the employees database).
con_str = pymysql.connect(host="localhost", user="root",
                          password="learnSQL", database="employees")
cursor = con_str.cursor()
# DDL: create the table once; kept commented out after the first run.
q1 = '''Create Table PY_EMP (
EMPID INT PRIMARY KEY,
NAME VARCHAR(30),
CITY VARCHAR(15))
'''
#cursor.execute(q1)
q2 = '''INSERT INTO PY_EMP VALUES(1,'Laxman','Hyderabad')'''
cursor.execute(q2)
q2 = '''INSERT INTO PY_EMP VALUES(2,'Rahul','Bangalore')'''
cursor.execute(q2)
q3 = '''Select * from Py_Emp'''
cursor.execute(q3)
results = cursor.fetchall()
for row in results:
    print(row)
con_str.commit()  # persist the INSERTs
con_str.close()
# pip install numpy
import numpy as np

# range(16) -> 8x2, then 4x4 (reshape only requires total sizes to match)
nums = range(16)
nums = np.reshape(nums, (8, 2))
print(nums)
nums = np.reshape(nums, (4, 4))
print(nums)
print("Shape: Rows = ", nums.shape[0], "and columns = ", nums.shape[1])
# indexing: [row, col]; negative indices count from the end
print(nums[1, 2], nums[-3, -2])
print(nums[1])      # 2nd row
print(nums[:, 1])   # all rows, 2nd column
print(nums[-1], nums[:, -2], nums[-1, -2])
# to give your own set of values, provide them as a (nested) list
l1 = [[1, 5, 7], [2, 4, 9], [1, 1, 3], [3, 3, 2]]
# np.array converts a list into a numpy array
mat1 = np.array(l1)
print(mat1)
# constant matrices
print(np.zeros((3, 3)))
print(np.ones((3, 3)))
print(np.full((5, 7), 2.0))
print(np.full((5, 7), 9))
# eye - identity matrix: square matrix with 1s on the main diagonal
mat1 = np.eye(5)
print(mat1)
import numpy as np

# to give your own set of values, provide them as a nested list
l1 = [[1, 5, 7], [2, 4, 9], [1, 1, 3], [3, 3, 2]]
mat1 = np.array(l1)  # shape (4, 3)
print(mat1)
l2 = [[2, 3, 4], [2, 1, 2], [5, 2, 3], [3, 2, 2]]
mat2 = np.array(l2)
print(mat2)
# element-wise matrix operations (operator and np.* forms are equivalent)
print(mat1 + mat2)
print(np.add(mat1, mat2))
print(mat1 - mat2)
print(np.subtract(mat1, mat2))
print(mat1 * mat2)  # element-wise, NOT matrix multiplication
print(np.multiply(mat1, mat2))
print(mat1 / mat2)
print(np.divide(mat1, mat2))
# actual matrix multiplication is done using matmul() or the @ operator
l3 = [[2, 3, 4], [2, 1, 2], [5, 2, 3]]
mat3 = np.array(l3)
print(mat3)
print("Matrix Multiplication")
print(np.matmul(mat1, mat3))  # (4,3) @ (3,3) -> (4,3)
print(mat1 @ mat3)
## calculating the determinant and inverse
l4 = [[1, 3, 5], [1, 3, 1], [2, 3, 4]]
mat5 = np.array(l4)
det_mat5 = np.linalg.det(mat5)
print("Determinant of matrix 5 is", det_mat5)
print("Inverse of matrix 5 is: \n", np.linalg.inv(mat5))
'''
Linear Algebra Equation:
x1 + 5*x2 = 7
-2*x1 - 7*x2 = -5
Expected solution: x1 = -8, x2 = 3
'''
coeff_mat = np.array([[1, 5], [-2, -7]])
#var_mat = np.array([[x1],[x2]])
result_mat = np.array([[7], [-5]])
# equation: coeff_mat @ var_mat = result_mat  (like 5 * x = 10)
# => var_mat = inv(coeff_mat) @ result_mat
det_coeff_mat = np.linalg.det(coeff_mat)
if det_coeff_mat != 0:  # invertible only when the determinant is non-zero
    var_mat = np.linalg.inv(coeff_mat) @ result_mat
    print("X1 = ", var_mat[0, 0])
    print("X2 = ", var_mat[1, 0])
else:
    print("Solution is not possible")
# scipy = scientific python (pip install scipy)
'''
Inequality => OPTIMIZATION: MAXIMIZATION / MINIMIZATION PROBLEM
Computer Parts Assembly:
Laptops & Desktops
profit: 1000, 600
objective: either maximize profit or minimize cost
constraints:
1. Demand: 500, 600
2. Parts: Memory card: 5000 cards available
3. Manpower: 25000 minutes

Optimization using Scipy
let's assume d = desktops, n = notebooks
Constraints:
1. d + n <= 10000
2. 2d + n <= 15000
3. 3d + 4n <= 25000
profit: 1000 d + 750 n => maximize
linprog only minimizes, so minimize -1000 d - 750 n instead
'''
import numpy as np
from scipy.optimize import minimize, linprog

profit_d = 1000  # profit per desktop
profit_n = 750   # profit per notebook
# negate the profits because linprog minimizes the objective
obj = [-profit_d, -profit_n]
lhs_con = [[1, 1], [2, 1], [3, 4]]  # A_ub: constraint coefficients
rhs_con = [10000, 15000, 25000]     # b_ub: constraint limits
boundary = [(0, float("inf")),  # boundary condition for # of desktops
            (10, 200000)]       # arbitrary lower/upper limit for notebooks
# method="highs" replaces "revised simplex", which was deprecated and
# removed in SciPy 1.11
opt = linprog(c=obj, A_ub=lhs_con, b_ub=rhs_con, bounds=boundary, method="highs")
print(opt)
if opt.success:
    print(f"Number of desktops = {opt.x[0]} and number of laptops = {opt.x[1]}")
    print("Maximum profit that can be generated = ", -1 * opt.fun)
else:
    print("Solution can not be generated")
### ### ### PANDAS
# Pandas - a DataFrame resembles a table structure
# pip install pandas
import pandas as pd

df1 = pd.DataFrame()  # empty frame
print(df1)
print(type(df1))
# fruit production per month
data = [["Apple", 15000, 11000, 6000],
        ["Banana", 18000, 22000, 29000],
        ["Mango", 2, 900, 19000],
        ["Guava", 19000, 11000, 25000]]
fruit_production = pd.DataFrame(data)
print(fruit_production)
print("Slicing 1:\n")
print(fruit_production.iloc[1:3, 2:])  # iloc: position-based, end-exclusive
print("Slicing 2:\n")
print(fruit_production.loc[1:3, 2:])   # loc: label-based, end-INCLUSIVE
fruit_production = pd.DataFrame(data,
                                columns=["Fruits", "January", "February", "March"])
print(fruit_production)
fruit_production = pd.DataFrame(data,
                                columns=["Fruits", "January", "February", "March"],
                                index=["Fruit 1", "Fruit 2", "Fruit 3", "Fruit 4"])
print(fruit_production)
## dataframe.loc[] vs dataframe.iloc[]
print("Slicing 1:\n")
print(fruit_production.iloc[1:3, 2:])  # based on integer positions
print("Slicing 2:\n")
print(fruit_production.loc[["Fruit 2", "Fruit 3"], ["February", "March"]])  # based on labels
### ###
# pip install pandas
import pandas as pd

l1 = [10, 20, 30, 40, 50]  # overwritten immediately below
l1 = [["Sachin", 101, 20000, "BATSMAN"], ["Kapil", 501, 12000, "BOWLER"],
      ["Sunil", 12, 21000, "BATSMAN"], ["Zaheer", 725, 2000, "BOWLER"]]
df1 = pd.DataFrame(l1, columns=["Player", "Wickets", "Runs", "Type"],
                   index=["Player 1", "Player 2", "Player 3", "Player 4"])
print(df1)
# DataFrame from a dict: keys become columns
d1 = {'Apple': [12000, 11000, 13000],
      'Banana': [17000, 18000, 19000],
      'Mango': [11000, 13000, 15000]}
df2 = pd.DataFrame(d1)
print(df2)
# creating a DataFrame from a list of dictionaries: each dict is one row
data1 = [{"Guava": 9000, "Oranges": 5000},
         {"Guava": 8000, "Oranges": 7000},
         {"Guava": 10000, "Oranges": 6000}]
df3 = pd.DataFrame(data1)
print(df3)
print(df3.iloc[0, :])  # first row and all column values
print(df3.iloc[:, 0])  # first column
print(df2.iloc[:, 0:2])
print(df2.iloc[[0, 2], [0, 2]])
# loc uses labels (df2's row labels are the default integers)
print(df2.loc[[0, 2], ["Apple", "Mango"]])
print(df1.loc[["Player 1", "Player 4"], ["Player", "Runs"]])
df2.iloc[2, 0] = 14000  # update a single cell
print(df2)
print("========= DF1 =============")
df1['Avg'] = df1['Runs'] / df1["Wickets"]  # derived column
print(df1)
print("Reading data from DF1: ")
df4 = df1[df1.Player != 'Sachin']  # filter, like a WHERE clause
print("\n\n New dataset without Sachin: \n", df4)
df1 = df1.drop("Player", axis=1)  # axis=1 drops a column (axis default is 0)
# unlike pop() and del, drop() returns a new dataframe
print(df1)
print("Average Wickets of all the players = ", df1['Wickets'].mean())
print("Average Wickets of players by type = \n\n", df1.groupby('Type').mean())
# axis = 0 refers to rows
# axis = 1 refers to columns
print("\n\nDropping columns from DF1: ")
del df1['Wickets']  # dropping column Wickets using del
print(df1)
df1.pop('Runs')  # dropping column using pop (mutates in place)
print(df1)
# merging two CSV datasets with pandas (SQL-style joins on 'use_id')
ud_df = pd.read_csv("D:/datasets/gitdataset/user_device.csv")
print(ud_df)  # 272 rows x 6 columns
print("Rows: ", ud_df.shape[0])
print("Columns: ", ud_df.shape[1])
print(ud_df.tail(1))
print(ud_df.head(1))
use_df = pd.read_csv("D:/datasets/gitdataset/user_usage.csv")
print(use_df)  # 240 rows x 4 columns
# default how='inner': only keys present in both frames survive
result_df = pd.merge(use_df[['use_id', 'monthly_mb', 'outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id')
print(result_df)  # [159 rows x 9 columns] = ud_df: 159 + 113, use_df = 159 + 81
# outer join: keep unmatched rows from both sides (NaN where missing)
result_df = pd.merge(use_df[['use_id', 'monthly_mb', 'outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='outer')
print(result_df)
# left join: keep every row of use_df
result_df = pd.merge(use_df[['use_id', 'monthly_mb', 'outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='left')
print(result_df)
# right join: keep every row of ud_df
result_df = pd.merge(use_df[['use_id', 'monthly_mb', 'outgoing_sms_per_month',
                             'outgoing_mins_per_month']], ud_df,
                     on='use_id', how='right')
print(result_df)
## Working with Pandas - Example ##
import pandas as pd
import numpy as np

df = pd.read_csv("D:/datasets/gitdataset/hotel_bookings.csv")
print(df.shape)
print(df.dtypes)
'''
numeric - int, float
categorical - 1) Nominal - there is no order 2) Ordinal - here order is imp
'''
df_numeric = df.select_dtypes(include=[np.number])
print(df_numeric)
df_object = df.select_dtypes(exclude=[np.number])
print(df_object)  # categorical and date columns
print(df.columns)
# report the fraction of missing values per column
# (mean of the boolean isnull mask == share of NaNs)
for col in df.columns:
    missing = np.mean(df[col].isnull())
    if missing > 0:
        print(f"{col} - {missing}")
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
a. Handle missing values
b. Feature scaling – scale the values in the column to similar range
c. Outliers / data correction
d. handling categorical data:
i. Encode the data to convert text to number
East = 0, North = 1, South = 2, West = 3
ii. Column Transform into multple columns
iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING – Divide the train and test
”’
import pandas as pd

# load the preprocessing demo dataset from GitHub (network access required)
df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
print(df)
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
a. Handle missing values
b. Feature scaling – scale the values in the column to similar range
c. Outliers / data correction
d. handling categorical data:
i. Encode the data to convert text to number
East = 0, North = 1, South = 2, West = 3
ii. Column Transform into multple columns
iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING –
a. Divide the train and test
b. Run the model
6. EVALUATE THE MODEL:
a. Measure the performance of each algorithm on the test data
b. Metric to compare: based on Regression (MSE, RMSE, R square) or
classification (confusion matrix -accuracy, sensitivity..)
c. select the best performing model
7. DEPLOY THE BEST PERFORMING MODEL
Hypothesis test:
1. Null Hypothesis (H0): starting statement (objective)
Alternate Hypethesis (H1): Alternate of H0
Z or T test:
Chi square test: both are categorical
e.g. North zone: 50 WIN 5 LOSS – p = 0.005
# simple (single value) v composite (specifies range)
# two tailed test v one tailed test [H0: mean = 0,
H1 Left Tailed: mean <0
H1 Right Tailed: mean >0
# level of significance:
alpha value: confidence interval – 95%
p value: p value <0.05 – we reject Null Hypothesis
df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
X = df.iloc[:, :3].values  # first 3 columns = features
y = df.iloc[:, 3].values   # 4th column = target
#print("X: \n")
#print(X)
#print("Y: \n")
#print(y)
# scikit-learn package to perform ML
# install with: pip install scikit-learn (but the import name is sklearn)
# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/
# 1. Replace the missing values with the column mean (columns 1 and 2 only)
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
#print(X)
# 2. Handling categorical values: label-encode column 0 (text -> int codes)
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])
print(X)
df = pd.read_csv("https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv")
X = df.iloc[:, :3].values  # first 3 columns = features
y = df.iloc[:, 3].values   # 4th column = target
#print("X: \n")
#print(X)
#print("Y: \n")
#print(y)
# scikit-learn package to perform ML
# install with: pip install scikit-learn (but the import name is sklearn)
# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/
# 1. Replace the missing values with the column mean
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
#print(X)
# 2. Handle categorical values: label-encode, then one-hot encode column 0
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:, 0] = lc.fit_transform(X[:, 0])
from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = transform.fit_transform(X)
X = X[:, 1:]  # drop one dummy column (avoids the dummy-variable trap)
#print(X)
# 3. Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train)
# 4. Scaling / normalization: fit the scaler on TRAIN only, then reuse its
# statistics on TEST. (The original called fit_transform on the test set,
# which leaks test statistics and scales the two sets inconsistently.)
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:, 3:])
X_test = scale.transform(X_test[:, 3:])
print(X_train)
Regression: Output (Marks) is a continous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that dataset is linear) Regression
X – independent variable(s)
Y – dependent variable
”’
import pandas as pd
import matplotlib.pyplot as plt

link = "https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv"
df = pd.read_csv(link)
X = df.iloc[:, :1].values  # Hours column (kept 2-D for sklearn)
y = df.iloc[:, 1].values   # Marks column (target)
'''
# Preprocessing steps not needed for this dataset (no missing values,
# no categorical columns) - kept commented out for reference.
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)
# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])
from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([('one_hot_encoder', OneHotEncoder(),[0])],remainder='passthrough')
X = transform.fit_transform(X)
X = X[:,1:] # dropped one column
#print(X)
'''
# EDA - Exploratory Data Analysis
plt.scatter(x=df['Hours'], y=df['Marks'])
plt.show()
'''
Scatter plots show the relationship between the X and Y variables. You can have:
1. Positive correlation
2. Negative correlation
3. No correlation
4. Correlation ranges from 0 to +/- 1
5. Correlation values between 0 and +/- 0.5: weak / no correlation
6. A strong correlation value is closer to +/- 1
7. Equation: straight line => y = mx + c
'''
# 3. Split into train and test sets (random_state pins the split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
print(X_train)
'''
# 4. Scaling / Normalization - not needed for a single feature here
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.transform(X_test[:,3:])
print(X_train)
'''
## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit = train the model on the training split
regressor.fit(X_train, y_train)
print(f"M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}")
# e.g. y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406
y_pred = regressor.predict(X_test)
result_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result_df)
# Analyze the output