Jul 12 2023

JUNE 2023 DataScience Tutorial PART- II

ACCESS THE PREVIOUS CONTENT HERE

use employees;

— create table, drop table, alter table

— to read data we use Select

Select * from departments;

insert into departments (DID, HOD, DNAME, DCODE) values (100,’Sachin’,’CSE’,’AA00′);

insert into departments values (101,’MEC’,’Virat’,’AZ00′);

insert into departments values (103,’ECE’,’Rohit’,’KZ00′);

insert into departments values (105,’CIV’,’Dhoni’,’CZ00′);

insert into departments values (106,’TCE’,’Sunil’,’BZ00′);

Select * from employees;

Select @@GLOBAL.secure_file_priv;

— Import data into CSV

LOAD DATA INFILE ‘C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/EmployeesData.csv’

INTO TABLE EMPLOYEES

FIELDS TERMINATED BY ‘,’

ENCLOSED BY ‘”‘

LINES TERMINATED BY ‘\n’

IGNORE 1 ROWS;

— DELETE to remove a row

DELETE FROM EMPLOYEES WHERE EMPID=120;

— Update to modify the existing values in a row

Update Employees

Set Bonus_PCT = 0.45, Salary = 160000

where empid=122;

Select * from employees where empid=122

— SELECT

Select FNAME, EMAIL, PHONE, SALARY From Employees;

Select FNAME, EMAIL, PHONE, SALARY From Employees where salary >=100000;

Select FNAME, EMAIL, PHONE, SALARY From Employees where EMAIL =’ramakrishnavendra@wnbco.co’;

— Create Index to make query faster

Create Index idx_salary

on Employees (Salary)

— Relational operators in MYSQL: = > < >= <= <>

Select FNAME, EMAIL, PHONE, SALARY From Employees where EMAIL <>’ramakrishnavendra@wnbco.co’;

— BETWEEN LIKE IN

DAY 27 Tutorial - DB Session 6 : INSERT & SELECT WITH WHERE

use employees;

select * from employees;

select FNAME, LNAME, DOB, Salary from employees where salary > 75000;

— BETWEEN is used for numbers

select FNAME, LNAME, DOB, Salary from employees where salary BETWEEN 75000 AND 95000;

— below statement is same as above (BETWEEN)

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 AND salary <= 95000;

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 AND Bonus_pct <= 0.4; — 0 to 9 rows

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 OR Bonus_pct <= 0.4; — 25 to 34 rows

select FNAME, LNAME, DOB, Salary from employees where salary >= 75000 — 25 rows

select FNAME, LNAME, DOB, Salary from employees where Bonus_pct <= 0.4; — 9 rows

— LIKE – for text data comparison

Select * from employees where FNAME like ‘A%’

— IN – checking for given set of values

Select * from employees where FNAME IN (‘O%’, ‘A%’,’B%’);

Select * from employees where FNAME IN (‘Anikeet Dey’,’Amol Jain’,’Suresh Narayan Singh Yadav’ );

Select * from employees where SALARY IN (36107, 110266, 107799, 198890);

— Logical operator – AND OR NOT

select * from employees where not( deptid = 100 and salary >100000)

— ORDER BY

select * from employees order by salary DESC;

select * from employees order by salary ASC; — Ascending order is by default

select * from employees order by DEPTID ASC, Salary DESC;

DAY 28 DB Session 7: SELECT - Where - AND OR NOT BETWEEN IN LIKE

use employees;

select * from employees where bonus_pct is not null;

select * from employees order by FNAME;

select * from employees order by FNAME DESC;

UPDATE employees SET BONUS_PCT = NULL WHERE BONUS_PCT > 0.88;

select * from employees where bonus_pct is not null limit 3;

select * from employees where bonus_pct is not null order by salary DESC limit 5;

select avg(salary) as Max_Salary, deptid from employees group by deptid;

select avg(salary), count(salary), sum(salary) from employees;

select * from employees where salary > (select avg(salary) from employees);

— Find highest salary earner for each department

select deptid, MAX(salary) as max_salary FROM employees group by deptid;

select deptid, fname, salary from employees where (Deptid, salary) in

(select deptid, MAX(salary) as max_salary FROM employees group by deptid);

DAY 29 DB Session 8: ORDER BY, GROUP BY, MAX, MIN, AVG, SUM, COUNT

— Questions for Bala:

— 1. Top 5 run scorer

— 2. Top 5 wickets taker

— 3. Top 5 catch takers

— 4. How many matches each team has won and lost

— 5. How many teams won and toss but lost the matches

— 6. How many teams lost by batting first

— 7. How many matches were interrupted by rain

— 8. Top run scorer for each team

— 9. Top wicket takers for each team

— 10. Top catch takers for each team

— ——————– ——–

— Questions for Vinay:

— 1. How much is my daily sales?

— 2. Top 5 contributing product by sales

— 3. Top 5 contributing product by count

— 4. Breakup of Payment mode (card/cash/upi/others)

— 5. Top 5 highest priced products

— 6. Which day of the week do more people tend to partake in online grocery shopping

— 7. Top best selling categories (number of transactions per category)

— 8. Top best selling categories (amount of sales)

— 9. Top 20% categories contribution to % of sales

— 10. What is the average sales for each of the 7 days.

— ——————– ——–

use employees;

select deptid, avg(salary) as “Average Salary” from employees group by deptid;

— COUNT, AVG, SUM, MIN, MAX

— Like: %a, a%, %a% , %b__% (exactly 1 value for _), ‘a%z’

select fname, email, salary from employees;

select fname, email, salary from employees where fname like ‘%ol%’;

— J followed by exactly three characters

select fname, email, salary from employees where fname like ‘%J___’;

— J followed by atleast four characters

select fname, email, salary from employees where fname like ‘%J____%’;

select * from employees where fname in (‘Pankaj Prajapati’,’Manju Mishra’ ,’Arijeet Dasgupta’);

select * from employees where fname in (select fname from employees where fname like ‘%J____%’);

select * from employees where (fname,salary) in (select fname,salary from employees where fname like ‘%J____%’);

select fname, email, salary from employees where fname like ‘%dey’;

DAY 30 DB Session 9: Database Project Discussion, GROUP BY, LIKE, IN, SUB QUERY

— Questions for Karan

— How many books are currently issued

— Who has read the maximum number of books

— Is there a member who has not issued a book so far

— How many books have been issued for more than a month

— List of books with more than 2000 rupees cost

— List of Books older than 20 years

— The most read books of the library

— The most read genre of library

— Average books that are issued per day

— How many readers have not returned the book on time so far

— —————- —-

— combining data from 2 tables

use employees;

select * from employees;

select employees.FNAME, DID, departments.DNAME from employees, departments

select employees.FNAME, DID, departments.DNAME from employees, departments where employees.deptid = departments.did;

insert into departments values (107, ‘BIOT’,’Kapil’,’BB09′)

insert into employees (EMPID, FNAME, LNAME, DOB, EMAIL,PHONE, DOJ,SALARY)

values (133,’Sachin Tendulkar’,’T’,’1990-05-04′,’sachin@wnbco.co’,9999000009,’2023-05-15′,231456);

select * from departments;

select employees.FNAME, DID, departments.DNAME from employees, departments where employees.deptid = departments.did;

— JOINS

use employees;

select * from employees;

select * from departments;

select fname, hod from employees, departments where departments.did = employees.DEPTID; — Inner

select fname, hod from employees, departments where employees.DEPTID = departments.did order by empid;

select fname, hod from employees INNER JOIN departments ON employees.DEPTID = departments.did order by empid;

SELECT T1.COL1, T2.COL2, T3.COL3

FROM (((T1 INNER JOIN T2 ON T1.K1 = T2.K1) INNER JOIN T3 ON T2.K2 = T3.K2) INNER JOIN T4 ON T3.K3 = T4.K3);

select fname, hod from employees LEFT JOIN departments ON employees.DEPTID = departments.did order by empid;

DAY 31 - DB SESSION 10 Tutorial day: JOINS

use employees;

select fname, hod from employees left join departments on employees.DEPTID = departments.did

UNION

select fname, hod from departments left join employees on employees.DEPTID = departments.did;

select fname, hod from departments inner join employees on employees.DEPTID = departments.did;

select * from employees;

select sum(EMPID) from employees;

select count(EMPID) from employees;

select count(EMPID) from employees where salary > 50000;

select count(EMPID) , DEPTID from employees

group by deptid

having count(EMPID) > 5;

select count(fname), hod from employees left join departments on employees.DEPTID = departments.did

group by deptid

having count(fname) > 3;

select * from employees;

— Salary_Grade: <50K: E 50-100K: D 100-150K – C 150-200-B >200K: A

select fname, salary, bonus_pct,

case when bonus_pct is null then salary

else salary+salary*bonus_pct

end as total_salary from employees;

select fname, salary,

CASE

when salary < 50000 Then ‘Grade: E’

when salary > 200000 Then ‘Grade: A’

when salary > 150000 Then ‘Grade: B’

when salary > 100000 Then ‘Grade: C’

Else ‘Grade: D’

End As Salary_Grade

from employees;

DAY 32 - DB SESSION 11 : JOINS, HAVING & CASE

use employees;

select * from employees;

select FNAME, DOB, DOJ, datediff(doj,dob) from employees;

select fname, NOW() from employees;

select now(); — current date and time (system)

select date(now());

select curdate();

— default date format in MYSQL is YYYY-MM-DD

— DD-MM-YYYY

select date_format(curdate(), ‘%d-%m-%Y’) as Date;

select curdate() as todays_date, date_add(curdate(),interval 1 Day) as Tomorrow,

date_add(curdate(),interval 1 Week) as next_week,

date_add(curdate(),interval 1 Month) as Next_month,

date_add(curdate(),interval 1 Year) as Next_year;

— Add date: Interval, day, week or month or year;

select curdate() as todays_date, date_sub(curdate(),interval 1 Day) as Yesterday,

date_sub(curdate(),interval 1 Week) as last_week,

date_sub(curdate(),interval 1 Month) as last_month,

date_sub(curdate(),interval 1 Year) as last_year;

select day(curdate()) as day, month(curdate()) as Month, quarter(curdate()) as Quarter, Year(curdate()) as Year,

Weekday(curdate()) as Weekday, week(curdate()) as week, weekofyear(curdate()) as WeekofYear;

select * from employees where salary > (select salary from employees where empid = 104);

— create a stored procedure

DELIMITER $$

Create Procedure GetEmpInfo()

Begin

select * from employees where salary > (select salary from employees where empid = 104);

End $$

DELIMITER ;

call GetEmpInfo();

drop Procedure GetEmpInfo;

— Features: exception handling, conditions and loops (While, Repeat),

— Creating a View

create view EmpData

Select EMPID, FNAME, LNAME, DOB, EMAIL, PHONE, DOJ, DEPTID from Employees;

select * from empdata;

— String functions: LTRIM, RTRIM, Replace, substring, COncat

select concat(fname,’.’,lname,’@wnbco.co’) from employees;

select fname, substring(fname, 3), substring(fname, -3) from employees;

DAY 33 - DB SESSION 12: DATE FUNCTIONS, STRING FUNCTIONS, PROCEDURES, VIEWS

import pymysql
con_str = pymysql.connect(host=“localhost”, user=“root”, password=“learnSQL”,database=“employees”)
cursor = con_str.cursor()

q1 = ”’Create Table PY_EMP (
EMPID INT PRIMARY KEY,
NAME VARCHAR(30),
CITY VARCHAR(15))
”’
#cursor.execute(q1)

q2 = ”’INSERT INTO PY_EMP VALUES(1,’Laxman’,’Hyderabad’)”’
cursor.execute(q2)
q2 = ”’INSERT INTO PY_EMP VALUES(2,’Rahul’,’Bangalore’)”’
cursor.execute(q2)


q3 = ”’Select * from Py_Emp”’
cursor.execute(q3)
results = cursor.fetchall()
for row in results:
    print(row)

con_str.commit()

con_str.close()

DAY 34 -PYTHON TUTORIAL: Working with Database from Python

# NUMPY
# pip install numpy
import numpy as np
nums = range(16)
nums = np.reshape(nums,(8,2))
print(nums)
nums = np.reshape(nums,(4,4))
print(nums)
print(“Shape: Rows = “,nums.shape[0], “and columns = “,nums.shape[1])
# indexing
print(nums[1,2], nums[-3,-2])
print(nums[1])  # 2nd row
print(nums[:,1])   #   : rows from 0th to (n-1)th
print(nums[-1], nums[:,-2], nums[-1,-2])

# to give your own set of values, you need to provide in terms of list
l1 = [[1,5,7],[2,4,9],[1,1,3],[3,3,2]]
# array is a function to convert list into numpy
mat1 = np.array(l1)
print(mat1)

print(np.zeros((3,3)))
print(np.ones((3,3)))
print(np.full((5,7),2.0))
print(np.full((5,7),9))

# eye – identity matrix: square matrix with 1 on its main diagonal
mat1 = np.eye(5)
print(mat1)

DAY 35 - PYTHON TUTORIAL: NUMPY - Part 1

# NUMPY
import numpy as np
# to give your own set of values, you need to provide in terms of list
l1 = [[1,5,7],[2,4,9],[1,1,3],[3,3,2]]
# array is a function to convert list into numpy
mat1 = np.array(l1)   # 4 * 3 – shape
print(mat1)
l2 = [[2,3,4],[2,1,2],[5,2,3],[3,2,2]]
# array is a function to convert list into numpy
mat2 = np.array(l2)
print(mat2)

# Matrices operations
print(mat1 + mat2)
print(np.add(mat1, mat2))

print(mat1 – mat2)
print(np.subtract(mat1, mat2))

print(mat1 * mat2)
print(np.multiply(mat1, mat2))

print(mat1 / mat2)
print(np.divide(mat1, mat2))

# actual matrix multiplication is done using matmul()
l3 = [[2,3,4],[2,1,2],[5,2,3]]
# array is a function to convert list into numpy
mat3 = np.array(l3)
print(mat3)
print(“Matrix Multiplication”)
print(np.matmul(mat1, mat3))
print(mat1 @ mat3)
## calculating determinant

l4 = [[1,3,5],[1,3,1],[2,3,4]]
mat5 = np.array(l4)
det_mat5 = np.linalg.det(mat5)
print(“Determinant of matrix 5 is”,det_mat5)
print(“Inverse of matrix 5 is: \n“,np.linalg.inv(mat5))

”’
Linear Algebra Equation:
x1 + 5×2 = 7
-2×1  – 7×2 = -5

x1 = -8, x2= 3, 
”’
coeff_mat = np.array([[1,5],[-2,-7]])
#var_mat = np.array([[x1],[x2]])
result_mat = np.array([[7],[-5]])
# equation here is  coeff_mat * var_mat = result_mat   [eg: 5 * x = 10]
# which is, var_mat = coeff_mat inv * result_mat
det_coeff_mat = np.linalg.det(coeff_mat)
if det_coeff_mat !=0:
    var_mat = np.linalg.inv(coeff_mat) @ result_mat
    print(“X1 = “,var_mat[0,0])
    print(“X2 = “,var_mat[1,0])
else:
    print(“Solution is not possible”)

# # scipy = scientific python
# pip install scipy
”’
#Inequality = OPTIMIZATION or MAXIMIZATION / MINIMIZATION PROBLEM
Computer Parts Assembly:
Laptops & Desktops
profit: 1000, 600
objective: either maximize profit or minimize cost

constraints:
1. Demand:   500, 600
2. Parts:  Memory card: 5000 cards available
3. Manpower:  25000 minutes


”’

DAY 36 - PYTHON NUMPY - Part 2 & SCIPY - Part 1

”’
Optimization using Scipy
let’s assume d = desktop, n = notebooks

Constraints:
1. d + n <= 10000
2. 2d + n <= 15000
3. 3d + 4n <= 25000

profit: 1000 d + 750 n => maximize
-1000d – 750 n =>minimize

”’
import numpy as np
from scipy.optimize import minimize, linprog
d = 1
n = 1
profit_d = 1000
profit_n = 750
profit = d * profit_d + n * profit_n
obj = [-profit_d, -profit_n]
lhs_con = [[1,1],[2,1],[3,4]]
rhs_con = [10000, 15000, 25000]

boundary = [(0, float(“inf”)),   # boundary condition for # of desktops
            (10, 200000)]   # we just added some limit for notebooks
opt = linprog(c=obj, A_ub=lhs_con, b_ub=rhs_con, bounds=boundary, method=“revised simplex”)
print(opt)
if opt.success:
    print(f”Number of desktops = {opt.x[0]} and number of laptops = {opt.x[1]}“)
    print(“Maximum profit that can be generated = “,-1 * opt.fun)
else:
    print(“Solution can not be generated”)

### ### ### PANDAS
# Pandas – dataframe which resembles Table structure
# pip install pandas
import pandas as pd
df1 = pd.DataFrame()
print(df1)
print(type(df1))

# fruit production
data = [[“Apple”, 15000, 11000,6000],
        [“Banana”, 18000,22000,29000],
        [“Mango”, 2, 900, 19000],
        [“Guava”, 19000,11000,25000]]

fruit_production = pd.DataFrame(data)
print(fruit_production)
print(“Slicing 1:\n“)
print(fruit_production.iloc[1:3,2:])   #based on index
print(“Slicing 2:\n“)
print(fruit_production.loc[1:3,2:]) #based on title(names)

fruit_production = pd.DataFrame(data,
                                columns=[“Fruits”,“January”,“February”,“March”])
print(fruit_production)

fruit_production = pd.DataFrame(data,
                                columns=[“Fruits”,“January”,“February”,“March”],
                                index=[“Fruit 1”,“Fruit 2”,“Fruit 3”,“Fruit 4”])
print(fruit_production)

## dataframe.loc()  dataframe.iloc()

print(“Slicing 1:\n“)
print(fruit_production.iloc[1:3,2:])   #based on index
print(“Slicing 2:\n“)
print(fruit_production.loc[[“Fruit 2”, “Fruit 3”],[“February”,“March”]]) #based on title(names)

### ###

DAY 37 - PYTHON SCIPY & PANDAS

# pandas
# pip install pandas
import pandas as pd
l1 = [10,20,30,40,50]
l1 = [[“Sachin”,101,20000,“BATSMAN”],[“Kapil”,501,12000,“BOWLER”],
      [“Sunil”,12,21000,“BATSMAN”],[“Zaheer”,725,2000,“BOWLER”]]
df1 = pd.DataFrame(l1,columns=[“Player”,“Wickets”,“Runs”,“Type”],
                   index=[“Player 1”,“Player 2”,“Player 3”,“Player 4”])
print(df1)

d1 = {‘Apple’:[12000,11000,13000],
      ‘Banana’: [17000,18000,19000],
      ‘Mango’:[11000,13000,15000]}
df2 = pd.DataFrame(d1)
print(df2)

# creating dataframe from list of dictionary
data1 = [{“Guava”:9000, “Oranges”: 5000},
         {“Guava”:8000, “Oranges”: 7000},
         {“Guava”:10000, “Oranges”: 6000}]
df3 = pd.DataFrame(data1)
print(df3)

print(df3.iloc[0,:])  #first row and all column values
print(df3.iloc[:,0])

print(df2.iloc[:,0:2])
print(df2.iloc[[0,2],[0,2]])

#
print(df2.loc[[0,2],[“Apple”,“Mango”]])
print(df1.loc[[“Player 1”,“Player 4”],[“Player”,“Runs”]])

df2.iloc[2,0] = 14000
print(df2)
print(“========= DF1 =============”)
df1[‘Avg’] = df1[‘Runs’] / df1[“Wickets”]
print(df1)
print(“Reading data from DF1: “)
df4 = df1[df1.Player !=‘Sachin’]  #filter where clause
print(“\n\n New dataset without Sachin: \n“, df4)
df1 = df1.drop(“Player”,axis=1)  # axis default is 0
# unlike pop() and del – drop() returns a new dataframe
print(df1)


print(“Average Wickets of all the players = “,df1[‘Wickets’].mean())
print(“Average Wickets of players by type = \n\n“,df1.groupby(‘Type’).mean())
# axis = 0 refers to rows
# axis = 1 refers to columns

print(“\n\nDropping columns from DF1: “)
del df1[‘Wickets’]  #dropping column Wickets using del
print(df1)

df1.pop(‘Runs’) #dropping column using pop
print(df1)
# 

DAY 38: PYTHON PANDAS INTRODUCTION

import pandas as pd

ud_df = pd.read_csv(“D:/datasets/gitdataset/user_device.csv”)
print(ud_df)   # 272 rows x 6 columns
print(“Rows: “,ud_df.shape[0])
print(“Columns: “,ud_df.shape[1])

print(ud_df.tail(1))
print(ud_df.head(1))

use_df = pd.read_csv(“D:/datasets/gitdataset/user_usage.csv”)
print(use_df)  # 240 rows x 4 columns

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
                             ‘outgoing_mins_per_month’]], ud_df,
                     on=‘use_id’)
print(result_df)   # [159 rows x 9 columns] = ud_df: 159 +  113, use_df = 159 + 81

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
                             ‘outgoing_mins_per_month’]], ud_df,
                     on=‘use_id’, how=‘outer’)
print(result_df)

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
                             ‘outgoing_mins_per_month’]], ud_df,
                     on=‘use_id’, how=‘left’)
print(result_df)

result_df = pd.merge(use_df[[‘use_id’,‘monthly_mb’,‘outgoing_sms_per_month’,
                             ‘outgoing_mins_per_month’]], ud_df,
                     on=‘use_id’, how=‘right’)
print(result_df)

## Working with Pandas – Example  ##
import pandas as pd
import numpy as np
df = pd.read_csv(“D:/datasets/gitdataset/hotel_bookings.csv”)
print(df.shape)
print(df.dtypes)
”’
numeric – int, float
categorical – 1) Nominal – there is no order  2) Ordinal – here order is imp
”’
df_numeric = df.select_dtypes(include=[np.number])
print(df_numeric)

df_object= df.select_dtypes(exclude=[np.number])
print(df_object)  # categorical and date columns

print(df.columns)
for col in df.columns:
    missing = np.mean(df[col].isnull())
    if missing >0:
        print(f”{col} – {missing}“)

”’
Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
    a. Handle missing values
    b. Feature scaling – scale the values in the column to similar range
    c. Outliers / data correction
    d. handling categorical data:
        i. Encode the data to convert text to number
            East = 0, North = 1, South = 2, West = 3
        ii. Column Transform into multple columns
        iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING – Divide the train and test


”’
import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
print(df)

DAY 42: Introduction to data preprocessing for Machine Learning

Phases:
1. Business objective
2. Collect the relevant data
3. Preprocessing – making data ready for use
    a. Handle missing values
    b. Feature scaling – scale the values in the column to similar range
    c. Outliers / data correction
    d. handling categorical data:
        i. Encode the data to convert text to number
            East = 0, North = 1, South = 2, West = 3
        ii. Column Transform into multple columns
        iii. Delete any one column
4. EDA- Exploratory Data Analysis: to understand the data
5. MODEL BUILDING –
    a. Divide the train and test
    b. Run the model
6. EVALUATE THE MODEL:
    a. Measure the performance of each algorithm on the test data
    b. Metric to compare: based on Regression (MSE, RMSE, R square) or
        classification (confusion matrix -accuracy, sensitivity..)
    c. select the best performing model
7. DEPLOY THE BEST PERFORMING MODEL

Hypothesis test:
1. Null Hypothesis (H0): starting statement (objective)
    Alternate Hypethesis (H1): Alternate of H0

Z or T test:
Chi square test: both are categorical

e.g. North zone:  50 WIN 5 LOSS – p = 0.005

# simple (single value) v composite (specifies range)
# two tailed test v one tailed test [H0: mean = 0,
                                    H1 Left Tailed: mean <0
                                    H1 Right Tailed: mean >0
# level of significance:
    alpha value: confidence interval – 95%
    p value: p value <0.05 – we reject Null Hypothesis

DAY 43: Introduction to Model building and Model Evaluation

import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print(“X: \n”)
#print(X)
#print(“Y: \n”)
#print(y)

# scikit-learn package to perform ML
# install the package by: pip install scikit-learn
# but when you import, its sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=‘mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])
print(X)

DAY 44: Example for Handling missing values and Introduction to handling Categorical values

import pandas as pd
df = pd.read_csv(“https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/1_Data_PreProcessing.csv”)
X = df.iloc[:,:3].values
y = df.iloc[:,3].values
#print(“X: \n”)
#print(X)
#print(“Y: \n”)
#print(y)

# scikit-learn package to perform ML
# install the package by:  pip install scikit-learn
# but when you import, its sklearn

# Complete tutorial on sklearn:
# https://scikit-learn.org/stable/

# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=‘mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=‘passthrough’)
X=transform.fit_transform(X)
X = X[:,1:]  # dropped one column
#print(X)

# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
print(X_train)
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)

DAY 45: Example for handling Categorical values, Scaling / Normalization and Training Test data split

”’
Regression: Output (Marks) is a continous variable
Algorithm: Simple (as it has only 1 X column) Linear (assuming that dataset is linear) Regression
X – independent variable(s)
Y – dependent variable
”’
import pandas as pd
import matplotlib.pyplot as plt
link = “https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv”
df = pd.read_csv(link)
X = df.iloc[:,:1].values
y = df.iloc[:,1].values

”’
# 1. Replace the missing values with mean value
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy=’mean’)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
#print(X)

# 2. Handling categorical values
# encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lc = LabelEncoder()
X[:,0] = lc.fit_transform(X[:,0])

from sklearn.compose import ColumnTransformer
transform = ColumnTransformer([(‘one_hot_encoder’, OneHotEncoder(),[0])],remainder=’passthrough’)
X=transform.fit_transform(X)
X = X[:,1:]  # dropped one column
#print(X)
”’

# EDA – Exploratory Data Analysis
plt.scatter(x=df[‘Hours’],y=df[‘Marks’])
plt.show()
”’
Scatter plots – shows relationship between X and Y variables. You can have:
 1. Positive correlation: 
 2. Negative correlation:
 3. No Correlation
 4. Correlation:  0 to +/- 1 
 5. Correlation value: 0 to +/- 0.5 : no correlation
 6. Strong correlation value will be closer to +/- 1
 7. Equation: straight line =>   y = mx + c
”’
# 3. splitting it into train and test test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)
print(X_train)

”’
# 4. Scaling / Normalization
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train[:,3:])
X_test = scale.fit_transform(X_test[:,3:])
print(X_train)
”’

## RUN THE MODEL
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit – train the model
regressor.fit(X_train, y_train)
print(f”M/Coefficient/Slope = {regressor.coef_} and the Constant = {regressor.intercept_}“)

# y = 7.5709072 X + 20.1999196152844
# M/Coefficient/Slope = [7.49202113] and the Constant = 21.593606679699406

y_pred = regressor.predict(X_test)
result_df =pd.DataFrame({‘Actual’: y_test, ‘Predicted’: y_pred})
print(result_df)

# Analyze the output

Dataset: https://raw.githubusercontent.com/swapnilsaurav/MachineLearning/master/2_Marks_Data.csv

DAY 46: Simple Linear Regression using Python

CLICK HERE TO ACCESS PART 3 OF THE CONTENT

21

Share on Facebook

admin 0 Comments

JUNE 2023 DataScience Tutorial PART- II

ACCESS THE PREVIOUS CONTENT HERE

DAY 27 Tutorial - DB Session 6 : INSERT & SELECT WITH WHERE

DAY 28 DB Session 7: SELECT - Where - AND OR NOT BETWEEN IN LIKE

DAY 29 DB Session 8: ORDER BY, GROUP BY, MAX, MIN, AVG, SUM, COUNT

DAY 30 DB Session 9: Database Project Discussion, GROUP BY, LIKE, IN, SUB QUERY

DAY 31 - DB SESSION 10 Tutorial day: JOINS

DAY 32 - DB SESSION 11 : JOINS, HAVING & CASE

DAY 33 - DB SESSION 12: DATE FUNCTIONS, STRING FUNCTIONS, PROCEDURES, VIEWS

DAY 34 -PYTHON TUTORIAL: Working with Database from Python

DAY 35 - PYTHON TUTORIAL: NUMPY - Part 1

DAY 36 - PYTHON NUMPY - Part 2 & SCIPY - Part 1

DAY 37 - PYTHON SCIPY & PANDAS

DAY 38: PYTHON PANDAS INTRODUCTION

DAY 39: PYTHON PANDAS MERGE EXAMPLES

DAY 40: Presentation - Introduction to Machine Learning

DAY 41: Presentation - Machine Learning Algorithms and Examples

DAY 42: Introduction to data preprocessing for Machine Learning

DAY 43: Introduction to Model building and Model Evaluation

DAY 44: Example for Handling missing values and Introduction to handling Categorical values

DAY 45: Example for handling Categorical values, Scaling / Normalization and Training Test data split

DAY 46: Simple Linear Regression using Python

CLICK HERE TO ACCESS PART 3 OF THE CONTENT

21

SHARES

Leave a Reply Cancel reply

About Us

Quick Links

Services

Contact Us