Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Read Data

ibm = pd.read_csv('/WA_Fn-UseC_-HR-Employee-Attrition.csv')

pd.set_option('display.max_columns', None)

Dateset Information

ibm.shape

(1470, 35)

ibm.describe()

	Age	DailyRate	DistanceFromHome	Education	EmployeeCount	EmployeeNumber	EnvironmentSatisfaction	HourlyRate	JobInvolvement	JobLevel	JobSatisfaction	MonthlyIncome	MonthlyRate	NumCompaniesWorked	PercentSalaryHike	PerformanceRating	RelationshipSatisfaction	StandardHours	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
count	1470.000000	1470.000000	1470.000000	1470.000000	1470.0	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.0	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000	1470.000000
mean	36.923810	802.485714	9.192517	2.912925	1.0	1024.865306	2.721769	65.891156	2.729932	2.063946	2.728571	6502.931293	14313.103401	2.693197	15.209524	3.153741	2.712245	80.0	0.793878	11.279592	2.799320	2.761224	7.008163	4.229252	2.187755	4.123129
std	9.135373	403.509100	8.106864	1.024165	0.0	602.024335	1.093082	20.329428	0.711561	1.106940	1.102846	4707.956783	7117.786044	2.498009	3.659938	0.360824	1.081209	0.0	0.852077	7.780782	1.289271	0.706476	6.126525	3.623137	3.222430	3.568136
min	18.000000	102.000000	1.000000	1.000000	1.0	1.000000	1.000000	30.000000	1.000000	1.000000	1.000000	1009.000000	2094.000000	0.000000	11.000000	3.000000	1.000000	80.0	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000
25%	30.000000	465.000000	2.000000	2.000000	1.0	491.250000	2.000000	48.000000	2.000000	1.000000	2.000000	2911.000000	8047.000000	1.000000	12.000000	3.000000	2.000000	80.0	0.000000	6.000000	2.000000	2.000000	3.000000	2.000000	0.000000	2.000000
50%	36.000000	802.000000	7.000000	3.000000	1.0	1020.500000	3.000000	66.000000	3.000000	2.000000	3.000000	4919.000000	14235.500000	2.000000	14.000000	3.000000	3.000000	80.0	1.000000	10.000000	3.000000	3.000000	5.000000	3.000000	1.000000	3.000000
75%	43.000000	1157.000000	14.000000	4.000000	1.0	1555.750000	4.000000	83.750000	3.000000	3.000000	4.000000	8379.000000	20461.500000	4.000000	18.000000	3.000000	4.000000	80.0	1.000000	15.000000	3.000000	3.000000	9.000000	7.000000	3.000000	7.000000
max	60.000000	1499.000000	29.000000	5.000000	1.0	2068.000000	4.000000	100.000000	4.000000	5.000000	4.000000	19999.000000	26999.000000	9.000000	25.000000	4.000000	4.000000	80.0	3.000000	40.000000	6.000000	4.000000	40.000000	18.000000	15.000000	17.000000

import statistics
for i in ibm.columns:
    print(i, " mode: ", statistics.mode(ibm[i]));

Age  mode:  35
Attrition  mode:  No
BusinessTravel  mode:  Travel_Rarely
DailyRate  mode:  691
Department  mode:  Research & Development
DistanceFromHome  mode:  2
Education  mode:  3
EducationField  mode:  Life Sciences
EmployeeCount  mode:  1
EmployeeNumber  mode:  1
EnvironmentSatisfaction  mode:  3
Gender  mode:  Male
HourlyRate  mode:  66
JobInvolvement  mode:  3
JobLevel  mode:  1
JobRole  mode:  Sales Executive
JobSatisfaction  mode:  4
MaritalStatus  mode:  Married
MonthlyIncome  mode:  2342
MonthlyRate  mode:  9150
NumCompaniesWorked  mode:  1
Over18  mode:  Y
OverTime  mode:  No
PercentSalaryHike  mode:  11
PerformanceRating  mode:  3
RelationshipSatisfaction  mode:  3
StandardHours  mode:  80
StockOptionLevel  mode:  0
TotalWorkingYears  mode:  10
TrainingTimesLastYear  mode:  2
WorkLifeBalance  mode:  3
YearsAtCompany  mode:  5
YearsInCurrentRole  mode:  2
YearsSinceLastPromotion  mode:  0
YearsWithCurrManager  mode:  2

ibm.info()

&lt;class 'pandas.core.frame.DataFrame'&gt;
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB

Data Preprocessing

ibm.drop(columns = 'EmployeeCount', inplace = True)
ibm.drop(columns = 'EmployeeNumber', inplace = True)
ibm.drop(columns = 'Over18', inplace = True)
ibm.drop(columns = 'StandardHours', inplace = True)

ibm.drop_duplicates()

	Age	Attrition	BusinessTravel	DailyRate	Department	DistanceFromHome	Education	EducationField	EnvironmentSatisfaction	Gender	HourlyRate	JobInvolvement	JobLevel	JobRole	JobSatisfaction	MaritalStatus	MonthlyIncome	MonthlyRate	NumCompaniesWorked	OverTime	PercentSalaryHike	PerformanceRating	RelationshipSatisfaction	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
0	41	Yes	Travel_Rarely	1102	Sales	1	2	Life Sciences	2	Female	94	3	2	Sales Executive	4	Single	5993	19479	8	Yes	11	3	1	0	8	0	1	6	4	0	5
1	49	No	Travel_Frequently	279	Research & Development	8	1	Life Sciences	3	Male	61	2	2	Research Scientist	2	Married	5130	24907	1	No	23	4	4	1	10	3	3	10	7	1	7
2	37	Yes	Travel_Rarely	1373	Research & Development	2	2	Other	4	Male	92	2	1	Laboratory Technician	3	Single	2090	2396	6	Yes	15	3	2	0	7	3	3	0	0	0	0
3	33	No	Travel_Frequently	1392	Research & Development	3	4	Life Sciences	4	Female	56	3	1	Research Scientist	3	Married	2909	23159	1	Yes	11	3	3	0	8	3	3	8	7	3	0
4	27	No	Travel_Rarely	591	Research & Development	2	1	Medical	1	Male	40	3	1	Laboratory Technician	2	Married	3468	16632	9	No	12	3	4	1	6	3	3	2	2	2	2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1465	36	No	Travel_Frequently	884	Research & Development	23	2	Medical	3	Male	41	4	2	Laboratory Technician	4	Married	2571	12290	4	No	17	3	3	1	17	3	3	5	2	0	3
1466	39	No	Travel_Rarely	613	Research & Development	6	1	Medical	4	Male	42	2	3	Healthcare Representative	1	Married	9991	21457	4	No	15	3	1	1	9	5	3	7	7	1	7
1467	27	No	Travel_Rarely	155	Research & Development	4	3	Life Sciences	2	Male	87	4	2	Manufacturing Director	2	Married	6142	5174	1	Yes	20	4	2	1	6	0	3	6	2	0	3
1468	49	No	Travel_Frequently	1023	Sales	2	3	Medical	4	Male	63	2	2	Sales Executive	2	Married	5390	13243	2	No	14	3	4	0	17	3	2	9	6	0	8
1469	34	No	Travel_Rarely	628	Research & Development	8	3	Medical	2	Male	82	4	2	Laboratory Technician	3	Married	4404	10228	2	No	12	3	1	0	6	3	4	4	3	1	2

1470 rows × 31 columns

ibm.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

# replace Attrition (0 - No, 1 - Yes)
ibm.replace({'Attrition' : {'Yes': 1, 'No': 0}}, inplace = True)

# replace BusinessTravel (0 - Non-Travel, 1 - Travel_Rarely, 2 - Travel_Frequently)
ibm.replace({'BusinessTravel' : {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}}, inplace = True)

#Department
dummy = pd.get_dummies(ibm['Department'])
ibm.insert(5,'Dp_Sales&Development', dummy['Research & Development'])
ibm.insert(6,'Dp_Sales', dummy['Sales'])
ibm.insert(7,'Dp_HumanResources', dummy['Human Resources'])

ibm.drop(columns = 'Department', inplace = True)

#EducationField
dummy = pd.get_dummies(ibm['EducationField'])
ibm.insert(11,'EF_Life Sciences',dummy['Life Sciences'])
ibm.insert(12,'EF_Medical',dummy['Medical'])
ibm.insert(13,'EF_Marketing',dummy['Marketing'])
ibm.insert(14,'EF_TechnicalDegree',dummy['Technical Degree'])
ibm.insert(15,'EF_HumanResources',dummy['Human Resources'])
ibm.insert(16,'EF_Other',dummy['Other'])

ibm.drop(columns = 'EducationField', inplace = True)

# replace Gender (0 - Male; 1 - Female)
ibm.replace({'Gender': {'Male': 0, 'Female': 1}}, inplace = True)

# Job role dummy variables
dummy=pd.get_dummies(ibm['JobRole'])
ibm.insert(23, 'JR_HealthcareRepresentive', dummy['Healthcare Representative'])
ibm.insert(24, 'JR_HumanResource', dummy['Human Resources'])
ibm.insert(25, 'JR_LaboratoryTechnician', dummy['Laboratory Technician'])
ibm.insert(26, 'JR_Manager', dummy['Manager'])
ibm.insert(27, 'JR_ManufacturingDirector', dummy['Manufacturing Director'])
ibm.insert(28, 'JR_ResearchDirector', dummy['Research Director'])
ibm.insert(29, 'JR_ResearchScientist', dummy['Research Scientist'])
ibm.insert(30, 'JR_SalesExecutive', dummy['Sales Executive'])
ibm.insert(31, 'JR_SalesRepresentative', dummy['Sales Representative'])

ibm.drop(columns = 'JobRole', inplace = True)

# MaritalStatus role dummy variables
dummy=pd.get_dummies(ibm['MaritalStatus'])
ibm.insert(34, 'MS_Married', dummy['Married'])
ibm.insert(35, 'MS_Single', dummy['Single'])
ibm.insert(36, 'MS_Divorced', dummy['Divorced'])

ibm.drop(columns = 'MaritalStatus', inplace = True)

# replace Overtime (0 - No; 1 - Yes)
ibm.replace({'OverTime': {'No': 0, 'Yes': 1}}, inplace = True)

# replace Over18 (0 - N; 1 - Y)
ibm.replace({'Over18': {'N': 0, 'Y': 1}}, inplace = True)

def iqr_outliers(data):
    out=[]
    
    firstQuartile = data.quantile(0.25)
    thirdQuartile = data.quantile(0.75)
    
    iqr = thirdQuartile-firstQuartile
    
    Lower_bound = firstQuartile - 1.5 * iqr
    Upper_bound = thirdQuartile + 1.5 * iqr
    
    for i in data:
        if i &gt; Upper_bound or i &lt; Lower_bound:
            out.append(i)
            
    print("Outliers:",out , "\nCount: ", len(out), "\n")
    return out

for c_name in ibm.columns:
    print (c_name)
    iqr_outliers(ibm[c_name])

Age
Outliers: [] 
Count:  0 

Attrition
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  237 

BusinessTravel
Outliers: [2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2] 
Count:  427 

DailyRate
Outliers: [] 
Count:  0 

Dp_Sales&Development
Outliers: [] 
Count:  0 

Dp_Sales
Outliers: [] 
Count:  0 

Dp_HumanResources
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  63 

DistanceFromHome
Outliers: [] 
Count:  0 

Education
Outliers: [] 
Count:  0 

EnvironmentSatisfaction
Outliers: [] 
Count:  0 

EF_Life Sciences
Outliers: [] 
Count:  0 

EF_Medical
Outliers: [] 
Count:  0 

EF_Marketing
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  159 

EF_TechnicalDegree
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  132 

EF_HumanResources
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  27 

EF_Other
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  82 

Gender
Outliers: [] 
Count:  0 

HourlyRate
Outliers: [] 
Count:  0 

JobInvolvement
Outliers: [] 
Count:  0 

JobLevel
Outliers: [] 
Count:  0 

JobSatisfaction
Outliers: [] 
Count:  0 

JR_HealthcareRepresentive
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  131 

JR_HumanResource
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  52 

JR_LaboratoryTechnician
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  259 

JR_Manager
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  102 

JR_ManufacturingDirector
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  145 

JR_ResearchDirector
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  80 

JR_ResearchScientist
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  292 

JR_SalesExecutive
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  326 

JR_SalesRepresentative
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  83 

MonthlyIncome
Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880] 
Count:  114 

MonthlyRate
Outliers: [] 
Count:  0 

NumCompaniesWorked
Outliers: [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9] 
Count:  52 

MS_Married
Outliers: [] 
Count:  0 

MS_Single
Outliers: [] 
Count:  0 

MS_Divorced
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  327 

OverTime
Outliers: [] 
Count:  0 

PercentSalaryHike
Outliers: [] 
Count:  0 

PerformanceRating
Outliers: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] 
Count:  226 

RelationshipSatisfaction
Outliers: [] 
Count:  0 

StockOptionLevel
Outliers: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 
Count:  85 

TotalWorkingYears
Outliers: [31, 29, 37, 38, 30, 40, 36, 34, 32, 33, 37, 30, 36, 31, 33, 32, 37, 31, 32, 32, 30, 34, 30, 40, 29, 35, 31, 33, 31, 29, 32, 30, 33, 30, 29, 31, 32, 33, 36, 34, 31, 36, 33, 31, 29, 33, 29, 32, 31, 35, 29, 32, 34, 36, 32, 30, 36, 29, 34, 37, 29, 29, 35] 
Count:  63 

TrainingTimesLastYear
Outliers: [0, 5, 5, 5, 6, 5, 5, 5, 6, 6, 0, 0, 0, 5, 0, 5, 5, 5, 6, 6, 5, 0, 6, 5, 5, 0, 5, 5, 6, 5, 5, 5, 0, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 0, 0, 5, 5, 5, 6, 6, 5, 0, 5, 0, 5, 5, 0, 6, 0, 5, 5, 6, 6, 5, 6, 5, 0, 5, 5, 5, 5, 0, 6, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 0, 5, 0, 5, 5, 6, 5, 6, 5, 0, 5, 5, 0, 6, 6, 5, 6, 0, 5, 0, 6, 6, 6, 6, 5, 5, 0, 5, 0, 0, 6, 0, 6, 5, 6, 5, 5, 0, 5, 6, 6, 5, 5, 0, 0, 6, 0, 0, 5, 0, 5, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6, 0, 6, 6, 5, 5, 0, 0, 6, 6, 0, 5, 0, 0, 0, 0, 0, 5, 5, 6, 5, 5, 0, 5, 5, 0, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 0, 0, 5, 5, 5, 5, 6, 0, 0, 6, 6, 6, 6, 5, 5, 5, 6, 5, 0, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 0, 5, 5, 5, 5, 5, 0, 0, 0, 6, 5, 6, 6, 5, 6, 0, 6, 6, 5, 6, 6, 5, 5, 5, 0] 
Count:  238 

WorkLifeBalance
Outliers: [] 
Count:  0 

YearsAtCompany
Outliers: [25, 22, 22, 27, 21, 22, 37, 25, 20, 40, 20, 24, 20, 24, 33, 20, 19, 22, 33, 24, 19, 21, 20, 36, 20, 20, 22, 24, 21, 21, 25, 21, 29, 20, 27, 20, 31, 32, 20, 20, 21, 22, 22, 34, 24, 26, 31, 20, 31, 26, 19, 21, 21, 32, 21, 19, 20, 22, 20, 21, 26, 20, 22, 24, 33, 29, 25, 21, 19, 19, 20, 19, 33, 19, 19, 20, 20, 20, 20, 20, 32, 20, 21, 33, 36, 26, 30, 22, 23, 23, 21, 21, 22, 22, 19, 22, 19, 22, 20, 20, 20, 22, 20, 20] 
Count:  104 

YearsInCurrentRole
Outliers: [15, 16, 18, 15, 18, 17, 16, 15, 16, 15, 16, 16, 15, 16, 17, 15, 15, 15, 17, 17, 16] 
Count:  21 

YearsSinceLastPromotion
Outliers: [8, 15, 8, 8, 9, 13, 12, 10, 11, 9, 12, 15, 15, 15, 9, 11, 11, 9, 12, 11, 15, 11, 10, 9, 11, 9, 8, 11, 11, 8, 13, 9, 9, 12, 10, 11, 15, 13, 9, 11, 10, 8, 8, 11, 9, 11, 12, 11, 14, 13, 14, 8, 11, 15, 10, 11, 11, 15, 11, 13, 11, 13, 15, 8, 13, 15, 11, 14, 15, 15, 9, 11, 9, 8, 9, 15, 11, 12, 9, 8, 10, 14, 8, 13, 13, 12, 14, 8, 8, 8, 14, 14, 8, 12, 13, 14, 14, 12, 11, 8, 11, 9, 12, 8, 9, 11, 9] 
Count:  107 

YearsWithCurrManager
Outliers: [17, 15, 15, 15, 15, 17, 16, 17, 15, 17, 17, 17, 17, 16] 
Count:  14

def remove_outliers(c_name):
    outliers = iqr_outliers(ibm[c_name])

    while (len(outliers)!=0):
        for i in outliers:
            ibm.drop(ibm.loc[ibm[c_name]==i].index, inplace = True)
        outliers = iqr_outliers(ibm[c_name])

remove_outliers('MonthlyIncome')

Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880] 
Count:  114 

Outliers: [15427, 13458, 14756, 13245, 13664, 13503, 13549, 13872, 13734, 13591, 16064, 13675, 13496, 13603, 13525, 16015, 13964, 15992, 14336, 13212, 16555, 14118, 13610, 13237, 16184, 15402, 14814, 13770, 16307, 13826, 14275, 13582, 14852, 13194, 13973, 13726, 13320, 13120, 13499, 13758, 13191, 16124, 13577, 14026, 13142, 13695, 13402, 13247, 14732, 16422, 13757, 16032, 16328, 14411, 16437, 15202, 16413, 13269, 13966, 15972, 15379, 12936, 12965, 13116, 13464, 16291, 15787, 13225, 13348, 13341, 13206, 13744, 13570] 
Count:  73 

Outliers: [11994, 12490, 12185, 11849, 11996, 12061, 11878, 12504, 11935, 12808, 11836, 12742, 11904, 12169, 11916, 11957, 12031] 
Count:  17 

Outliers: [11713, 11691] 
Count:  2 

Outliers: [11631] 
Count:  1 

Outliers: [] 
Count:  0

ibm

	Age	Attrition	BusinessTravel	DailyRate	Dp_Sales&Development	Dp_Sales	Dp_HumanResources	DistanceFromHome	Education	EnvironmentSatisfaction	EF_Life Sciences	EF_Medical	EF_Marketing	EF_TechnicalDegree	EF_HumanResources	EF_Other	Gender	HourlyRate	JobInvolvement	JobLevel	JobSatisfaction	JR_HealthcareRepresentive	JR_HumanResource	JR_LaboratoryTechnician	JR_Manager	JR_ManufacturingDirector	JR_ResearchDirector	JR_ResearchScientist	JR_SalesExecutive	JR_SalesRepresentative	MonthlyIncome	MonthlyRate	NumCompaniesWorked	MS_Married	MS_Single	MS_Divorced	OverTime	PercentSalaryHike	PerformanceRating	RelationshipSatisfaction	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
0	41	1	1	1102	0	1	0	1	2	2	1	0	0	0	0	0	1	94	3	2	4	0	0	0	0	0	0	0	1	0	5993	19479	8	0	1	0	1	11	3	1	0	8	0	1	6	4	0	5
1	49	0	2	279	1	0	0	8	1	3	1	0	0	0	0	0	0	61	2	2	2	0	0	0	0	0	0	1	0	0	5130	24907	1	1	0	0	0	23	4	4	1	10	3	3	10	7	1	7
2	37	1	1	1373	1	0	0	2	2	4	0	0	0	0	0	1	0	92	2	1	3	0	0	1	0	0	0	0	0	0	2090	2396	6	0	1	0	1	15	3	2	0	7	3	3	0	0	0	0
3	33	0	2	1392	1	0	0	3	4	4	1	0	0	0	0	0	1	56	3	1	3	0	0	0	0	0	0	1	0	0	2909	23159	1	1	0	0	1	11	3	3	0	8	3	3	8	7	3	0
4	27	0	1	591	1	0	0	2	1	1	0	1	0	0	0	0	0	40	3	1	2	0	0	1	0	0	0	0	0	0	3468	16632	9	1	0	0	0	12	3	4	1	6	3	3	2	2	2	2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1465	36	0	2	884	1	0	0	23	2	3	0	1	0	0	0	0	0	41	4	2	4	0	0	1	0	0	0	0	0	0	2571	12290	4	1	0	0	0	17	3	3	1	17	3	3	5	2	0	3
1466	39	0	1	613	1	0	0	6	1	4	0	1	0	0	0	0	0	42	2	3	1	1	0	0	0	0	0	0	0	0	9991	21457	4	1	0	0	0	15	3	1	1	9	5	3	7	7	1	7
1467	27	0	1	155	1	0	0	4	3	2	1	0	0	0	0	0	0	87	4	2	2	0	0	0	0	1	0	0	0	0	6142	5174	1	1	0	0	1	20	4	2	1	6	0	3	6	2	0	3
1468	49	0	2	1023	0	1	0	2	3	4	0	1	0	0	0	0	0	63	2	2	2	0	0	0	0	0	0	0	1	0	5390	13243	2	1	0	0	0	14	3	4	0	17	3	2	9	6	0	8
1469	34	0	1	628	1	0	0	8	3	2	0	1	0	0	0	0	0	82	4	2	3	0	0	1	0	0	0	0	0	0	4404	10228	2	1	0	0	0	12	3	1	0	6	3	4	4	3	1	2

1263 rows × 48 columns

Classification

Support Vector Machine (prepared by Teh Liang Sean)

# import important library to do SVM
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics

#The target for SVM will be the attrition of IBM employees to know whether the employees will continue stay or leave IBM
x_svm_find = ibm.drop(columns = 'Attrition')
y_svm = ibm['Attrition']

# Try use SelectKBest and chi-squared (chi²) statistical test for non-negative feature to find top 15 best features
#Import library
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#Use SelectKBest class to find top 15 best features
best_15_features = SelectKBest(score_func=chi2, k=15)
fit = best_15_features.fit(x_svm_find,y_svm)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x_svm_find.columns)
#Try to concat two dataframes for a better visualization 
top_15_feature_scores = pd.concat([dfcolumns,dfscores],axis=1)
#Name the dataframe columns
top_15_feature_scores.columns = ['Features','Score']  
#Show 15 best features
print(top_15_feature_scores.nlargest(15,'Score'))

                     Features         Score
29              MonthlyIncome  26471.159476
30                MonthlyRate   1308.443569
2                   DailyRate   1111.594737
44         YearsInCurrentRole    109.263859
43             YearsAtCompany    103.805057
46       YearsWithCurrManager    100.636711
40          TotalWorkingYears     95.843571
35                   OverTime     60.367656
6            DistanceFromHome     57.197704
0                         Age     46.705340
28     JR_SalesRepresentative     27.299127
33                  MS_Single     26.251695
39           StockOptionLevel     24.376114
20  JR_HealthcareRepresentive     10.935616
24   JR_ManufacturingDirector      9.987076

ibm_svm_features_df = pd.DataFrame()
# Set up data to do SVM using top 15 best features identified
ibm_svm_features_df.insert(0,'MonthlyIncome',ibm['MonthlyIncome'])
ibm_svm_features_df.insert(1,'MonthlyRate',ibm['MonthlyRate'])
ibm_svm_features_df.insert(2,'DailyRate',ibm['DailyRate'])
ibm_svm_features_df.insert(3,'YearsInCurrentRole',ibm['YearsInCurrentRole'])
ibm_svm_features_df.insert(4,'YearsAtCompany',ibm['YearsAtCompany'])
ibm_svm_features_df.insert(5,'YearsWithCurrManager',ibm['YearsWithCurrManager'])
ibm_svm_features_df.insert(6,'TotalWorkingYears',ibm['TotalWorkingYears'])
ibm_svm_features_df.insert(7,'OverTime',ibm['OverTime'])
ibm_svm_features_df.insert(8,'DistanceFromHome',ibm['DistanceFromHome'])
ibm_svm_features_df.insert(9,'Age',ibm['Age'])
ibm_svm_features_df.insert(10,'JR_SalesRepresentative',ibm['JR_SalesRepresentative'])
ibm_svm_features_df.insert(11,'MS_Single',ibm['MS_Single'])
ibm_svm_features_df.insert(12,'StockOptionLevel',ibm['StockOptionLevel'])
ibm_svm_features_df.insert(13,'JR_HealthcareRepresentive ',ibm['JR_HealthcareRepresentive'])
ibm_svm_features_df.insert(14,'JR_ManufacturingDirector',ibm['JR_ManufacturingDirector'])

ibm_svm_features_df

	MonthlyIncome	MonthlyRate	DailyRate	YearsInCurrentRole	YearsAtCompany	YearsWithCurrManager	TotalWorkingYears	OverTime	DistanceFromHome	Age	JR_SalesRepresentative	MS_Single	StockOptionLevel	JR_HealthcareRepresentive	JR_ManufacturingDirector
0	5993	19479	1102	4	6	5	8	1	1	41	0	1	0	0	0
1	5130	24907	279	7	10	7	10	0	8	49	0	0	1	0	0
2	2090	2396	1373	0	0	0	7	1	2	37	0	1	0	0	0
3	2909	23159	1392	7	8	0	8	1	3	33	0	0	0	0	0
4	3468	16632	591	2	2	2	6	0	2	27	0	0	1	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1465	2571	12290	884	2	5	3	17	0	23	36	0	0	1	0	0
1466	9991	21457	613	7	7	7	9	0	6	39	0	0	1	1	0
1467	6142	5174	155	2	6	3	6	1	4	27	0	0	1	0	1
1468	5390	13243	1023	6	9	8	17	0	2	49	0	0	0	0	0
1469	4404	10228	628	3	4	2	6	0	8	34	0	0	0	0	0

1263 rows × 15 columns

#assignment ibm_svm_features to x
x_svm = ibm_svm_features_df

#Try to scale all the numeric data of each features to make svm model train more effective
from sklearn.preprocessing import StandardScaler
s_scaler = StandardScaler()
x_scaled_svm = s_scaler.fit_transform(x_svm)

#Try to use tomek link to solve undersampling problem as attriction too few 'yes' value for imbalanced classification 
from imblearn.under_sampling import TomekLinks

tl_svm = TomekLinks(sampling_strategy='not minority')
x_tl_svm, y_tl_svm= tl_svm.fit_resample(x_svm, y_svm)

#Train the modals with 80% and test 20% of the data
x_train_svm, x_test_svm, y_train_svm, y_test_svm = train_test_split(x_tl_svm,y_tl_svm, test_size=0.2,random_state=40, stratify=y_tl_svm)

# Model 1 is using the manual tuning for some hyperparameters of SVM
model_1_svm=svm.SVC(C=2,kernel='sigmoid',gamma='scale',coef0=0.6,random_state=40,probability=True)
model_1_svm.fit(x_train_svm,y_train_svm)
y_predict_1_svm=model_1_svm.predict(x_test_svm)

# Modal 2 is using GridSearchCV to find the best hyperparameters for SVM using cross validation
# Only some hyperparameters are tuned 

# import GridSearchCV library
from sklearn.model_selection import GridSearchCV

#Try to tune the hyperparameter with
#kernel type: linear/rbf/sigmoid
#C which is the regularization parameter: range 0-1 increase by 0.1
#coef0 that is the independent term for kernel method (only for sigmoid): range 0.0-0.5 increase by 0.1
#degree for the polynomial ('poly') kernel method: range 0-5 increase by 1
#gamma that are kernel coefficient for 'rbf' and 'poly': scale/auto

param_grid={'kernel':('linear','rbf','sigmoid'),
        'C':[i for i in np.arange(1.0,3.0,0.1)],
        'coef0':[y for y in np.arange(0.0,1.5,0.1)],
        'degree':[z for z in np.arange(3,6,1)],
        'gamma':('auto','scale'),}
# set random state to 40
find_best_para_model=svm.SVC(random_state=40)
Grid_search_svm=GridSearchCV(find_best_para_model,param_grid, n_jobs=-1,verbose=2,cv=4)
# this may take some time to run
Grid_search_svm.fit(x_train_svm,y_train_svm)

Fitting 4 folds for each of 5400 candidates, totalling 21600 fits

# Show the best hyperparameter found by grid search
Grid_search_svm.best_params_

{'C': 2.8000000000000016,
 'coef0': 0.0,
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf'}

# Use hyperparameter found grid search to build modal 
model_2_svm=svm.SVC(C=2.8000000000000016,kernel='rbf',degree=3,gamma='scale',coef0=0.0,probability=True,random_state=40)
model_2_svm.fit(x_train_svm,y_train_svm)
y_predict_2_svm=model_2_svm.predict(x_test_svm)

#Evaluate accurracy of classification result
print('Accuracy of prediction classification result for 2 model')
print('Hyperparameters that try to tune manually (model 1): ',metrics.accuracy_score(y_test_svm, y_predict_1_svm))
print('Best hyperparameters found using GridSearchCV (model 2): ',metrics.accuracy_score(y_test_svm, y_predict_2_svm))

Accuracy of prediction classification result for 2 model
Hyperparameters that try to tune manually (model 1):  0.7416666666666667
Best hyperparameters found using GridSearchCV (model 2):  0.8166666666666667

#Evaluating classification result by confusion matrix
from sklearn.metrics import confusion_matrix
print (confusion_matrix(y_test_svm, y_predict_2_svm,[0,1]))

#Evaluating classification result by Precision, Recall and F1-Measure
from sklearn.metrics import classification_report
print (classification_report(y_test_svm, y_predict_2_svm))

[[183  12]
 [ 32  13]]
              precision    recall  f1-score   support

           0       0.85      0.94      0.89       195
           1       0.52      0.29      0.37        45

    accuracy                           0.82       240
   macro avg       0.69      0.61      0.63       240
weighted avg       0.79      0.82      0.79       240

C:\Users\USER\anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass labels=[0, 1] as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
  warnings.warn(f"Pass {args_msg} as keyword args. From version "

from matplotlib import pyplot as plt
#Evaluating classification result by ROC curves
from sklearn.metrics import roc_curve
y_pred_prob_svm = model_2_svm.predict_proba(x_test_svm)[:,1]
fpr, tpr, threshold = roc_curve(y_test_svm, y_pred_prob_svm)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr,tpr)
auc = roc_auc_score(y_test_svm,  y_pred_prob_svm)
plt.title(f'AUC: {auc}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()