import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Location of the HR employee-attrition CSV that the rest of the notebook analyses.
DATA_PATH = '/WA_Fn-UseC_-HR-Employee-Attrition.csv'
ibm = pd.read_csv(DATA_PATH)

# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# (rows, columns) of the raw data.
ibm.shape
(1470, 35)
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
ibm.describe()
Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | 2.728571 | 6502.931293 | 14313.103401 | 2.693197 | 15.209524 | 3.153741 | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | 1.102846 | 4707.956783 | 7117.786044 | 2.498009 | 3.659938 | 0.360824 | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
min | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | 1.000000 | 1009.000000 | 2094.000000 | 0.000000 | 11.000000 | 3.000000 | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | 2.000000 | 2911.000000 | 8047.000000 | 1.000000 | 12.000000 | 3.000000 | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
50% | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | 3.000000 | 4919.000000 | 14235.500000 | 2.000000 | 14.000000 | 3.000000 | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
75% | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | 4.000000 | 8379.000000 | 20461.500000 | 4.000000 | 18.000000 | 3.000000 | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
max | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | 4.000000 | 19999.000000 | 26999.000000 | 9.000000 | 25.000000 | 4.000000 | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
import statistics

# Print the most frequent value of every column (numeric and categorical alike).
for column_name in ibm.columns:
    most_common = statistics.mode(ibm[column_name])
    print(column_name, " mode: ", most_common)
Age mode: 35
Attrition mode: No
BusinessTravel mode: Travel_Rarely
DailyRate mode: 691
Department mode: Research & Development
DistanceFromHome mode: 2
Education mode: 3
EducationField mode: Life Sciences
EmployeeCount mode: 1
EmployeeNumber mode: 1
EnvironmentSatisfaction mode: 3
Gender mode: Male
HourlyRate mode: 66
JobInvolvement mode: 3
JobLevel mode: 1
JobRole mode: Sales Executive
JobSatisfaction mode: 4
MaritalStatus mode: Married
MonthlyIncome mode: 2342
MonthlyRate mode: 9150
NumCompaniesWorked mode: 1
Over18 mode: Y
OverTime mode: No
PercentSalaryHike mode: 11
PerformanceRating mode: 3
RelationshipSatisfaction mode: 3
StandardHours mode: 80
StockOptionLevel mode: 0
TotalWorkingYears mode: 10
TrainingTimesLastYear mode: 2
WorkLifeBalance mode: 3
YearsAtCompany mode: 5
YearsInCurrentRole mode: 2
YearsSinceLastPromotion mode: 0
YearsWithCurrManager mode: 2
# Per-column dtype and non-null count, to spot missing values and non-numeric columns.
ibm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 1470 non-null int64
1 Attrition 1470 non-null object
2 BusinessTravel 1470 non-null object
3 DailyRate 1470 non-null int64
4 Department 1470 non-null object
5 DistanceFromHome 1470 non-null int64
6 Education 1470 non-null int64
7 EducationField 1470 non-null object
8 EmployeeCount 1470 non-null int64
9 EmployeeNumber 1470 non-null int64
10 EnvironmentSatisfaction 1470 non-null int64
11 Gender 1470 non-null object
12 HourlyRate 1470 non-null int64
13 JobInvolvement 1470 non-null int64
14 JobLevel 1470 non-null int64
15 JobRole 1470 non-null object
16 JobSatisfaction 1470 non-null int64
17 MaritalStatus 1470 non-null object
18 MonthlyIncome 1470 non-null int64
19 MonthlyRate 1470 non-null int64
20 NumCompaniesWorked 1470 non-null int64
21 Over18 1470 non-null object
22 OverTime 1470 non-null object
23 PercentSalaryHike 1470 non-null int64
24 PerformanceRating 1470 non-null int64
25 RelationshipSatisfaction 1470 non-null int64
26 StandardHours 1470 non-null int64
27 StockOptionLevel 1470 non-null int64
28 TotalWorkingYears 1470 non-null int64
29 TrainingTimesLastYear 1470 non-null int64
30 WorkLifeBalance 1470 non-null int64
31 YearsAtCompany 1470 non-null int64
32 YearsInCurrentRole 1470 non-null int64
33 YearsSinceLastPromotion 1470 non-null int64
34 YearsWithCurrManager 1470 non-null int64
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
# Remove columns that are not used as features. EmployeeCount and StandardHours
# are constant (std 0.0 in the describe() output above); EmployeeNumber and
# Over18 are dropped as well.
ibm.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'],
    inplace=True,
)

# drop_duplicates() returns a NEW frame by default; without inplace=True (or
# re-assignment) the de-duplicated result is thrown away and ibm is unchanged.
ibm.drop_duplicates(inplace=True)

# Display the cleaned frame.
ibm
Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Yes | 11 | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | No | 23 | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Yes | 15 | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Yes | 11 | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | No | 12 | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1465 | 36 | No | Travel_Frequently | 884 | Research & Development | 23 | 2 | Medical | 3 | Male | 41 | 4 | 2 | Laboratory Technician | 4 | Married | 2571 | 12290 | 4 | No | 17 | 3 | 3 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
1466 | 39 | No | Travel_Rarely | 613 | Research & Development | 6 | 1 | Medical | 4 | Male | 42 | 2 | 3 | Healthcare Representative | 1 | Married | 9991 | 21457 | 4 | No | 15 | 3 | 1 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
1467 | 27 | No | Travel_Rarely | 155 | Research & Development | 4 | 3 | Life Sciences | 2 | Male | 87 | 4 | 2 | Manufacturing Director | 2 | Married | 6142 | 5174 | 1 | Yes | 20 | 4 | 2 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
1468 | 49 | No | Travel_Frequently | 1023 | Sales | 2 | 3 | Medical | 4 | Male | 63 | 2 | 2 | Sales Executive | 2 | Married | 5390 | 13243 | 2 | No | 14 | 3 | 4 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
1469 | 34 | No | Travel_Rarely | 628 | Research & Development | 8 | 3 | Medical | 2 | Male | 82 | 4 | 2 | Laboratory Technician | 3 | Married | 4404 | 10228 | 2 | No | 12 | 3 | 1 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 31 columns
# Count missing values per column.
ibm.isnull().sum()
Age 0 Attrition 0 BusinessTravel 0 DailyRate 0 Department 0 DistanceFromHome 0 Education 0 EducationField 0 EnvironmentSatisfaction 0 Gender 0 HourlyRate 0 JobInvolvement 0 JobLevel 0 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 MonthlyRate 0 NumCompaniesWorked 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 RelationshipSatisfaction 0 StockOptionLevel 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
# ---------------------------------------------------------------------------
# Encode the categorical (object) columns as numbers.
# ---------------------------------------------------------------------------

# Binary / ordered categoricals are mapped to integer codes in place.
# Attrition: 0 - No, 1 - Yes
ibm.replace({'Attrition': {'Yes': 1, 'No': 0}}, inplace=True)
# BusinessTravel: 0 - Non-Travel, 1 - Travel_Rarely, 2 - Travel_Frequently
ibm.replace({'BusinessTravel': {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}}, inplace=True)
# Gender: 0 - Male, 1 - Female
ibm.replace({'Gender': {'Male': 0, 'Female': 1}}, inplace=True)
# OverTime: 0 - No, 1 - Yes
ibm.replace({'OverTime': {'No': 0, 'Yes': 1}}, inplace=True)
# The former "replace Over18" step was removed: Over18 has already been dropped
# from ibm earlier in the file, so that replace could never match anything.

# Unordered categoricals are one-hot encoded. Each entry is
#   (source column, position of the first dummy, [(new column name, category), ...])
# and the dummies are inserted at consecutive positions starting there, after
# which the source column is dropped. The positions and names are exactly the
# ones used before, so the resulting column order is unchanged.
# NOTE(review): 'Dp_Sales&Development' actually holds the 'Research & Development'
# dummy, and 'JR_HealthcareRepresentive' is misspelled; both names are kept
# as-is because later code and outputs refer to them.
one_hot_plan = [
    ('Department', 5, [
        ('Dp_Sales&Development', 'Research & Development'),
        ('Dp_Sales', 'Sales'),
        ('Dp_HumanResources', 'Human Resources'),
    ]),
    ('EducationField', 11, [
        ('EF_Life Sciences', 'Life Sciences'),
        ('EF_Medical', 'Medical'),
        ('EF_Marketing', 'Marketing'),
        ('EF_TechnicalDegree', 'Technical Degree'),
        ('EF_HumanResources', 'Human Resources'),
        ('EF_Other', 'Other'),
    ]),
    ('JobRole', 23, [
        ('JR_HealthcareRepresentive', 'Healthcare Representative'),
        ('JR_HumanResource', 'Human Resources'),
        ('JR_LaboratoryTechnician', 'Laboratory Technician'),
        ('JR_Manager', 'Manager'),
        ('JR_ManufacturingDirector', 'Manufacturing Director'),
        ('JR_ResearchDirector', 'Research Director'),
        ('JR_ResearchScientist', 'Research Scientist'),
        ('JR_SalesExecutive', 'Sales Executive'),
        ('JR_SalesRepresentative', 'Sales Representative'),
    ]),
    ('MaritalStatus', 34, [
        ('MS_Married', 'Married'),
        ('MS_Single', 'Single'),
        ('MS_Divorced', 'Divorced'),
    ]),
]

for source_column, first_position, dummy_columns in one_hot_plan:
    # dtype=int pins the dummies to 0/1 integers regardless of the pandas
    # default dummy dtype, so the quantile/IQR arithmetic applied to every
    # column later on operates on plain numbers.
    dummy = pd.get_dummies(ibm[source_column], dtype=int)
    for offset, (new_name, category) in enumerate(dummy_columns):
        ibm.insert(first_position + offset, new_name, dummy[category])
    ibm.drop(columns=source_column, inplace=True)
def iqr_outliers(data, whisker=1.5):
    """Return the values of *data* lying outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of *data*. Values exactly on
    a fence are not reported. A summary line with the outliers and their
    count is printed as a side effect.

    Parameters
    ----------
    data : pandas.Series
        Numeric column to inspect.
    whisker : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The default
        reproduces the previously hard-coded behaviour.

    Returns
    -------
    list
        The outlying values in their original order (duplicates preserved).
    """
    firstQuartile = data.quantile(0.25)
    thirdQuartile = data.quantile(0.75)
    iqr = thirdQuartile - firstQuartile
    Lower_bound = firstQuartile - whisker * iqr
    Upper_bound = thirdQuartile + whisker * iqr

    # One vectorized comparison instead of a Python-level loop over every value.
    is_outlier = (data > Upper_bound) | (data < Lower_bound)
    out = data[is_outlier].tolist()

    print("Outliers:", out, "\nCount: ", len(out), "\n")
    return out
# Report the IQR outliers of every column, one column at a time.
for column_name, column_values in ibm.items():
    print(column_name)
    iqr_outliers(column_values)
Age
Outliers: []
Count: 0
Attrition
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 237
BusinessTravel
Outliers: [2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2]
Count: 427
DailyRate
Outliers: []
Count: 0
Dp_Sales&Development
Outliers: []
Count: 0
Dp_Sales
Outliers: []
Count: 0
Dp_HumanResources
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 63
DistanceFromHome
Outliers: []
Count: 0
Education
Outliers: []
Count: 0
EnvironmentSatisfaction
Outliers: []
Count: 0
EF_Life Sciences
Outliers: []
Count: 0
EF_Medical
Outliers: []
Count: 0
EF_Marketing
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 159
EF_TechnicalDegree
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 132
EF_HumanResources
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 27
EF_Other
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 82
Gender
Outliers: []
Count: 0
HourlyRate
Outliers: []
Count: 0
JobInvolvement
Outliers: []
Count: 0
JobLevel
Outliers: []
Count: 0
JobSatisfaction
Outliers: []
Count: 0
JR_HealthcareRepresentive
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 131
JR_HumanResource
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 52
JR_LaboratoryTechnician
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 259
JR_Manager
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 102
JR_ManufacturingDirector
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 145
JR_ResearchDirector
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 80
JR_ResearchScientist
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 292
JR_SalesExecutive
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 326
JR_SalesRepresentative
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 83
MonthlyIncome
Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880]
Count: 114
MonthlyRate
Outliers: []
Count: 0
NumCompaniesWorked
Outliers: [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
Count: 52
MS_Married
Outliers: []
Count: 0
MS_Single
Outliers: []
Count: 0
MS_Divorced
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 327
OverTime
Outliers: []
Count: 0
PercentSalaryHike
Outliers: []
Count: 0
PerformanceRating
Outliers: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
Count: 226
RelationshipSatisfaction
Outliers: []
Count: 0
StockOptionLevel
Outliers: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Count: 85
TotalWorkingYears
Outliers: [31, 29, 37, 38, 30, 40, 36, 34, 32, 33, 37, 30, 36, 31, 33, 32, 37, 31, 32, 32, 30, 34, 30, 40, 29, 35, 31, 33, 31, 29, 32, 30, 33, 30, 29, 31, 32, 33, 36, 34, 31, 36, 33, 31, 29, 33, 29, 32, 31, 35, 29, 32, 34, 36, 32, 30, 36, 29, 34, 37, 29, 29, 35]
Count: 63
TrainingTimesLastYear
Outliers: [0, 5, 5, 5, 6, 5, 5, 5, 6, 6, 0, 0, 0, 5, 0, 5, 5, 5, 6, 6, 5, 0, 6, 5, 5, 0, 5, 5, 6, 5, 5, 5, 0, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 0, 0, 5, 5, 5, 6, 6, 5, 0, 5, 0, 5, 5, 0, 6, 0, 5, 5, 6, 6, 5, 6, 5, 0, 5, 5, 5, 5, 0, 6, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 0, 5, 0, 5, 5, 6, 5, 6, 5, 0, 5, 5, 0, 6, 6, 5, 6, 0, 5, 0, 6, 6, 6, 6, 5, 5, 0, 5, 0, 0, 6, 0, 6, 5, 6, 5, 5, 0, 5, 6, 6, 5, 5, 0, 0, 6, 0, 0, 5, 0, 5, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6, 0, 6, 6, 5, 5, 0, 0, 6, 6, 0, 5, 0, 0, 0, 0, 0, 5, 5, 6, 5, 5, 0, 5, 5, 0, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 0, 0, 5, 5, 5, 5, 6, 0, 0, 6, 6, 6, 6, 5, 5, 5, 6, 5, 0, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 0, 5, 5, 5, 5, 5, 0, 0, 0, 6, 5, 6, 6, 5, 6, 0, 6, 6, 5, 6, 6, 5, 5, 5, 0]
Count: 238
WorkLifeBalance
Outliers: []
Count: 0
YearsAtCompany
Outliers: [25, 22, 22, 27, 21, 22, 37, 25, 20, 40, 20, 24, 20, 24, 33, 20, 19, 22, 33, 24, 19, 21, 20, 36, 20, 20, 22, 24, 21, 21, 25, 21, 29, 20, 27, 20, 31, 32, 20, 20, 21, 22, 22, 34, 24, 26, 31, 20, 31, 26, 19, 21, 21, 32, 21, 19, 20, 22, 20, 21, 26, 20, 22, 24, 33, 29, 25, 21, 19, 19, 20, 19, 33, 19, 19, 20, 20, 20, 20, 20, 32, 20, 21, 33, 36, 26, 30, 22, 23, 23, 21, 21, 22, 22, 19, 22, 19, 22, 20, 20, 20, 22, 20, 20]
Count: 104
YearsInCurrentRole
Outliers: [15, 16, 18, 15, 18, 17, 16, 15, 16, 15, 16, 16, 15, 16, 17, 15, 15, 15, 17, 17, 16]
Count: 21
YearsSinceLastPromotion
Outliers: [8, 15, 8, 8, 9, 13, 12, 10, 11, 9, 12, 15, 15, 15, 9, 11, 11, 9, 12, 11, 15, 11, 10, 9, 11, 9, 8, 11, 11, 8, 13, 9, 9, 12, 10, 11, 15, 13, 9, 11, 10, 8, 8, 11, 9, 11, 12, 11, 14, 13, 14, 8, 11, 15, 10, 11, 11, 15, 11, 13, 11, 13, 15, 8, 13, 15, 11, 14, 15, 15, 9, 11, 9, 8, 9, 15, 11, 12, 9, 8, 10, 14, 8, 13, 13, 12, 14, 8, 8, 8, 14, 14, 8, 12, 13, 14, 14, 12, 11, 8, 11, 9, 12, 8, 9, 11, 9]
Count: 107
YearsWithCurrManager
Outliers: [17, 15, 15, 15, 15, 17, 16, 17, 15, 17, 17, 17, 17, 16]
Count: 14
def remove_outliers(c_name):
    """Drop rows of the module-level ``ibm`` frame whose *c_name* value is an IQR outlier.

    The IQR fences are recomputed after every removal pass, and the passes
    repeat until ``iqr_outliers`` reports nothing, so more rows can be removed
    than a single pass over the original data would flag. ``ibm`` is modified
    in place; nothing is returned. Each pass prints the summary emitted by
    ``iqr_outliers``.

    Parameters
    ----------
    c_name : str
        Name of the column in ``ibm`` to clean.
    """
    outliers = iqr_outliers(ibm[c_name])
    while outliers:
        # Remove every row holding any of the reported values in one drop,
        # rather than issuing a separate DataFrame.drop per outlier value.
        ibm.drop(ibm.index[ibm[c_name].isin(outliers)], inplace=True)
        outliers = iqr_outliers(ibm[c_name])
# Remove MonthlyIncome outliers from ibm in place; remove_outliers repeats the
# IQR pass until no outliers are reported.
remove_outliers('MonthlyIncome')
Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880]
Count: 114
Outliers: [15427, 13458, 14756, 13245, 13664, 13503, 13549, 13872, 13734, 13591, 16064, 13675, 13496, 13603, 13525, 16015, 13964, 15992, 14336, 13212, 16555, 14118, 13610, 13237, 16184, 15402, 14814, 13770, 16307, 13826, 14275, 13582, 14852, 13194, 13973, 13726, 13320, 13120, 13499, 13758, 13191, 16124, 13577, 14026, 13142, 13695, 13402, 13247, 14732, 16422, 13757, 16032, 16328, 14411, 16437, 15202, 16413, 13269, 13966, 15972, 15379, 12936, 12965, 13116, 13464, 16291, 15787, 13225, 13348, 13341, 13206, 13744, 13570]
Count: 73
Outliers: [11994, 12490, 12185, 11849, 11996, 12061, 11878, 12504, 11935, 12808, 11836, 12742, 11904, 12169, 11916, 11957, 12031]
Count: 17
Outliers: [11713, 11691]
Count: 2
Outliers: [11631]
Count: 1
Outliers: []
Count: 0
# Display the frame after encoding and outlier removal.
ibm
Age | Attrition | BusinessTravel | DailyRate | Dp_Sales&Development | Dp_Sales | Dp_HumanResources | DistanceFromHome | Education | EnvironmentSatisfaction | EF_Life Sciences | EF_Medical | EF_Marketing | EF_TechnicalDegree | EF_HumanResources | EF_Other | Gender | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | JR_HealthcareRepresentive | JR_HumanResource | JR_LaboratoryTechnician | JR_Manager | JR_ManufacturingDirector | JR_ResearchDirector | JR_ResearchScientist | JR_SalesExecutive | JR_SalesRepresentative | MonthlyIncome | MonthlyRate | NumCompaniesWorked | MS_Married | MS_Single | MS_Divorced | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | 1 | 1 | 1102 | 0 | 1 | 0 | 1 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 94 | 3 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5993 | 19479 | 8 | 0 | 1 | 0 | 1 | 11 | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | 0 | 2 | 279 | 1 | 0 | 0 | 8 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 61 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 5130 | 24907 | 1 | 1 | 0 | 0 | 0 | 23 | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | 1 | 1 | 1373 | 1 | 0 | 0 | 2 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 92 | 2 | 1 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2090 | 2396 | 6 | 0 | 1 | 0 | 1 | 15 | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | 0 | 2 | 1392 | 1 | 0 | 0 | 3 | 4 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 56 | 3 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2909 | 23159 | 1 | 1 | 0 | 0 | 1 | 11 | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | 0 | 1 | 591 | 1 | 0 | 0 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 40 | 3 | 1 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3468 | 16632 | 9 | 1 | 0 | 0 | 0 | 12 | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1465 | 36 | 0 | 2 | 884 | 1 | 0 | 0 | 23 | 2 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 41 | 4 | 2 | 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2571 | 12290 | 4 | 1 | 0 | 0 | 0 | 17 | 3 | 3 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
1466 | 39 | 0 | 1 | 613 | 1 | 0 | 0 | 6 | 1 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 42 | 2 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9991 | 21457 | 4 | 1 | 0 | 0 | 0 | 15 | 3 | 1 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
1467 | 27 | 0 | 1 | 155 | 1 | 0 | 0 | 4 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 87 | 4 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6142 | 5174 | 1 | 1 | 0 | 0 | 1 | 20 | 4 | 2 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
1468 | 49 | 0 | 2 | 1023 | 0 | 1 | 0 | 2 | 3 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 63 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5390 | 13243 | 2 | 1 | 0 | 0 | 0 | 14 | 3 | 4 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
1469 | 34 | 0 | 1 | 628 | 1 | 0 | 0 | 8 | 3 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 82 | 4 | 2 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4404 | 10228 | 2 | 1 | 0 | 0 | 0 | 12 | 3 | 1 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1263 rows × 48 columns
# Import the libraries needed to train and evaluate the SVM
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Target for the SVM: Attrition, i.e. whether an employee leaves IBM or stays.
# Every other column of `ibm` is a candidate feature.
y_svm = ibm['Attrition']
x_svm_find = ibm.drop('Attrition', axis=1)

# Rank the candidate features with SelectKBest and the chi-squared test
# (chi2 requires non-negative feature values) and keep the 15 best.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# NOTE(review): the features are scored on their raw scale here; chi2 scores
# appear to grow with feature magnitude (see the very large MonthlyIncome
# score), so confirm that the ranking is not just a unit effect.
best_15_features = SelectKBest(chi2, k=15)
fit = best_15_features.fit(X=x_svm_find, y=y_svm)

# One frame with the feature names, one with their chi2 scores.
dfcolumns = pd.DataFrame(x_svm_find.columns)
dfscores = pd.DataFrame(fit.scores_)

# Put name and score side by side so the ranking is easy to read.
top_15_feature_scores = pd.concat([dfcolumns, dfscores], axis='columns')
top_15_feature_scores.columns = ['Features', 'Score']

# Display the 15 highest-scoring features.
print(top_15_feature_scores.nlargest(15, 'Score'))
Features Score
29 MonthlyIncome 26471.159476
30 MonthlyRate 1308.443569
2 DailyRate 1111.594737
44 YearsInCurrentRole 109.263859
43 YearsAtCompany 103.805057
46 YearsWithCurrManager 100.636711
40 TotalWorkingYears 95.843571
35 OverTime 60.367656
6 DistanceFromHome 57.197704
0 Age 46.705340
28 JR_SalesRepresentative 27.299127
33 MS_Single 26.251695
39 StockOptionLevel 24.376114
20 JR_HealthcareRepresentive 10.935616
24 JR_ManufacturingDirector 9.987076
# Build the SVM feature table from the 15 best features identified by the
# chi-squared ranking, in descending score order.
svm_feature_names = [
    'MonthlyIncome',
    'MonthlyRate',
    'DailyRate',
    'YearsInCurrentRole',
    'YearsAtCompany',
    'YearsWithCurrManager',
    'TotalWorkingYears',
    'OverTime',
    'DistanceFromHome',
    'Age',
    'JR_SalesRepresentative',
    'MS_Single',
    'StockOptionLevel',
    # Previously inserted under the name 'JR_HealthcareRepresentive ' (with a
    # trailing space), which did not match the source column name.
    'JR_HealthcareRepresentive',
    'JR_ManufacturingDirector',
]
# .copy() keeps the feature table independent of `ibm`, as the original
# column-by-column construction did.
ibm_svm_features_df = ibm[svm_feature_names].copy()
ibm_svm_features_df
MonthlyIncome | MonthlyRate | DailyRate | YearsInCurrentRole | YearsAtCompany | YearsWithCurrManager | TotalWorkingYears | OverTime | DistanceFromHome | Age | JR_SalesRepresentative | MS_Single | StockOptionLevel | JR_HealthcareRepresentive | JR_ManufacturingDirector | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5993 | 19479 | 1102 | 4 | 6 | 5 | 8 | 1 | 1 | 41 | 0 | 1 | 0 | 0 | 0 |
1 | 5130 | 24907 | 279 | 7 | 10 | 7 | 10 | 0 | 8 | 49 | 0 | 0 | 1 | 0 | 0 |
2 | 2090 | 2396 | 1373 | 0 | 0 | 0 | 7 | 1 | 2 | 37 | 0 | 1 | 0 | 0 | 0 |
3 | 2909 | 23159 | 1392 | 7 | 8 | 0 | 8 | 1 | 3 | 33 | 0 | 0 | 0 | 0 | 0 |
4 | 3468 | 16632 | 591 | 2 | 2 | 2 | 6 | 0 | 2 | 27 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1465 | 2571 | 12290 | 884 | 2 | 5 | 3 | 17 | 0 | 23 | 36 | 0 | 0 | 1 | 0 | 0 |
1466 | 9991 | 21457 | 613 | 7 | 7 | 7 | 9 | 0 | 6 | 39 | 0 | 0 | 1 | 1 | 0 |
1467 | 6142 | 5174 | 155 | 2 | 6 | 3 | 6 | 1 | 4 | 27 | 0 | 0 | 1 | 0 | 1 |
1468 | 5390 | 13243 | 1023 | 6 | 9 | 8 | 17 | 0 | 2 | 49 | 0 | 0 | 0 | 0 | 0 |
1469 | 4404 | 10228 | 628 | 3 | 4 | 2 | 6 | 0 | 8 | 34 | 0 | 0 | 0 | 0 | 0 |
1263 rows × 15 columns
# The SVM feature matrix is the table of the 15 selected features.
x_svm = ibm_svm_features_df
# Standardise every feature: the columns live on very different scales
# (e.g. MonthlyIncome vs. 0/1 indicator columns) and SVM kernels are
# sensitive to feature scale.
from sklearn.preprocessing import StandardScaler
s_scaler = StandardScaler()
x_scaled_svm = s_scaler.fit_transform(x_svm)
# Attrition has far fewer positive than negative samples, so under-sample with
# Tomek links; 'not minority' resamples every class except the minority one.
from imblearn.under_sampling import TomekLinks
tl_svm = TomekLinks(sampling_strategy='not minority')
# Resample the *scaled* features. The scaled matrix used to be computed and
# then ignored, so the neighbour search and both SVMs ran on raw values.
x_tl_svm, y_tl_svm = tl_svm.fit_resample(x_scaled_svm, y_svm)
# NOTE(review): the scaler is fitted and the data resampled before the
# train/test split, so the held-out 20% is not fully independent of training.
# Train the models on 80% of the data and test on the remaining 20%,
# stratified so both parts keep the same class ratio.
x_train_svm, x_test_svm, y_train_svm, y_test_svm = train_test_split(
    x_tl_svm, y_tl_svm, test_size=0.2, random_state=40, stratify=y_tl_svm)
# Model 1: SVM whose hyperparameters were chosen by hand (sigmoid kernel).
model_1_svm = svm.SVC(
    kernel='sigmoid',
    C=2,
    coef0=0.6,
    gamma='scale',
    probability=True,
    random_state=40,
)
# Fit on the training split, then predict the held-out split.
model_1_svm.fit(X=x_train_svm, y=y_train_svm)
y_predict_1_svm = model_1_svm.predict(x_test_svm)
# Model 2 uses GridSearchCV (4-fold cross-validation) to find the best SVM
# hyperparameters. Only some hyperparameters are tuned.
from sklearn.model_selection import GridSearchCV
# Search space:
#   kernel - linear / rbf / sigmoid
#   C      - regularisation parameter, 1.0 to 2.9 in steps of 0.1
#   gamma  - kernel coefficient (rbf and sigmoid here): 'auto' / 'scale'
#   coef0  - independent term of the kernel (sigmoid here): 0.0 to 1.4 in steps of 0.1
# 'degree' is not searched: it only affects the 'poly' kernel, which is not a
# candidate. One dict per kernel keeps parameters that a kernel ignores from
# being expanded into duplicate candidates (660 candidates instead of 5400).
c_values = list(np.arange(1.0, 3.0, 0.1))
gamma_values = ['auto', 'scale']
coef0_values = list(np.arange(0.0, 1.5, 0.1))
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
    {'kernel': ['sigmoid'], 'C': c_values, 'gamma': gamma_values,
     'coef0': coef0_values},
]
# Fixed random state so repeated runs are comparable.
find_best_para_model = svm.SVC(random_state=40)
Grid_search_svm = GridSearchCV(find_best_para_model, param_grid,
                               n_jobs=-1, verbose=2, cv=4)
# This may take some time to run.
Grid_search_svm.fit(x_train_svm, y_train_svm)
Fitting 4 folds for each of 5400 candidates, totalling 21600 fits
# Show the best hyperparameter combination found by the grid search.
# Kept as a bare expression; presumably this relies on notebook cell display
# to show the value.
Grid_search_svm.best_params_
{'C': 2.8000000000000016, 'coef0': 0.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
# Model 2: build the SVM from the hyperparameters selected by the grid search.
# The values are read from best_params_ rather than copied by hand, so this
# model stays in sync whenever the search is re-run. probability=True is
# needed later for predict_proba.
model_2_svm = svm.SVC(**Grid_search_svm.best_params_,
                      probability=True, random_state=40)
model_2_svm.fit(x_train_svm, y_train_svm)
y_predict_2_svm = model_2_svm.predict(x_test_svm)
# Evaluate the accuracy of both classifiers on the held-out test split.
accuracy_model_1 = metrics.accuracy_score(y_test_svm, y_predict_1_svm)
accuracy_model_2 = metrics.accuracy_score(y_test_svm, y_predict_2_svm)

print('Accuracy of prediction classification result for 2 model')
print('Hyperparameters that try to tune manually (model 1): ', accuracy_model_1)
print('Best hyperparameters found using GridSearchCV (model 2): ', accuracy_model_2)
Accuracy of prediction classification result for 2 model
Hyperparameters that try to tune manually (model 1): 0.7416666666666667
Best hyperparameters found using GridSearchCV (model 2): 0.8166666666666667
# Evaluate model 2 with a confusion matrix (rows: true class, columns:
# predicted class, in the order given by `labels`).
from sklearn.metrics import confusion_matrix
# `labels` must be passed by keyword: passing it positionally triggers a
# FutureWarning and is an error from scikit-learn 1.0 onwards.
print(confusion_matrix(y_test_svm, y_predict_2_svm, labels=[0, 1]))
# Evaluate model 2 with precision, recall and F1 per class.
from sklearn.metrics import classification_report
print(classification_report(y_test_svm, y_predict_2_svm))
[[183 12]
[ 32 13]]
precision recall f1-score support
0 0.85 0.94 0.89 195
1 0.52 0.29 0.37 45
accuracy 0.82 240
macro avg 0.69 0.61 0.63 240
weighted avg 0.79 0.82 0.79 240
C:\Users\USER\anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass labels=[0, 1] as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
warnings.warn(f"Pass {args_msg} as keyword args. From version "
from matplotlib import pyplot as plt
# Evaluate model 2 with a ROC curve and its area under the curve (AUC).
# roc_auc_score is imported here because it is called below and no import
# for it is visible in this part of the file.
from sklearn.metrics import roc_auc_score, roc_curve
# Probability of the positive class (column 1) for each test sample.
y_pred_prob_svm = model_2_svm.predict_proba(x_test_svm)[:, 1]
fpr, tpr, threshold = roc_curve(y_test_svm, y_pred_prob_svm)
# Dashed diagonal: reference line with equal true- and false-positive rates.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
auc = roc_auc_score(y_test_svm, y_pred_prob_svm)
plt.title(f'AUC: {auc}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()