from sklearn import datasets
import pandas as pd
# Load the iris dataset and wrap its measurement matrix in a DataFrame with
# short column names (presumably sepal/petal length/width -- the
# abbreviations are not expanded anywhere in this file).
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=["sl", "sw", "pl", "pw"])
def abc(k, *val):
    """Return 0 when ``k`` is below the first extra argument, otherwise 1.

    Only ``val[0]`` is consulted; any additional positional arguments are
    ignored. Calling without at least one extra argument raises
    ``IndexError``.
    """
    threshold = val[0]
    return 0 if k < threshold else 1
# Flag each 'sl' value: 0 when it is below 5, otherwise 1.
df["sl"].apply(abc, args=(5,))
0 1 1 0 2 0 3 0 4 1 5 1 6 0 7 1 8 0 9 0 10 1 11 0 12 0 13 0 14 1 15 1 16 1 17 1 18 1 19 1 20 1 21 1 22 0 23 1 24 0 25 1 26 1 27 1 28 1 29 0 .. 120 1 121 1 122 1 123 1 124 1 125 1 126 1 127 1 128 1 129 1 130 1 131 1 132 1 133 1 134 1 135 1 136 1 137 1 138 1 139 1 140 1 141 1 142 1 143 1 144 1 145 1 146 1 147 1 148 1 149 1 Name: sl, dtype: int64
def label(val, *boundaries):
    """Map ``val`` to one of the labels 'a', 'b', 'c' or 'd'.

    The first three entries of ``boundaries`` are compared in order; the
    label of the first boundary that ``val`` is strictly below is returned.
    If ``val`` is not below any of them, 'd' is returned.

    Boundaries are looked up lazily, so a missing boundary raises
    ``IndexError`` only when the comparison actually reaches it.
    """
    for position, tag in enumerate('abc'):
        if val < boundaries[position]:
            return tag
    return 'd'
def toLabel(df, old_feature_name):
    """Return column ``old_feature_name`` of ``df`` bucketed via ``label``.

    Three cut points are derived from the column itself: the midpoint
    between its minimum and its mean, the mean, and the midpoint between
    its maximum and its mean. Each value is then passed to ``label`` with
    those cut points as the boundaries.
    """
    column = df[old_feature_name]
    centre = column.mean()
    lower_cut = (column.min() + centre) / 2
    upper_cut = (column.max() + centre) / 2
    return column.apply(label, args=(lower_cut, centre, upper_cut))
# Add a '<name>_labeled' column holding the bucket label for every
# measurement column, then show the resulting frame.
for feature in ('sl', 'sw', 'pl', 'pw'):
    df[feature + '_labeled'] = toLabel(df, feature)
df
sl | sw | pl | pw | sl_labeled | sw_labeled | pl_labeled | pw_labeled | |
---|---|---|---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | b | c | a | a |
1 | 4.9 | 3.0 | 1.4 | 0.2 | a | b | a | a |
2 | 4.7 | 3.2 | 1.3 | 0.2 | a | c | a | a |
3 | 4.6 | 3.1 | 1.5 | 0.2 | a | c | a | a |
4 | 5.0 | 3.6 | 1.4 | 0.2 | a | c | a | a |
5 | 5.4 | 3.9 | 1.7 | 0.4 | b | d | a | a |
6 | 4.6 | 3.4 | 1.4 | 0.3 | a | c | a | a |
7 | 5.0 | 3.4 | 1.5 | 0.2 | a | c | a | a |
8 | 4.4 | 2.9 | 1.4 | 0.2 | a | b | a | a |
9 | 4.9 | 3.1 | 1.5 | 0.1 | a | c | a | a |
10 | 5.4 | 3.7 | 1.5 | 0.2 | b | c | a | a |
11 | 4.8 | 3.4 | 1.6 | 0.2 | a | c | a | a |
12 | 4.8 | 3.0 | 1.4 | 0.1 | a | b | a | a |
13 | 4.3 | 3.0 | 1.1 | 0.1 | a | b | a | a |
14 | 5.8 | 4.0 | 1.2 | 0.2 | b | d | a | a |
15 | 5.7 | 4.4 | 1.5 | 0.4 | b | d | a | a |
16 | 5.4 | 3.9 | 1.3 | 0.4 | b | d | a | a |
17 | 5.1 | 3.5 | 1.4 | 0.3 | b | c | a | a |
18 | 5.7 | 3.8 | 1.7 | 0.3 | b | d | a | a |
19 | 5.1 | 3.8 | 1.5 | 0.3 | b | d | a | a |
20 | 5.4 | 3.4 | 1.7 | 0.2 | b | c | a | a |
21 | 5.1 | 3.7 | 1.5 | 0.4 | b | c | a | a |
22 | 4.6 | 3.6 | 1.0 | 0.2 | a | c | a | a |
23 | 5.1 | 3.3 | 1.7 | 0.5 | b | c | a | a |
24 | 4.8 | 3.4 | 1.9 | 0.2 | a | c | a | a |
25 | 5.0 | 3.0 | 1.6 | 0.2 | a | b | a | a |
26 | 5.0 | 3.4 | 1.6 | 0.4 | a | c | a | a |
27 | 5.2 | 3.5 | 1.5 | 0.2 | b | c | a | a |
28 | 5.2 | 3.4 | 1.4 | 0.2 | b | c | a | a |
29 | 4.7 | 3.2 | 1.6 | 0.2 | a | c | a | a |
... | ... | ... | ... | ... | ... | ... | ... | ... |
120 | 6.9 | 3.2 | 5.7 | 2.3 | d | c | d | d |
121 | 5.6 | 2.8 | 4.9 | 2.0 | b | b | c | d |
122 | 7.7 | 2.8 | 6.7 | 2.0 | d | b | d | d |
123 | 6.3 | 2.7 | 4.9 | 1.8 | c | b | c | c |
124 | 6.7 | 3.3 | 5.7 | 2.1 | c | c | d | d |
125 | 7.2 | 3.2 | 6.0 | 1.8 | d | c | d | c |
126 | 6.2 | 2.8 | 4.8 | 1.8 | c | b | c | c |
127 | 6.1 | 3.0 | 4.9 | 1.8 | c | b | c | c |
128 | 6.4 | 2.8 | 5.6 | 2.1 | c | b | d | d |
129 | 7.2 | 3.0 | 5.8 | 1.6 | d | b | d | c |
130 | 7.4 | 2.8 | 6.1 | 1.9 | d | b | d | d |
131 | 7.9 | 3.8 | 6.4 | 2.0 | d | d | d | d |
132 | 6.4 | 2.8 | 5.6 | 2.2 | c | b | d | d |
133 | 6.3 | 2.8 | 5.1 | 1.5 | c | b | c | c |
134 | 6.1 | 2.6 | 5.6 | 1.4 | c | b | d | c |
135 | 7.7 | 3.0 | 6.1 | 2.3 | d | b | d | d |
136 | 6.3 | 3.4 | 5.6 | 2.4 | c | c | d | d |
137 | 6.4 | 3.1 | 5.5 | 1.8 | c | c | d | c |
138 | 6.0 | 3.0 | 4.8 | 1.8 | c | b | c | c |
139 | 6.9 | 3.1 | 5.4 | 2.1 | d | c | d | d |
140 | 6.7 | 3.1 | 5.6 | 2.4 | c | c | d | d |
141 | 6.9 | 3.1 | 5.1 | 2.3 | d | c | c | d |
142 | 5.8 | 2.7 | 5.1 | 1.9 | b | b | c | d |
143 | 6.8 | 3.2 | 5.9 | 2.3 | c | c | d | d |
144 | 6.7 | 3.3 | 5.7 | 2.5 | c | c | d | d |
145 | 6.7 | 3.0 | 5.2 | 2.3 | c | b | c | d |
146 | 6.3 | 2.5 | 5.0 | 1.9 | c | a | c | d |
147 | 6.5 | 3.0 | 5.2 | 2.0 | c | b | c | d |
148 | 6.2 | 3.4 | 5.4 | 2.3 | c | c | d | d |
149 | 5.9 | 3.0 | 5.1 | 1.8 | c | b | c | c |
150 rows × 8 columns
# Discard the raw measurement columns in place, keeping only the labelled
# ones, then list the distinct labels present in 'sl_labeled'.
df.drop(labels=['sl', 'sw', 'pl', 'pw'], axis='columns', inplace=True)
set(df.sl_labeled)
{'a', 'b', 'c', 'd'}
# Attach the iris target array as the "output" column. It becomes the last
# column, which is the one fit() treats as the output.
df["output"] = iris.target
df
sl_labeled | sw_labeled | pl_labeled | pw_labeled | output | |
---|---|---|---|---|---|
0 | b | c | a | a | 0 |
1 | a | b | a | a | 0 |
2 | a | c | a | a | 0 |
3 | a | c | a | a | 0 |
4 | a | c | a | a | 0 |
5 | b | d | a | a | 0 |
6 | a | c | a | a | 0 |
7 | a | c | a | a | 0 |
8 | a | b | a | a | 0 |
9 | a | c | a | a | 0 |
10 | b | c | a | a | 0 |
11 | a | c | a | a | 0 |
12 | a | b | a | a | 0 |
13 | a | b | a | a | 0 |
14 | b | d | a | a | 0 |
15 | b | d | a | a | 0 |
16 | b | d | a | a | 0 |
17 | b | c | a | a | 0 |
18 | b | d | a | a | 0 |
19 | b | d | a | a | 0 |
20 | b | c | a | a | 0 |
21 | b | c | a | a | 0 |
22 | a | c | a | a | 0 |
23 | b | c | a | a | 0 |
24 | a | c | a | a | 0 |
25 | a | b | a | a | 0 |
26 | a | c | a | a | 0 |
27 | b | c | a | a | 0 |
28 | b | c | a | a | 0 |
29 | a | c | a | a | 0 |
... | ... | ... | ... | ... | ... |
120 | d | c | d | d | 2 |
121 | b | b | c | d | 2 |
122 | d | b | d | d | 2 |
123 | c | b | c | c | 2 |
124 | c | c | d | d | 2 |
125 | d | c | d | c | 2 |
126 | c | b | c | c | 2 |
127 | c | b | c | c | 2 |
128 | c | b | d | d | 2 |
129 | d | b | d | c | 2 |
130 | d | b | d | d | 2 |
131 | d | d | d | d | 2 |
132 | c | b | d | d | 2 |
133 | c | b | c | c | 2 |
134 | c | b | d | c | 2 |
135 | d | b | d | d | 2 |
136 | c | c | d | d | 2 |
137 | c | c | d | c | 2 |
138 | c | b | c | c | 2 |
139 | d | c | d | d | 2 |
140 | c | c | d | d | 2 |
141 | d | c | c | d | 2 |
142 | b | b | c | d | 2 |
143 | c | c | d | d | 2 |
144 | c | c | d | d | 2 |
145 | c | b | c | d | 2 |
146 | c | a | c | d | 2 |
147 | c | b | c | d | 2 |
148 | c | c | d | d | 2 |
149 | c | b | c | c | 2 |
150 rows × 5 columns
def fit(data):
    """Build a nested table of value counts per output class.

    The last column of ``data`` is treated as the output; every other
    column is treated as a feature. The result maps each distinct output
    value to a dictionary holding:

    * ``"total_count"`` -- the number of rows with that output value;
    * one entry per feature name, mapping each value the feature takes
      within those rows to the number of rows where it occurs.
    """
    target_col = data.columns[-1]
    feature_cols = data.columns[0:-1]
    table = {}
    for cls in set(data[target_col]):
        # Restrict to the rows belonging to this output class.
        subset = data[data[target_col] == cls]
        entry = {"total_count": len(subset)}
        for col in feature_cols:
            entry[col] = {
                level: len(subset[subset[col] == level])
                for level in set(subset[col])
            }
        table[cls] = entry
    return table
# Compute the nested per-class count dictionary for the labelled frame.
fit(df)
{0: {'pl_labeled': {'a': 50}, 'pw_labeled': {'a': 50}, 'sl_labeled': {'a': 28, 'b': 22}, 'sw_labeled': {'a': 1, 'b': 7, 'c': 32, 'd': 10}, 'total_count': 50}, 1: {'pl_labeled': {'b': 7, 'c': 43}, 'pw_labeled': {'b': 10, 'c': 40}, 'sl_labeled': {'a': 3, 'b': 21, 'c': 24, 'd': 2}, 'sw_labeled': {'a': 13, 'b': 29, 'c': 8}, 'total_count': 50}, 2: {'pl_labeled': {'c': 20, 'd': 30}, 'pw_labeled': {'c': 16, 'd': 34}, 'sl_labeled': {'a': 1, 'b': 5, 'c': 29, 'd': 15}, 'sw_labeled': {'a': 5, 'b': 28, 'c': 15, 'd': 2}, 'total_count': 50}}