import collections
import random

import numpy as np


class MLSMOTE:
    """Over-sampling using MLSMOTE.

    Charte, F., Rivera, A. J., del Jesus, M. J., Herrera, F. (2015).
    MLSMOTE: Approaching imbalanced multilabel learning through synthetic
    instance generation.
    Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019.

    """

    def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'):
        self.k_neighbors = k_neighbors
        self.sampling_strategy_ = sampling_strategy
        self.categorical_features = categorical_features
        self.continuous_features_ = None
        self.unique_labels = []
        self.labels = []
        self.features = []

    def fit_resample(self, X, y):
        self.n_features_ = X.shape[1]
        self.labels = np.array([np.array(xi) for xi in y])

        self._validate_estimator()

        X_resampled = X.copy()
        y_resampled = y.copy()

        self.unique_labels = self._collect_unique_labels(y)
        self.features = X

        X_synth = []
        y_synth = []

        append_X_synth = X_synth.append
        append_y_synth = y_synth.append
        mean_ir = self._get_mean_imbalance_ratio()
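        # For every label whose imbalance ratio exceeds the mean (i.e. a
        # minority label), synthesise new samples by interpolating each
        # instance of that label with one of its k nearest neighbours.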
        for label in self.unique_labels:
            irlbl = self._get_imbalance_ratio_per_label(label)
            if irlbl > mean_ir:
                min_bag = self._get_all_instances_of_label(label)
                for sample in min_bag:
                    distances = self._calc_distances(sample, min_bag)
                    distances = np.sort(distances, order='distance')
                    neighbours = distances[:self.k_neighbors]
                    ref_neigh = np.random.choice(neighbours, 1)[0]
                    X_new, y_new = self._create_new_sample(
                        sample, ref_neigh[1], [x[1] for x in neighbours])
                    append_X_synth(X_new)
                    append_y_synth(y_new)

        return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist() + y_synth)

    def _validate_estimator(self):
        categorical_features = np.asarray(self.categorical_features)
        self.categorical_features_ = categorical_features
        # Assumption: the continuous features are all remaining column indices.
        self.continuous_features_ = np.setdiff1d(
            np.arange(self.n_features_), self.categorical_features_)

    def _collect_unique_labels(self, y):
        """A support function that flattens the label sets and returns one set of unique labels."""
        return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])]))

    def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids):
        sample = self.features[sample_id]
        sample_labels = self.labels[sample_id]
        synth_sample = np.copy(sample)
        ref_neigh = self.features[ref_neigh_id]
        neighbours_labels = []
        for ni in neighbour_ids:
            neighbours_labels.append(self.labels[ni].tolist())
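        # Feature generation: continuous features are interpolated between
        # the sample and the reference neighbour; categorical features take
        # the most frequent value among the neighbours.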
        for i in range(synth_sample.shape[0]):
            if i in self.continuous_features_:
                diff = ref_neigh[i] - sample[i]
                offset = diff * random.uniform(0, 1)
                synth_sample[i] = sample[i] + offset
            if i in self.categorical_features_:
                synth_sample[i] = self._get_most_frequent_value(
                    self.features[neighbour_ids, i])

        labels = sample_labels.tolist()
        labels += [a for x in neighbours_labels for a in (
            x if isinstance(x, list) else [x])]
        labels = list(set(labels))
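        # Label generation: 'ranking' keeps the first (k_neighbors + 1) / 2
        # labels of the combined label set, 'union' keeps all of them, and
        # 'intersection' keeps only labels shared by every neighbour.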
        if self.sampling_strategy_ == 'ranking':
            head_index = int((self.k_neighbors + 1) / 2)
            y = labels[:head_index]
        if self.sampling_strategy_ == 'union':
            y = labels[:]
        if self.sampling_strategy_ == 'intersection':
            y = list(set.intersection(*map(set, neighbours_labels)))

        X = synth_sample
        return X, y

    def _calc_distances(self, sample, min_bag):
        distances = []
        append_distances = distances.append
        for bag_sample in min_bag:
            nominal_distances = np.array([self._get_vdm(
                self.features[sample, cat], self.features[bag_sample, cat]) for cat in self.categorical_features_])
            ordinal_distances = np.array([self._get_euclidean_distance(
                self.features[sample, num], self.features[bag_sample, num]) for num in self.continuous_features_])
            dists = np.array(
                [nominal_distances.sum(), ordinal_distances.sum()])
            append_distances((dists.sum(), bag_sample))
        dtype = np.dtype([('distance', float), ('index', int)])
        return np.array(distances, dtype=dtype)

    def _get_euclidean_distance(self, first, second):
        euclidean_distance = np.linalg.norm(first - second)
        return euclidean_distance

    def _get_vdm(self, first, second):
        """A support function to compute the Value Difference Metric (VDM) described in https://arxiv.org/pdf/cs/9701101.pdf"""
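        # For each label c:  f(c) = ((N_axc / N_ax) - (N_ayc / N_ay)) ** 2,
        # where N_ax / N_ay count occurrences of the two attribute values and
        # N_axc / N_ayc count them among instances carrying label c; the VDM
        # is the sum of f(c) over all labels.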
        def f(c):
            N_ax = len(
                np.where(self.features[:, self.categorical_features_] == first)[0])
            N_ay = len(
                np.where(self.features[:, self.categorical_features_] == second)[0])
            c_instances = self._get_all_instances_of_label(c)
            N_axc = len(np.where(self.features[np.ix_(
                c_instances, self.categorical_features_)] == first)[0])
            N_ayc = len(np.where(self.features[np.ix_(
                c_instances, self.categorical_features_)] == second)[0])
            return np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay)))

        return np.sum(np.array([f(c) for c in self.unique_labels]))

    def _get_all_instances_of_label(self, label):
        instance_ids = []
        append_instance_id = instance_ids.append
        for i, label_set in enumerate(self.labels):
            if label in label_set:
                append_instance_id(i)
        return np.array(instance_ids)

    def _get_mean_imbalance_ratio(self):
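        # MeanIR: the average imbalance ratio (IRLbl) over all labels.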
        ratio_sum = np.sum(
            np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))))
        return ratio_sum / self.unique_labels.shape[0]

    def _get_imbalance_ratio_per_label(self, label):
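        # IRLbl: frequency of the most common label divided by the frequency
        # of this label; larger values indicate a rarer label.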
        sum_array = list(map(self._sum_h, self.unique_labels))
        sum_array = np.array(sum_array)
        return sum_array.max() / self._sum_h(label)

    def _sum_h(self, label):
        h_sum = 0

        def h(l, Y):
            if l in Y:
                return 1
            else:
                return 0

        for label_set in self.labels:
            h_sum += h(label, label_set)
        return h_sum

    def _get_label_frequencies(self, labels):
        """A support function to get the frequencies of labels"""
        frequency_map = np.array(np.unique(labels, return_counts=True)).T
        frequencies = np.array([x[1] for x in frequency_map])
        return frequencies

    def _get_most_frequent_value(self, values):
        """A support function to get the most frequent value in a list of values"""
        uniques, indices = np.unique(values, return_inverse=True)
        return uniques[np.argmax(np.bincount(indices))]
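

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: the toy arrays
    # X_toy / y_toy below are illustrative only. Column 2 is treated as
    # categorical and every instance carries exactly two labels so that the
    # resampled label matrix stays rectangular.
    X_toy = np.array([[0.20, 1.0, 0.0],
                      [0.30, 1.1, 0.0],
                      [0.25, 0.9, 1.0],
                      [5.00, 7.0, 1.0],
                      [5.20, 6.8, 0.0],
                      [4.90, 7.1, 1.0]])
    y_toy = np.array([[0, 1], [0, 1], [0, 1], [0, 2], [1, 2], [0, 1]])

    sampler = MLSMOTE(categorical_features=[2], k_neighbors=3,
                      sampling_strategy='ranking')
    X_res, y_res = sampler.fit_resample(X_toy, y_toy)
    print(X_res.shape, y_res.shape)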