ENH: Parallelisation for SMOTEENN and SMOTETomek (#547)

glemaitre · web-flow · commit dff46709df49 · 2019-02-21T12:34:26.000+01:00
diff --git a/doc/whats_new/v0.5.rst b/doc/whats_new/v0.5.rst
@@ -17,6 +17,12 @@ Documentation
   :class:`imblearn.over_sampling.SVMSMOTE` in the API documenation.
   :issue:`530` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+Enhancement
+...........
+
+- Add parallelisation (``n_jobs`` parameter) for SMOTEENN and SMOTETomek.
+  :issue:`547` by :user:`Michael Hsieh <Microsheep>`.
+
 Maintenance
 ...........
 
diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py
@@ -39,6 +39,17 @@ class SMOTEENN(BaseSampler):
         a :class:`imblearn.over_sampling.SMOTE` object with default parameters
         will be given.
 
+    enn : object, optional (default=\
+EditedNearestNeighbours(sampling_strategy='all'))
+        The :class:`imblearn.under_sampling.EditedNearestNeighbours` object
+        to use. If not given, a
+        :class:`imblearn.under_sampling.EditedNearestNeighbours` object with
+        ``sampling_strategy='all'`` will be used.
+
+    n_jobs : int, optional (default=1)
+        The number of threads to open if possible.
+        Not applied to ``smote`` or ``enn`` objects given by the user.
+
     ratio : str, dict, or callable
         .. deprecated:: 0.4
            Use the parameter ``sampling_strategy`` instead. It will be removed
@@ -86,12 +97,14 @@ def __init__(self,
                  random_state=None,
                  smote=None,
                  enn=None,
+                 n_jobs=1,
                  ratio=None):
         super(SMOTEENN, self).__init__()
         self.sampling_strategy = sampling_strategy
         self.random_state = random_state
         self.smote = smote
         self.enn = enn
+        self.n_jobs = n_jobs
         self.ratio = ratio
 
     def _validate_estimator(self):
@@ -107,6 +120,7 @@ def _validate_estimator(self):
             self.smote_ = SMOTE(
                 sampling_strategy=self.sampling_strategy,
                 random_state=self.random_state,
+                n_jobs=self.n_jobs,
                 ratio=self.ratio)
 
         if self.enn is not None:
@@ -117,7 +131,9 @@ def _validate_estimator(self):
                                  ' Got {} instead.'.format(type(self.enn)))
         # Otherwise create a default EditedNearestNeighbours
         else:
-            self.enn_ = EditedNearestNeighbours(sampling_strategy='all')
+            self.enn_ = EditedNearestNeighbours(
+                            sampling_strategy='all',
+                            n_jobs=self.n_jobs)
 
     def _fit_resample(self, X, y):
         self._validate_estimator()
diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py
@@ -41,10 +41,14 @@ class SMOTETomek(BaseSampler):
         a :class:`imblearn.over_sampling.SMOTE` object with default parameters
         will be given.
 
-    tomek : object, optional (default=Tomek())
-        The :class:`imblearn.under_sampling.Tomek` object to use. If not given,
-        a :class:`imblearn.under_sampling.Tomek` object with default parameters
-        will be given.
+    tomek : object, optional (default=TomekLinks(sampling_strategy='all'))
+        The :class:`imblearn.under_sampling.TomekLinks` object to use. If not
+        given, a :class:`imblearn.under_sampling.TomekLinks` object with
+        ``sampling_strategy='all'`` will be used.
+
+    n_jobs : int, optional (default=1)
+        The number of threads to open if possible.
+        Not applied to ``smote`` or ``tomek`` objects given by the user.
 
     ratio : str, dict, or callable
         .. deprecated:: 0.4
@@ -94,12 +98,14 @@ def __init__(self,
                  random_state=None,
                  smote=None,
                  tomek=None,
+                 n_jobs=1,
                  ratio=None):
         super(SMOTETomek, self).__init__()
         self.sampling_strategy = sampling_strategy
         self.random_state = random_state
         self.smote = smote
         self.tomek = tomek
+        self.n_jobs = n_jobs
         self.ratio = ratio
 
     def _validate_estimator(self):
@@ -116,6 +122,7 @@ def _validate_estimator(self):
             self.smote_ = SMOTE(
                 sampling_strategy=self.sampling_strategy,
                 random_state=self.random_state,
+                n_jobs=self.n_jobs,
                 ratio=self.ratio)
 
         if self.tomek is not None:
@@ -126,7 +133,9 @@ def _validate_estimator(self):
                                  'Got {} instead.'.format(type(self.tomek)))
         # Otherwise create a default TomekLinks
         else:
-            self.tomek_ = TomekLinks(sampling_strategy='all')
+            self.tomek_ = TomekLinks(
+                            sampling_strategy='all',
+                            n_jobs=self.n_jobs)
 
     def _fit_resample(self, X, y):
         self._validate_estimator()
diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py
@@ -98,6 +98,22 @@ def test_validate_estimator_default():
     assert_array_equal(y_resampled, y_gt)
 
 
+def test_parallelisation():
+    # Check if default job count is 1
+    smt = SMOTEENN(random_state=RND_SEED)
+    smt._validate_estimator()
+    assert smt.n_jobs == 1
+    assert smt.smote_.n_jobs == 1
+    assert smt.enn_.n_jobs == 1
+
+    # Check if job count is set
+    smt = SMOTEENN(random_state=RND_SEED, n_jobs=8)
+    smt._validate_estimator()
+    assert smt.n_jobs == 8
+    assert smt.smote_.n_jobs == 8
+    assert smt.enn_.n_jobs == 8
+
+
 @pytest.mark.parametrize(
     "smote_params, err_msg",
     [({'smote': 'rnd'}, "smote needs to be a SMOTE"),
diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py
@@ -104,6 +104,22 @@ def test_validate_estimator_default():
     assert_array_equal(y_resampled, y_gt)
 
 
+def test_parallelisation():
+    # Check if default job count is 1
+    smt = SMOTETomek(random_state=RND_SEED)
+    smt._validate_estimator()
+    assert smt.n_jobs == 1
+    assert smt.smote_.n_jobs == 1
+    assert smt.tomek_.n_jobs == 1
+
+    # Check if job count is set
+    smt = SMOTETomek(random_state=RND_SEED, n_jobs=8)
+    smt._validate_estimator()
+    assert smt.n_jobs == 8
+    assert smt.smote_.n_jobs == 8
+    assert smt.tomek_.n_jobs == 8
+
+
 @pytest.mark.parametrize(
     "smote_params, err_msg",
     [({'smote': 'rnd'}, "smote needs to be a SMOTE"),