feature-engine · solegalli · Oct 9, 2020 · Oct 3, 2020 · Oct 3, 2020 · Oct 3, 2020
diff --git a/feature_engine/selection/__init__.py b/feature_engine/selection/__init__.py
@@ -4,8 +4,6 @@
 
 from .drop_features import DropFeatures
 from .drop_constant_features import DropConstantFeatures
+from .drop_duplicate_features import DropDuplicateFeatures
 
-__all__ = [
-    'DropFeatures',
-    'DropConstantFeatures'
-]
+__all__ = ["DropFeatures", "DropConstantFeatures", "DropDuplicateFeatures"]
diff --git a/feature_engine/selection/drop_duplicate_features.py b/feature_engine/selection/drop_duplicate_features.py
@@ -0,0 +1,132 @@
+from sklearn.base import TransformerMixin, BaseEstimator
+from sklearn.utils.validation import check_is_fitted
+from feature_engine.dataframe_checks import (
+    _is_dataframe,
+    _check_input_matches_training_df,
+)
+from feature_engine.variable_manipulation import _find_all_variables, _define_variables
+
+
+class DropDuplicateFeatures(BaseEstimator, TransformerMixin):
+    """
+    DropDuplicateFeatures finds and removes duplicated features in a dataframe.
+
+    Duplicated features are identical features, regardless of the variable or column name. If they
+    show the same values for every observation, then they are considered duplicated.
+
+    The transformer will first identify and store the duplicated variables. Next, the transformer
+    will drop these variables from a dataframe. Within a group of identical features, the first
+    one evaluated is kept and only the later copies are stored as duplicated and dropped.
+
+    Parameters
+    ----------
+
+    variables: list, default=None
+        The list of variables to evaluate. If None, the transformer will evaluate all variables in
+        the dataset.
+    """
+
+    def __init__(self, variables=None):
+        self.variables = _define_variables(variables)
+
+    def fit(self, X, y=None):
+        """
+        Find duplicated features.
+
+        Features are compared with pandas `Series.equals`, so NaN in the same rows counts as equal.
+
+        Parameters
+        ----------
+
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The input dataframe.
+
+        y: None
+            y is not needed for this transformer. You can pass y or None.
+
+        Returns
+        -------
+
+        self
+
+        Attributes
+        ----------
+
+        duplicated_features_: set
+            The duplicated features. These are the features dropped by transform().
+
+        duplicated_feature_sets_: list
+            Groups of duplicated features. Or in other words, features that are duplicated with
+            each other. Each set represents a group and includes the feature that is kept.
+        """
+        # check input dataframe
+        X = _is_dataframe(X)
+
+        # find all variables or check those entered are in the dataframe
+        self.variables = _find_all_variables(X, self.variables)
+
+        # list with the groups (sets) of features that are identical to each other
+        self.duplicated_feature_sets_ = []
+
+        # set to collect features that are duplicated
+        self.duplicated_features_ = set()
+
+        # features that were already used as the reference of a comparison round
+        _examined_features = set()
+
+        for feature in self.variables:
+            _examined_features.add(feature)
+
+            # a feature flagged as duplicate already belongs to a stored group
+            if feature in self.duplicated_features_:
+                continue
+
+            _temp_set = {feature}
+
+            # compare only with features that were neither examined nor flagged so far; the
+            # exclusion set is built once here instead of once per candidate feature
+            _excluded = _examined_features | self.duplicated_features_
+            _features_to_compare = [f for f in self.variables if f not in _excluded]
+
+            for f2 in _features_to_compare:
+                if X[feature].equals(X[f2]):
+                    self.duplicated_features_.add(f2)
+                    _temp_set.add(f2)
+
+            # if there are duplicated features
+            if len(_temp_set) > 1:
+                self.duplicated_feature_sets_.append(_temp_set)
+
+        # the number of columns is compared with the input of transform()
+        self.input_shape_ = X.shape
+
+        return self
+
+    def transform(self, X):
+        """
+        Drops the duplicated features from a dataframe.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features].
+            The input samples.
+
+        Returns
+        -------
+        X_transformed: pandas dataframe of shape = [n_samples, n_features - (duplicated features)]
+            The transformed dataframe with the remaining subset of variables.
+        """
+        # check if fit is performed prior to transform
+        check_is_fitted(self)
+
+        # check if input is a dataframe
+        X = _is_dataframe(X)
+
+        # check if number of columns in test dataset matches to train dataset
+        # (input_shape_ is stored in fit)
+        _check_input_matches_training_df(X, self.input_shape_[1])
+
+        # drop the features flagged as duplicated in fit; drop() returns a new dataframe
+        X = X.drop(columns=self.duplicated_features_)
+
+        return X
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,94 +5,203 @@
 
 @pytest.fixture(scope="module")
 def dataframe_vartypes():
-    data = {'Name': ['tom', 'nick', 'krish', 'jack'],
-            'City': ['London', 'Manchester', 'Liverpool', 'Bristol'],
-            'Age': [20, 21, 19, 18],
-            'Marks': [0.9, 0.8, 0.7, 0.6],
-            'dob': pd.date_range('2020-02-24', periods=4, freq='T')
-            }
+    data = {
+        "Name": ["tom", "nick", "krish", "jack"],
+        "City": ["London", "Manchester", "Liverpool", "Bristol"],
+        "Age": [20, 21, 19, 18],
+        "Marks": [0.9, 0.8, 0.7, 0.6],
+        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
+    }
 
     df = pd.DataFrame(data)
     return df
 
 
 @pytest.fixture(scope="module")
 def dataframe_na():
-    data = {'Name': ['tom', 'nick', 'krish', np.nan, 'peter', np.nan, 'fred', 'sam'],
-            'City': ['London', 'Manchester', np.nan, np.nan, 'London', 'London', 'Bristol', 'Manchester'],
-            'Studies': ['Bachelor', 'Bachelor', np.nan, np.nan, 'Bachelor', 'PhD', 'None', 'Masters'],
-            'Age': [20, 21, 19, np.nan, 23, 40, 41, 37],
-            'Marks': [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6],
-            'dob': pd.date_range('2020-02-24', periods=8, freq='T')
-            }
+    data = {
+        "Name": ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"],
+        "City": [
+            "London",
+            "Manchester",
+            np.nan,
+            np.nan,
+            "London",
+            "London",
+            "Bristol",
+            "Manchester",
+        ],
+        "Studies": [
+            "Bachelor",
+            "Bachelor",
+            np.nan,
+            np.nan,
+            "Bachelor",
+            "PhD",
+            "None",
+            "Masters",
+        ],
+        "Age": [20, 21, 19, np.nan, 23, 40, 41, 37],
+        "Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6],
+        "dob": pd.date_range("2020-02-24", periods=8, freq="T"),
+    }
 
     df = pd.DataFrame(data)
     return df
 
 
 @pytest.fixture(scope="module")
 def dataframe_enc():
-    df = {'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4,
-          'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4,
-          'target': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]}
+    df = {
+        "var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4,
+        "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
+        "target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
+    }
     df = pd.DataFrame(df)
     return df
 
 
 @pytest.fixture(scope="module")
 def dataframe_enc_rare():
-    df = {'var_A': ['B'] * 9 + ['A'] * 6 + ['C'] * 4 + ['D'] * 1,
-          'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4,
-          'target': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]}
+    df = {
+        "var_A": ["B"] * 9 + ["A"] * 6 + ["C"] * 4 + ["D"] * 1,
+        "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
+        "target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
+    }
     df = pd.DataFrame(df)
     return df
 
+
 @pytest.fixture(scope="module")
 def dataframe_enc_na():
-    df = {'var_A': ['B'] * 9 + ['A'] * 6 + ['C'] * 4 + ['D'] * 1,
-          'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4,
-          'target': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]}
+    df = {
+        "var_A": ["B"] * 9 + ["A"] * 6 + ["C"] * 4 + ["D"] * 1,
+        "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
+        "target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
+    }
     df = pd.DataFrame(df)
-    df.loc[0, 'var_A'] = np.nan
+    df.loc[0, "var_A"] = np.nan
     return df
 
+
 @pytest.fixture(scope="module")
 def dataframe_enc_big():
-    df = {'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
-          'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
-          'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6, }
+    df = {
+        "var_A": ["A"] * 6
+        + ["B"] * 10
+        + ["C"] * 4
+        + ["D"] * 10
+        + ["E"] * 2
+        + ["F"] * 2
+        + ["G"] * 6,
+        "var_B": ["A"] * 10
+        + ["B"] * 6
+        + ["C"] * 4
+        + ["D"] * 10
+        + ["E"] * 2
+        + ["F"] * 2
+        + ["G"] * 6,
+        "var_C": ["A"] * 4
+        + ["B"] * 6
+        + ["C"] * 10
+        + ["D"] * 10
+        + ["E"] * 2
+        + ["F"] * 2
+        + ["G"] * 6,
+    }
     df = pd.DataFrame(df)
     return df
 
+
 @pytest.fixture(scope="module")
 def dataframe_enc_big_na():
-    df = {'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
-          'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
-          'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6, }
+    df = {
+        "var_A": ["A"] * 6
+        + ["B"] * 10
+        + ["C"] * 4
+        + ["D"] * 10
+        + ["E"] * 2
+        + ["F"] * 2
+        + ["G"] * 6,
+        "var_B": ["A"] * 10
+        + ["B"] * 6
+        + ["C"] * 4
+        + ["D"] * 10
+        + ["E"] * 2
+        + ["F"] * 2
+        + ["G"] * 6,
+        "var_C": ["A"] * 4
+        + ["B"] * 6
+        + ["C"] * 10
+        + ["D"] * 10
+        + ["E"] * 2
+        + ["F"] * 2
+        + ["G"] * 6,
+    }
     df = pd.DataFrame(df)
-    df.loc[0, 'var_A'] = np.nan
+    df.loc[0, "var_A"] = np.nan
     return df
 
+
 @pytest.fixture(scope="module")
 def dataframe_normal_dist():
     np.random.seed(0)
     mu, sigma = 0, 0.1  # mean and standard deviation
     s = np.random.normal(mu, sigma, 100)
     df = pd.DataFrame(s)
-    df.columns = ['var']
+    df.columns = ["var"]
     return df
 
-@pytest.fixture(scope='module')
+
+@pytest.fixture(scope="module")
 def dataframe_constant_features():
-    data = {'Name': ['tom', 'nick', 'krish', 'jack'],
-            'City': ['London', 'Manchester', 'Liverpool', 'Bristol'],
-            'Age': [20, 21, 19, 18],
-            'Marks': [0.9, 0.8, 0.7, 0.6],
-            'dob': pd.date_range('2020-02-24', periods=4, freq='T'),
-            'const_feat_num': [1, 1, 1, 1],
-            'const_feat_cat': ['a', 'a', 'a', 'a'],
-            'quasi_feat_num': [1, 1, 1, 2],
-            'quasi_feat_cat': ['a', 'a', 'a', 'b']}
+    data = {
+        "Name": ["tom", "nick", "krish", "jack"],
+        "City": ["London", "Manchester", "Liverpool", "Bristol"],
+        "Age": [20, 21, 19, 18],
+        "Marks": [0.9, 0.8, 0.7, 0.6],
+        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
+        "const_feat_num": [1, 1, 1, 1],
+        "const_feat_cat": ["a", "a", "a", "a"],
+        "quasi_feat_num": [1, 1, 1, 2],
+        "quasi_feat_cat": ["a", "a", "a", "b"],
+    }
+
+    df = pd.DataFrame(data)
+    return df
+
+
+@pytest.fixture(scope="module")
+def dataframe_duplicate_features():
+    """Frame where dob2/dob3, City2 and Age2 exactly repeat dob, City and Age."""
+    return pd.DataFrame(
+        {
+            "Name": ["tom", "nick", "krish", "jack"],
+            "dob2": pd.date_range("2020-02-24", periods=4, freq="T"),
+            "City": ["London", "Manchester", "Liverpool", "Bristol"],
+            "Age": [20, 21, 19, 18],
+            "Marks": [0.9, 0.8, 0.7, 0.6],
+            "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
+            "City2": ["London", "Manchester", "Liverpool", "Bristol"],
+            "dob3": pd.date_range("2020-02-24", periods=4, freq="T"),
+            "Age2": [20, 21, 19, 18],
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def dataframe_duplicate_features_with_na():
+    data = {
+        "Name": ["tom", "nick", "krish", "jack", np.nan],
+        "dob2": pd.date_range("2020-02-24", periods=5, freq="T"),
+        "City": ["London", "Manchester", "Liverpool", "Bristol", np.nan],
+        "Age": [20, 21, np.nan, 18, 34],
+        "Marks": [0.9, 0.8, 0.7, 0.6, 0.5],
+        "dob": pd.date_range("2020-02-24", periods=5, freq="T"),
+        "City2": ["London", "Manchester", "Liverpool", "Bristol", np.nan],
+        "dob3": pd.date_range("2020-02-24", periods=5, freq="T"),
+        "Age2": [20, 21, np.nan, 18, 34],
+    }
 
     df = pd.DataFrame(data)
     return df