Drop duplicate features #114 by Tejash-Shah · Pull Request #144 · feature-engine/feature_engine · GitHub
[go: up one dir, main page]

Skip to content

Drop duplicate features #114 #144

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 9, 2020
Merged
6 changes: 2 additions & 4 deletions feature_engine/selection/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

from .drop_features import DropFeatures
from .drop_constant_features import DropConstantFeatures
from .drop_duplicate_features import DropDuplicateFeatures

__all__ = [
'DropFeatures',
'DropConstantFeatures'
]
__all__ = ["DropFeatures", "DropConstantFeatures", "DropDuplicateFeatures"]
132 changes: 132 additions & 0 deletions feature_engine/selection/drop_duplicate_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils.validation import check_is_fitted
from feature_engine.dataframe_checks import (
_is_dataframe,
_check_input_matches_training_df,
)
from feature_engine.variable_manipulation import _find_all_variables, _define_variables


class DropDuplicateFeatures(BaseEstimator, TransformerMixin):
    """
    DropDuplicateFeatures finds and removes duplicated features in a dataframe.

    Duplicated features are identical features, regardless of the variable or column
    name. If they show the same values for every observation, then they are
    considered duplicated.

    The transformer will first identify and store the duplicated variables. Next,
    the transformer will drop these variables from a dataframe.

    Within each group of identical features, the first one encountered (in the order
    of ``variables``) is kept; the remaining ones are marked for removal.

    Parameters
    ----------

    variables: list, default=None
        The list of variables to evaluate. If None, the transformer will evaluate all
        variables in the dataset.

    """

    def __init__(self, variables=None):
        # NOTE(review): _define_variables is defined elsewhere in the package;
        # presumably it normalises the user input to a list or None - confirm.
        self.variables = _define_variables(variables)

    def fit(self, X, y=None):
        """
        Find duplicated features.

        Parameters
        ----------

        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.

        y: None
            y is not needed for this transformer. You can pass y or None.


        Attributes
        ----------

        duplicated_features_: set
            The duplicated features, i.e., the features that will be dropped by
            transform().

        duplicated_feature_sets_: list
            Groups of duplicated features. Or in other words, features that are
            duplicated with each other. Each set in the list represents a group of
            duplicated features, including the one that is retained.

        input_shape_: tuple
            The shape of the dataframe seen during fit.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all variables or check those entered are in the dataframe
        self.variables = _find_all_variables(X, self.variables)

        # groups (sets) of features that are duplicates of each other
        self.duplicated_feature_sets_ = []

        # set to collect features that are duplicated
        self.duplicated_features_ = set()

        # features that were already used as the reference of a comparison
        _examined_features = set()

        for feature in self.variables:

            # register first, so the feature is never compared against itself
            _examined_features.add(feature)

            # a feature already flagged as duplicate belongs to an earlier group
            if feature in self.duplicated_features_:
                continue

            _temp_set = {feature}

            # The exclusion set is invariant while candidates are collected, so it
            # is built once per reference feature instead of once per candidate.
            _excluded = _examined_features.union(self.duplicated_features_)

            # features that have not been examined, are not currently examined and
            # were not found duplicates
            _features_to_compare = [f for f in self.variables if f not in _excluded]

            _reference = X[feature]

            for f2 in _features_to_compare:
                if _reference.equals(X[f2]):
                    self.duplicated_features_.add(f2)
                    _temp_set.add(f2)

            # if there are duplicated features
            if len(_temp_set) > 1:
                self.duplicated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self

    def transform(self, X):
        """
        Drops the duplicated features from a dataframe.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features].
            The input samples.

        Returns
        -------
        X_transformed: pandas dataframe of shape = [n_samples, n_features - (duplicated features)]
            The transformed dataframe with the remaining subset of variables.

        """
        # check if fit is performed prior to transform
        check_is_fitted(self)

        # check if input is a dataframe
        X = _is_dataframe(X)

        # check if number of columns in test dataset matches to train dataset
        _check_input_matches_training_df(X, self.input_shape_[1])

        # return the dataframe without the features flagged as duplicates in fit()
        X = X.drop(columns=self.duplicated_features_)

        return X
191 changes: 150 additions & 41 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,94 +5,203 @@

@pytest.fixture(scope="module")
def dataframe_vartypes():
data = {'Name': ['tom', 'nick', 'krish', 'jack'],
'City': ['London', 'Manchester', 'Liverpool', 'Bristol'],
'Age': [20, 21, 19, 18],
'Marks': [0.9, 0.8, 0.7, 0.6],
'dob': pd.date_range('2020-02-24', periods=4, freq='T')
}
data = {
"Name": ["tom", "nick", "krish", "jack"],
"City": ["London", "Manchester", "Liverpool", "Bristol"],
"Age": [20, 21, 19, 18],
"Marks": [0.9, 0.8, 0.7, 0.6],
"dob": pd.date_range("2020-02-24", periods=4, freq="T"),
}

df = pd.DataFrame(data)
return df


@pytest.fixture(scope="module")
def dataframe_na():
data = {'Name': ['tom', 'nick', 'krish', np.nan, 'peter', np.nan, 'fred', 'sam'],
'City': ['London', 'Manchester', np.nan, np.nan, 'London', 'London', 'Bristol', 'Manchester'],
'Studies': ['Bachelor', 'Bachelor', np.nan, np.nan, 'Bachelor', 'PhD', 'None', 'Masters'],
'Age': [20, 21, 19, np.nan, 23, 40, 41, 37],
'Marks': [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6],
'dob': pd.date_range('2020-02-24', periods=8, freq='T')
}
data = {
"Name": ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"],
"City": [
"London",
"Manchester",
np.nan,
np.nan,
"London",
"London",
"Bristol",
"Manchester",
],
"Studies": [
"Bachelor",
"Bachelor",
np.nan,
np.nan,
"Bachelor",
"PhD",
"None",
"Masters",
],
"Age": [20, 21, 19, np.nan, 23, 40, 41, 37],
"Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6],
"dob": pd.date_range("2020-02-24", periods=8, freq="T"),
}

df = pd.DataFrame(data)
return df


@pytest.fixture(scope="module")
def dataframe_enc():
df = {'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4,
'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4,
'target': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]}
df = {
"var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4,
"var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
"target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
}
df = pd.DataFrame(df)
return df


@pytest.fixture(scope="module")
def dataframe_enc_rare():
df = {'var_A': ['B'] * 9 + ['A'] * 6 + ['C'] * 4 + ['D'] * 1,
'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4,
'target': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]}
df = {
"var_A": ["B"] * 9 + ["A"] * 6 + ["C"] * 4 + ["D"] * 1,
"var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
"target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
}
df = pd.DataFrame(df)
return df


@pytest.fixture(scope="module")
def dataframe_enc_na():
df = {'var_A': ['B'] * 9 + ['A'] * 6 + ['C'] * 4 + ['D'] * 1,
'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4,
'target': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]}
df = {
"var_A": ["B"] * 9 + ["A"] * 6 + ["C"] * 4 + ["D"] * 1,
"var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
"target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
}
df = pd.DataFrame(df)
df.loc[0, 'var_A'] = np.nan
df.loc[0, "var_A"] = np.nan
return df


@pytest.fixture(scope="module")
def dataframe_enc_big():
df = {'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6, }
df = {
"var_A": ["A"] * 6
+ ["B"] * 10
+ ["C"] * 4
+ ["D"] * 10
+ ["E"] * 2
+ ["F"] * 2
+ ["G"] * 6,
"var_B": ["A"] * 10
+ ["B"] * 6
+ ["C"] * 4
+ ["D"] * 10
+ ["E"] * 2
+ ["F"] * 2
+ ["G"] * 6,
"var_C": ["A"] * 4
+ ["B"] * 6
+ ["C"] * 10
+ ["D"] * 10
+ ["E"] * 2
+ ["F"] * 2
+ ["G"] * 6,
}
df = pd.DataFrame(df)
return df


@pytest.fixture(scope="module")
def dataframe_enc_big_na():
df = {'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6, }
df = {
"var_A": ["A"] * 6
+ ["B"] * 10
+ ["C"] * 4
+ ["D"] * 10
+ ["E"] * 2
+ ["F"] * 2
+ ["G"] * 6,
"var_B": ["A"] * 10
+ ["B"] * 6
+ ["C"] * 4
+ ["D"] * 10
+ ["E"] * 2
+ ["F"] * 2
+ ["G"] * 6,
"var_C": ["A"] * 4
+ ["B"] * 6
+ ["C"] * 10
+ ["D"] * 10
+ ["E"] * 2
+ ["F"] * 2
+ ["G"] * 6,
}
df = pd.DataFrame(df)
df.loc[0, 'var_A'] = np.nan
df.loc[0, "var_A"] = np.nan
return df


@pytest.fixture(scope="module")
def dataframe_normal_dist():
np.random.seed(0)
mu, sigma = 0, 0.1 # mean and standard deviation
s = np.random.normal(mu, sigma, 100)
df = pd.DataFrame(s)
df.columns = ['var']
df.columns = ["var"]
return df

@pytest.fixture(scope='module')

@pytest.fixture(scope="module")
def dataframe_constant_features():
data = {'Name': ['tom', 'nick', 'krish', 'jack'],
'City': ['London', 'Manchester', 'Liverpool', 'Bristol'],
'Age': [20, 21, 19, 18],
'Marks': [0.9, 0.8, 0.7, 0.6],
'dob': pd.date_range('2020-02-24', periods=4, freq='T'),
'const_feat_num': [1, 1, 1, 1],
'const_feat_cat': ['a', 'a', 'a', 'a'],
'quasi_feat_num': [1, 1, 1, 2],
'quasi_feat_cat': ['a', 'a', 'a', 'b']}
data = {
"Name": ["tom", "nick", "krish", "jack"],
"City": ["London", "Manchester", "Liverpool", "Bristol"],
"Age": [20, 21, 19, 18],
"Marks": [0.9, 0.8, 0.7, 0.6],
"dob": pd.date_range("2020-02-24", periods=4, freq="T"),
"const_feat_num": [1, 1, 1, 1],
"const_feat_cat": ["a", "a", "a", "a"],
"quasi_feat_num": [1, 1, 1, 2],
"quasi_feat_cat": ["a", "a", "a", "b"],
}

df = pd.DataFrame(data)
return df


@pytest.fixture(scope="module")
def dataframe_duplicate_features():
    """Return a 4-row dataframe that contains groups of identical columns.

    The identical column groups are ``dob2``/``dob``/``dob3``, ``City``/``City2``
    and ``Age``/``Age2``. ``Name`` and ``Marks`` have no duplicate.
    """
    # "min" is used instead of the alias "T", which newer pandas releases
    # deprecate; both denote a one-minute frequency.
    data = {
        "Name": ["tom", "nick", "krish", "jack"],
        "dob2": pd.date_range("2020-02-24", periods=4, freq="min"),
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="min"),
        "City2": ["London", "Manchester", "Liverpool", "Bristol"],
        "dob3": pd.date_range("2020-02-24", periods=4, freq="min"),
        "Age2": [20, 21, 19, 18],
    }

    df = pd.DataFrame(data)
    return df


@pytest.fixture(scope="module")
def dataframe_duplicate_features_with_na():
    """Return a 5-row dataframe with identical columns that include missing values.

    The identical column groups are ``dob2``/``dob``/``dob3``, ``City``/``City2``
    and ``Age``/``Age2``. ``Name``, ``City`` and ``City2`` hold NaN in the last
    row; ``Age`` and ``Age2`` hold NaN in the third row.
    """
    # "min" is used instead of the alias "T", which newer pandas releases
    # deprecate; both denote a one-minute frequency.
    data = {
        "Name": ["tom", "nick", "krish", "jack", np.nan],
        "dob2": pd.date_range("2020-02-24", periods=5, freq="min"),
        "City": ["London", "Manchester", "Liverpool", "Bristol", np.nan],
        "Age": [20, 21, np.nan, 18, 34],
        "Marks": [0.9, 0.8, 0.7, 0.6, 0.5],
        "dob": pd.date_range("2020-02-24", periods=5, freq="min"),
        "City2": ["London", "Manchester", "Liverpool", "Bristol", np.nan],
        "dob3": pd.date_range("2020-02-24", periods=5, freq="min"),
        "Age2": [20, 21, np.nan, 18, 34],
    }

    df = pd.DataFrame(data)
    return df
Loading
0