import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder


def tts(dataset: pd.DataFrame,
        label_col: str,
        test_size: float,
        stratify: bool,
        random_state: int) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    # Separate the feature matrix from the label column, then split both.
    features = dataset.drop(columns=[label_col])
    labels = dataset[label_col]
    if stratify:
        # Preserve the class distribution of the labels in both splits.
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels,
            test_size=test_size,
            stratify=labels,
            random_state=random_state)
    else:
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels,
            test_size=test_size,
            random_state=random_state)
    return train_features, test_features, train_labels, test_labels
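
# Example usage sketch for tts(). The file name and column names below are
# illustrative assumptions, not part of this module:
#
#   df = pd.read_csv("customers.csv")          # hypothetical dataset
#   X_train, X_test, y_train, y_test = tts(
#       df, label_col="churned", test_size=0.2,
#       stratify=True, random_state=0)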


class PreprocessDataset:
    def __init__(self,
                 train_features: pd.DataFrame,
                 test_features: pd.DataFrame,
                 one_hot_encode_cols: list[str],
                 min_max_scale_cols: list[str],
                 n_components: int,
                 feature_engineering_functions: dict):
        self.train_features = train_features
        self.test_features = test_features
        self.one_hot_encode_cols = one_hot_encode_cols
        self.min_max_scale_cols = min_max_scale_cols
        self.n_components = n_components
        self.feature_engineering_functions = feature_engineering_functions
        # Keep an untouched copy of the raw training features so the *_test
        # methods can fit their transformers on training data even after
        # self.train_features has been modified in place.
        self._raw_train_features = train_features.copy()

    def one_hot_encode_columns_train(self) -> pd.DataFrame:
        # One-hot encode the categorical columns of the training features and
        # replace the original columns with the encoded ones.
        encoder = OneHotEncoder()
        encoded_data = encoder.fit_transform(
            self.train_features[self.one_hot_encode_cols])
        encoded_df = pd.DataFrame(
            encoded_data.toarray(),
            columns=encoder.get_feature_names_out(self.one_hot_encode_cols),
            index=self.train_features.index)  # align with the original row index
        self.train_features = pd.concat(
            [self.train_features.drop(columns=self.one_hot_encode_cols), encoded_df],
            axis=1)
        return self.train_features

    def one_hot_encode_columns_test(self) -> pd.DataFrame:
        # Fit the encoder on the raw training features and only transform the
        # test features, so the encoded columns match the training set; unseen
        # categories become all-zero rows via handle_unknown="ignore".
        encoder = OneHotEncoder(handle_unknown="ignore")
        encoder.fit(self._raw_train_features[self.one_hot_encode_cols])
        encoded_data = encoder.transform(self.test_features[self.one_hot_encode_cols])
        encoded_df = pd.DataFrame(
            encoded_data.toarray(),
            columns=encoder.get_feature_names_out(self.one_hot_encode_cols),
            index=self.test_features.index)
        self.test_features = pd.concat(
            [self.test_features.drop(columns=self.one_hot_encode_cols), encoded_df],
            axis=1)
        return self.test_features

    def min_max_scaled_columns_train(self) -> pd.DataFrame:
        # Scale the selected numeric training columns to the [0, 1] range.
        scaler = MinMaxScaler()
        self.train_features[self.min_max_scale_cols] = scaler.fit_transform(
            self.train_features[self.min_max_scale_cols])
        return self.train_features

    def min_max_scaled_columns_test(self) -> pd.DataFrame:
        # Fit the scaler on the raw training features and only transform the
        # test features, so the test split is scaled with the training range.
        scaler = MinMaxScaler()
        scaler.fit(self._raw_train_features[self.min_max_scale_cols])
        self.test_features[self.min_max_scale_cols] = scaler.transform(
            self.test_features[self.min_max_scale_cols])
        return self.test_features

    def pca_train(self) -> pd.DataFrame:
        # Project the training features onto the first n_components principal
        # components and append them as new PCA_i columns.
        pca = PCA(n_components=self.n_components)
        pca_data = pca.fit_transform(self.train_features)
        pca_df = pd.DataFrame(
            pca_data,
            columns=[f"PCA_{i + 1}" for i in range(self.n_components)],
            index=self.train_features.index)
        self.train_features = pd.concat([self.train_features, pca_df], axis=1)
        return self.train_features

    def pca_test(self) -> pd.DataFrame:
        # Mirror pca_train on the test features: fit PCA on the test split and
        # append the component scores as new PCA_i columns.
        pca = PCA(n_components=self.n_components)
        pca_data = pca.fit_transform(self.test_features)
        pca_df = pd.DataFrame(
            pca_data,
            columns=[f"PCA_{i + 1}" for i in range(self.n_components)],
            index=self.test_features.index)
        self.test_features = pd.concat([self.test_features, pca_df], axis=1)
        return self.test_features

    def feature_engineering_train(self) -> pd.DataFrame:
        # Each entry maps a new column name to a function that computes that
        # column from the feature DataFrame.
        for func_name, func in self.feature_engineering_functions.items():
            self.train_features[func_name] = func(self.train_features)
        return self.train_features

    def feature_engineering_test(self) -> pd.DataFrame:
        for func_name, func in self.feature_engineering_functions.items():
            self.test_features[func_name] = func(self.test_features)
        return self.test_features

    def preprocess_train(self) -> pd.DataFrame:
        self.train_features = self.one_hot_encode_columns_train()
        self.train_features = self.min_max_scaled_columns_train()
        self.train_features = self.pca_train()
        self.train_features = self.feature_engineering_train()
        return self.train_features

    def preprocess_test(self) -> pd.DataFrame:
        self.test_features = self.one_hot_encode_columns_test()
        self.test_features = self.min_max_scaled_columns_test()
        self.test_features = self.pca_test()
        self.test_features = self.feature_engineering_test()
        return self.test_features
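

# Minimal end-to-end sketch of how tts() and PreprocessDataset might be used
# together. The synthetic data, column names, and engineered feature below are
# illustrative assumptions, not part of this module's required interface.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "color": rng.choice(["red", "green", "blue"], size=100),
        "cost": rng.uniform(1.0, 100.0, size=100),
        "height": rng.uniform(10.0, 200.0, size=100),
        "label": rng.integers(0, 2, size=100),
    })
    X_train, X_test, y_train, y_test = tts(
        demo, label_col="label", test_size=0.25, stratify=True, random_state=0)
    preprocessor = PreprocessDataset(
        train_features=X_train,
        test_features=X_test,
        one_hot_encode_cols=["color"],
        min_max_scale_cols=["cost", "height"],
        n_components=2,
        feature_engineering_functions={
            # Hypothetical engineered feature: sum of the two scaled columns.
            "cost_plus_height": lambda df: df["cost"] + df["height"],
        },
    )
    print(preprocessor.preprocess_train().head())
    print(preprocessor.preprocess_test().head())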