crude categorical ordinal encoding by frequency · frederikhoengaard/lazy-learn@744f495 · GitHub
[go: up one dir, main page]

Skip to content

Commit 744f495

Browse files
crude categorical ordinal encoding by frequency
1 parent f3fbd0d commit 744f495

File tree

8 files changed

+111
-17
lines changed

8 files changed

+111
-17
lines changed

python/src/lazylearn/lazylearn.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from model_selection.splitters import test_train_splitter
33
from preprocessing.time.date_processor import date_processor
44
from preprocessing.time.duration import duration_builder
5+
from regression.models.randomforest.randomforest import RandomForestRegressionRunner
56

67

78
class LazyLearner:
@@ -11,9 +12,11 @@ def __init__(self, random_state=None):
1112
self.models = None
1213
self.leaderboard = None
1314
self.random_state = random_state
15+
self.target = None
1416

1517
def create_project(self, data, target, task="infer"):
1618
# ingest data
19+
self.target = target
1720
self.dataset = Ingestion().run(data)
1821

1922
if task == "infer":
@@ -30,15 +33,16 @@ def create_project(self, data, target, task="infer"):
3033

3134
# split partitions
3235

33-
self.dataset = test_train_splitter(self.dataset, random_state=self.random_state)
36+
self.dataset = test_train_splitter(
37+
self.dataset, random_state=self.random_state
38+
) # noqa
3439

3540
# set modelling configurations
3641

3742
def run_autopilot(self):
38-
raise NotImplementedError
43+
simple_random_forest = RandomForestRegressionRunner(
44+
target=self.target, dataset=self.dataset
45+
)
46+
simple_random_forest.fit()
3947

40-
# preprocess
41-
42-
# train
43-
44-
# eval
48+
return simple_random_forest

python/src/lazylearn/pipeline/pipeline.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import List
22

33
from models.models import Dataset
4-
from pandas import DataFrame
4+
from pandas import DataFrame, Series
55

66

77
class Pipeline:
@@ -21,6 +21,9 @@ class PipelineStep:
2121
def apply(self, pipeline: Pipeline):
2222
pass
2323

24+
def fit(self, pipeline: Pipeline):
25+
pass
26+
2427

2528
class IngestionPipeline(Pipeline):
2629
def __init__(self):
@@ -44,8 +47,19 @@ def response(self):
4447
class ModelPipeline(Pipeline):
4548
def __init__(self):
4649
super().__init__()
50+
self._is_fitted = False
51+
self.feature_list: list = []
52+
53+
def fit(self):
54+
[step.fit(self) for step in self._steps]
55+
self._is_fitted = True
4756

4857

4958
class RegressionPipeline(ModelPipeline):
5059
def __init__(self):
5160
super().__init__()
61+
self.train_features_df: DataFrame = None
62+
self.train_targets: Series = None
63+
self.holdout_features_df: DataFrame = None
64+
self.holdout_targets: Series = None
65+
self.holdout_score: float = None
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,56 @@
1+
from models.models import Dataset
2+
from pipeline.pipeline import ModelPipeline
3+
4+
15
class OrdinalConverter:
26
def __init__(
37
self,
8+
cat_vars: list,
49
max_cardinality: int = None,
510
min_support: int = 5,
611
other_category: bool = True,
712
method: str = "freq",
813
):
14+
self.cat_vars = cat_vars
915
self.card_max = max_cardinality
1016
self.min_support = min_support
1117
self.other_category = other_category
1218
self.method = method
19+
self.cat_freqs = {}
20+
self.cat_maps = {}
21+
22+
def fit(self, pipeline: ModelPipeline):
23+
for var in self.cat_vars:
24+
pipeline.train_features_df = self.convert(pipeline.train_features_df, var)
25+
pipeline.feature_list.append(var)
26+
27+
def convert(self, df, col_name):
28+
"""
29+
30+
:param df:
31+
:param col_name:
32+
:return:
33+
"""
34+
if self.method == "freq":
35+
self.cat_freqs[col_name] = {}
36+
for item in df[col_name].tolist():
37+
if item in self.cat_freqs[col_name]:
38+
self.cat_freqs[col_name][item] += 1
39+
else:
40+
self.cat_freqs[col_name][item] = 1
41+
42+
freq_pairs = sorted(
43+
[(key, val) for key, val in self.cat_freqs[col_name].items()],
44+
key=lambda x: x[1],
45+
)
46+
print(freq_pairs)
47+
self.cat_maps[col_name] = {key: val for key, val in freq_pairs}
1348

14-
def convert(self, df, col):
15-
pass
49+
df[col_name] = df[col_name].apply(
50+
lambda x: self.cat_maps[col_name][x]
51+
if self.cat_maps[col_name][x] >= self.min_support
52+
else -1
53+
)
54+
return df
55+
else:
56+
raise ValueError("Unsupported encoding method, try [freq]")

python/src/lazylearn/preprocessing/imputation/__init__.py

Whitespace-only changes.

python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pipeline.pipeline import PipelineStep, RegressionPipeline
2+
from sklearn.ensemble import RandomForestRegressor
3+
from sklearn.metrics import mean_absolute_error
4+
5+
6+
class RandomForestRegressorStep(PipelineStep):
7+
def __init__(self):
8+
self.regressor = RandomForestRegressor()
9+
10+
def fit(self, pipeline: RegressionPipeline):
11+
self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets)
12+
13+
#y_hat = self.regressor.predict(X=pipeline.holdout_features_df)
14+
#pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat)
15+
16+
def predict(self, pipeline: RegressionPipeline):
17+
raise NotImplementedError
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,32 @@
11
from models.models import Dataset
22
from pipeline.pipeline import RegressionPipeline
3+
from preprocessing.encoding.encoders import OrdinalConverter
4+
from regression.models.randomforest.random_forest_steps.regressor_step import (
5+
RandomForestRegressorStep,
6+
)
7+
from sklearn.ensemble import RandomForestRegressor
38

49

5-
class RandomForestRegressionPipeline(RegressionPipeline):
6-
def __init__(self):
7-
self.target = None
8-
self.dataset: Dataset = None
10+
class RandomForestRegressionRunner:
11+
def __init__(self, target, dataset):
12+
self.target = target
13+
self.dataset: Dataset = dataset
14+
self.pipeline = RegressionPipeline()
915

10-
def run(self):
16+
self.pipeline.train_features_df = self.dataset.partitions["train"].copy()
17+
self.pipeline.train_targets = self.dataset.partitions["train"][target]
18+
self.pipeline.holdout_features_df = self.dataset.partitions["test"].copy()
19+
self.pipeline.holdout_targets = self.dataset.partitions["test"][target]
20+
21+
def fit(self):
1122
# preprocess numeric vars
23+
cat_vars = self.dataset.type_collections["categorical"]
24+
25+
self.pipeline.add(OrdinalConverter(cat_vars=cat_vars))
26+
27+
# self.pipeline.add(RandomForestRegressorStep())
1228

13-
# preprocess categorical vars
29+
self.pipeline.fit()
1430

15-
pass
31+
def predict(self):
32+
raise NotImplementedError
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from lazylearn import LazyLearner

0 commit comments

Comments
 (0)
0