basic pipeline functional · frederikhoengaard/lazy-learn@35acb80 · GitHub
[go: up one dir, main page]

Skip to content

Commit 35acb80

Browse files
basic pipeline functional
1 parent cd2c4ff commit 35acb80

File tree

6 files changed

+90
-25
lines changed

6 files changed

+90
-25
lines changed

python/src/lazylearn/lazylearn.py

+21-2
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,10 @@
22
from model_selection.splitters import test_train_splitter
33
from preprocessing.time.date_processor import date_processor
44
from preprocessing.time.duration import duration_builder
5-
from regression.models.randomforest.randomforest import RandomForestRegressionRunner
5+
from regression.models.randomforest.randomforest import ( # noqa
6+
RandomForestRegressionRunner,
7+
)
8+
from sklearn.metrics import mean_absolute_error
69

710

811
class LazyLearner:
@@ -40,9 +43,25 @@ def create_project(self, data, target, task="infer"):
4043
# set modelling configurations
4144

4245
def run_autopilot(self):
46+
"""
47+
TODO: Everything here must be abstracted away into strategies
48+
TODO: such that several models are run and their scores are added to
49+
TODO: the leaderboard
50+
51+
:return:
52+
"""
53+
4354
simple_random_forest = RandomForestRegressionRunner(
44-
target=self.target, dataset=self.dataset
55+
target=self.target,
56+
dataset=self.dataset,
57+
random_state=self.random_state, # noqa
4558
)
4659
simple_random_forest.fit()
4760

61+
# get holdout scores
62+
simple_random_forest.predict(self.dataset.partitions["test"])
63+
simple_random_forest.pipeline.holdout_score = mean_absolute_error(
64+
self.dataset.partitions["test"][self.target],
65+
simple_random_forest.pipeline.tmp_pred,
66+
)
4867
return simple_random_forest

python/src/lazylearn/pipeline/pipeline.py

+11
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,9 @@ def apply(self, pipeline: Pipeline):
2424
def fit(self, pipeline: Pipeline):
2525
pass
2626

27+
def predict(self, pipeline: Pipeline):
28+
pass
29+
2730

2831
class IngestionPipeline(Pipeline):
2932
def __init__(self):
@@ -49,11 +52,19 @@ def __init__(self):
4952
super().__init__()
5053
self._is_fitted = False
5154
self.feature_list: list = []
55+
self.tmp_test = None
56+
self.tmp_pred = None
57+
self.target = None
5258

5359
def fit(self):
5460
[step.fit(self) for step in self._steps]
5561
self._is_fitted = True
5662

63+
def predict(self):
64+
assert self._is_fitted
65+
[step.predict(self) for step in self._steps]
66+
return self.tmp_pred
67+
5768

5869
class RegressionPipeline(ModelPipeline):
5970
def __init__(self):

python/src/lazylearn/preprocessing/encoding/encoders.py

+29-7
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
from models.models import Dataset
1+
from pandas import DataFrame
22
from pipeline.pipeline import ModelPipeline
33

44

@@ -21,15 +21,25 @@ def __init__(
2121

2222
def fit(self, pipeline: ModelPipeline):
2323
for var in self.cat_vars:
24-
pipeline.train_features_df = self.convert(pipeline.train_features_df, var)
24+
pipeline.train_features_df = self.convert(
25+
pipeline.train_features_df, var
26+
) # noqa
2527
pipeline.feature_list.append(var)
2628

27-
def convert(self, df, col_name):
29+
def convert(self, df: DataFrame, col_name: str) -> DataFrame:
2830
"""
31+
Encodes a categorical column ordinally.
32+
Currently only the "freq" method is supported,
33+
and it encodes a value with an integer id by
34+
increasing frequency i.e. more frequent values
35+
receive a higher encoding
2936
30-
:param df:
31-
:param col_name:
32-
:return:
37+
Note that this should only be done on the training
38+
data!
39+
40+
:param df: pandas DataFrame of features
41+
:param col_name: column to consider
42+
:return: transformed DataFrame
3343
"""
3444
if self.method == "freq":
3545
self.cat_freqs[col_name] = {}
@@ -43,7 +53,7 @@ def convert(self, df, col_name):
4353
[(key, val) for key, val in self.cat_freqs[col_name].items()],
4454
key=lambda x: x[1],
4555
)
46-
print(freq_pairs)
56+
4757
self.cat_maps[col_name] = {key: val for key, val in freq_pairs}
4858

4959
df[col_name] = df[col_name].apply(
@@ -54,3 +64,15 @@ def convert(self, df, col_name):
5464
return df
5565
else:
5666
raise ValueError("Unsupported encoding method, try [freq]")
67+
68+
def predict(self, pipeline: ModelPipeline):
69+
df = pipeline.tmp_test
70+
71+
for var in self.cat_vars:
72+
df[var] = df[var].apply(
73+
lambda x: self.cat_maps[var][x]
74+
if x in self.cat_maps[var]
75+
else -2 # noqa
76+
)
77+
78+
pipeline.tmp_test = df
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,23 @@
11
from pipeline.pipeline import PipelineStep, RegressionPipeline
22
from sklearn.ensemble import RandomForestRegressor
3-
from sklearn.metrics import mean_absolute_error
43

54

65
class RandomForestRegressorStep(PipelineStep):
7-
def __init__(self):
8-
self.regressor = RandomForestRegressor()
6+
def __init__(self, random_state=None):
7+
self.regressor = RandomForestRegressor(random_state=random_state)
98

109
def fit(self, pipeline: RegressionPipeline):
11-
self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets)
12-
13-
# y_hat = self.regressor.predict(X=pipeline.holdout_features_df)
14-
# pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat)
10+
pipeline.feature_list = [
11+
item for item in pipeline.feature_list if item != pipeline.target
12+
]
13+
print("Fitting RandomForestRegressor")
14+
self.regressor.fit(
15+
X=pipeline.train_features_df[pipeline.feature_list],
16+
y=pipeline.train_targets,
17+
) # noqa
18+
print("RandomForestRegressor fitted!")
1519

1620
def predict(self, pipeline: RegressionPipeline):
17-
raise NotImplementedError
21+
pipeline.tmp_pred = self.regressor.predict(
22+
X=pipeline.tmp_test[pipeline.feature_list]
23+
)

python/src/lazylearn/regression/models/randomforest/randomforest.py

+15-7
Original file line number | Diff line number | Diff line change
@@ -4,29 +4,37 @@
44
from regression.models.randomforest.random_forest_steps.regressor_step import (
55
RandomForestRegressorStep,
66
)
7-
from sklearn.ensemble import RandomForestRegressor
87

98

109
class RandomForestRegressionRunner:
11-
def __init__(self, target, dataset):
10+
def __init__(self, target, dataset, random_state=42):
1211
self.target = target
1312
self.dataset: Dataset = dataset
13+
self.random_state = random_state
1414
self.pipeline = RegressionPipeline()
15+
self.pipeline.target = target
1516

16-
self.pipeline.train_features_df = self.dataset.partitions["train"].copy()
17+
self.pipeline.train_features_df = self.dataset.partitions[
18+
"train"
19+
].copy() # noqa
1720
self.pipeline.train_targets = self.dataset.partitions["train"][target]
18-
self.pipeline.holdout_features_df = self.dataset.partitions["test"].copy()
21+
self.pipeline.holdout_features_df = self.dataset.partitions[
22+
"test"
23+
].copy() # noqa
1924
self.pipeline.holdout_targets = self.dataset.partitions["test"][target]
2025

2126
def fit(self):
2227
# preprocess numeric vars
2328
cat_vars = self.dataset.type_collections["categorical"]
29+
num_vars = self.dataset.type_collections["numeric"]
30+
self.pipeline.feature_list.extend(num_vars)
2431

2532
self.pipeline.add(OrdinalConverter(cat_vars=cat_vars))
2633

27-
# self.pipeline.add(RandomForestRegressorStep())
34+
self.pipeline.add(RandomForestRegressorStep())
2835

2936
self.pipeline.fit()
3037

31-
def predict(self):
32-
raise NotImplementedError
38+
def predict(self, features):
39+
self.pipeline.tmp_test = features
40+
return self.pipeline.predict()
Original file line number | Diff line number | Diff line change
@@ -1 +0,0 @@
1-
from lazylearn import LazyLearner

0 commit comments

Comments
 (0)
0