frederikhoengaard
diff --git a/‎python/src/lazylearn/lazylearn.py
+11-7 b/‎python/src/lazylearn/lazylearn.py
+11-7
diff --git a/‎python/src/lazylearn/pipeline/pipeline.py
+15-1 b/‎python/src/lazylearn/pipeline/pipeline.py
+15-1
diff --git a/‎python/src/lazylearn/preprocessing/encoding/encoders.py
+43-2 b/‎python/src/lazylearn/preprocessing/encoding/encoders.py
+43-2
diff --git a/‎python/src/lazylearn/preprocessing/imputation/__init__.py b/‎python/src/lazylearn/preprocessing/imputation/__init__.py
diff --git a/‎python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py b/‎python/src/lazylearn/regression/models/randomforest/random_forest_steps/__init__.py
diff --git a/‎python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
+17 b/‎python/src/lazylearn/regression/models/randomforest/random_forest_steps/regressor_step.py
+17
diff --git a/‎python/src/lazylearn/regression/models/randomforest/randomforest.py
+24-7 b/‎python/src/lazylearn/regression/models/randomforest/randomforest.py
+24-7
diff --git a/‎python/src/lazylearn/strategies/strategy_builder.py
+1 b/‎python/src/lazylearn/strategies/strategy_builder.py
+1
@@ -2,6 +2,7 @@
 from model_selection.splitters import test_train_splitter
 from preprocessing.time.date_processor import date_processor
 from preprocessing.time.duration import duration_builder
+from regression.models.randomforest.randomforest import RandomForestRegressionRunner
 
 
 class LazyLearner:
@@ -11,9 +12,11 @@ def __init__(self, random_state=None):
         self.models = None
         self.leaderboard = None
         self.random_state = random_state
+        self.target = None
 
     def create_project(self, data, target, task="infer"):
         # ingest data
+        self.target = target
         self.dataset = Ingestion().run(data)
 
         if task == "infer":
@@ -30,15 +33,16 @@ def create_project(self, data, target, task="infer"):
 
         # split partitions
 
-        self.dataset = test_train_splitter(self.dataset, random_state=self.random_state)
+        self.dataset = test_train_splitter(
+            self.dataset, random_state=self.random_state
+        )  # noqa
 
         # set modelling configurations
 
     def run_autopilot(self):
-        raise NotImplementedError
+        simple_random_forest = RandomForestRegressionRunner(
+            target=self.target, dataset=self.dataset
+        )
+        simple_random_forest.fit()
 
-        # preprocess
-
-        # train
-
-        # eval
+        return simple_random_forest
@@ -1,7 +1,7 @@
 from typing import List
 
 from models.models import Dataset
-from pandas import DataFrame
+from pandas import DataFrame, Series
 
 
 class Pipeline:
@@ -21,6 +21,9 @@ class PipelineStep:
     def apply(self, pipeline: Pipeline):
         pass
 
+    def fit(self, pipeline: Pipeline):
+        pass
+
 
 class IngestionPipeline(Pipeline):
     def __init__(self):
@@ -44,8 +47,19 @@ def response(self):
 class ModelPipeline(Pipeline):
     def __init__(self):
         super().__init__()
+        self._is_fitted = False
+        self.feature_list: list = []
+
+    def fit(self):
+        [step.fit(self) for step in self._steps]
+        self._is_fitted = True
 
 
 class RegressionPipeline(ModelPipeline):
     def __init__(self):
         super().__init__()
+        self.train_features_df: DataFrame = None
+        self.train_targets: Series = None
+        self.holdout_features_df: DataFrame = None
+        self.holdout_targets: Series = None
+        self.holdout_score: float = None
@@ -1,15 +1,56 @@
+from models.models import Dataset
+from pipeline.pipeline import ModelPipeline
+
+
 class OrdinalConverter:
     def __init__(
         self,
+        cat_vars: list,
         max_cardinality: int = None,
         min_support: int = 5,
         other_category: bool = True,
         method: str = "freq",
     ):
+        self.cat_vars = cat_vars
         self.card_max = max_cardinality
         self.min_support = min_support
         self.other_category = other_category
         self.method = method
+        self.cat_freqs = {}
+        self.cat_maps = {}
+
+    def fit(self, pipeline: ModelPipeline):
+        for var in self.cat_vars:
+            pipeline.train_features_df = self.convert(pipeline.train_features_df, var)
+            pipeline.feature_list.append(var)
+
+    def convert(self, df, col_name):
+        """
+
+        :param df:
+        :param col_name:
+        :return:
+        """
+        if self.method == "freq":
+            self.cat_freqs[col_name] = {}
+            for item in df[col_name].tolist():
+                if item in self.cat_freqs[col_name]:
+                    self.cat_freqs[col_name][item] += 1
+                else:
+                    self.cat_freqs[col_name][item] = 1
+
+            freq_pairs = sorted(
+                [(key, val) for key, val in self.cat_freqs[col_name].items()],
+                key=lambda x: x[1],
+            )
+            print(freq_pairs)
+            self.cat_maps[col_name] = {key: val for key, val in freq_pairs}
 
-    def convert(self, df, col):
-        pass
+            df[col_name] = df[col_name].apply(
+                lambda x: self.cat_maps[col_name][x]
+                if self.cat_maps[col_name][x] >= self.min_support
+                else -1
+            )
+            return df
+        else:
+            raise ValueError("Unsupported encoding method, try [freq]")
@@ -0,0 +1,17 @@
+from pipeline.pipeline import PipelineStep, RegressionPipeline
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error
+
+
+class RandomForestRegressorStep(PipelineStep):
+    def __init__(self):
+        self.regressor = RandomForestRegressor()
+
+    def fit(self, pipeline: RegressionPipeline):
+        self.regressor.fit(X=pipeline.train_features_df, y=pipeline.train_targets)
+
+        #y_hat = self.regressor.predict(X=pipeline.holdout_features_df)
+        #pipeline.holdout_score = mean_absolute_error(pipeline.holdout_targets, y_hat)
+
+    def predict(self, pipeline: RegressionPipeline):
+        raise NotImplementedError
@@ -1,15 +1,32 @@
 from models.models import Dataset
 from pipeline.pipeline import RegressionPipeline
+from preprocessing.encoding.encoders import OrdinalConverter
+from regression.models.randomforest.random_forest_steps.regressor_step import (
+    RandomForestRegressorStep,
+)
+from sklearn.ensemble import RandomForestRegressor
 
 
-class RandomForestRegressionPipeline(RegressionPipeline):
-    def __init__(self):
-        self.target = None
-        self.dataset: Dataset = None
+class RandomForestRegressionRunner:
+    def __init__(self, target, dataset):
+        self.target = target
+        self.dataset: Dataset = dataset
+        self.pipeline = RegressionPipeline()
 
-    def run(self):
+        self.pipeline.train_features_df = self.dataset.partitions["train"].copy()
+        self.pipeline.train_targets = self.dataset.partitions["train"][target]
+        self.pipeline.holdout_features_df = self.dataset.partitions["test"].copy()
+        self.pipeline.holdout_targets = self.dataset.partitions["test"][target]
+
+    def fit(self):
         # preprocess numeric vars
+        cat_vars = self.dataset.type_collections["categorical"]
+
+        self.pipeline.add(OrdinalConverter(cat_vars=cat_vars))
+
+        # self.pipeline.add(RandomForestRegressorStep())
 
-        # preprocess categorical vars
+        self.pipeline.fit()
 
-        pass
+    def predict(self):
+        raise NotImplementedError
@@ -0,0 +1 @@
+from lazylearn import LazyLearner