Merge pull request #6 from frederikhoengaard/ingestion · frederikhoengaard/lazy-learn@5d80083

Commit 5d80083

Merge pull request #6 from frederikhoengaard/ingestion
Initial commit Ingestion - non-functional
2 parents 6024221 + 9feb2e6 commit 5d80083

27 files changed: +1365 -17 lines

.github/workflows/verify.yaml

+4-4
@@ -4,7 +4,7 @@ on:
   pull_request:
 
 env:
-  PYTHONPATH: ./python/src/main/
+  PYTHONPATH: ./python/src/lazylearn/
 
 jobs:
   testing:
@@ -54,12 +54,12 @@ jobs:
 
       - name: black
         run: |
-          python -m black --check python/src/main/
+          python -m black --check python/src/lazylearn/
 
       - name: isort
         run: |
-          python -m isort python/src/main/ --multi-line 3 --profile black --check
+          python -m isort python/src/lazylearn/ --multi-line 3 --profile black --check
 
       - name: flake8
         run: |
-          python -m flake8 python/src/main/
+          python -m flake8 python/src/lazylearn/
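The PYTHONPATH change above is what lets the new modules use bare, source-root-relative imports. A minimal sketch of the same effect from the repo root, assuming the layout introduced in this commit:

```
import sys

# Same effect as the workflow's PYTHONPATH=./python/src/lazylearn/
sys.path.insert(0, "./python/src/lazylearn/")

# With the source root on sys.path, the top-level packages added in this
# commit (pipeline, ingestion, errors, models) import without a prefix:
from pipeline.pipeline import IngestionPipeline  # noqa: E402

print(IngestionPipeline)
```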

.gitignore

+3
@@ -13,6 +13,9 @@ __pycache__/
 # JetBrains
 .idea
 
+# local
+notebooks/
+
 # Distribution / packaging
 .Python
 build/

Pipfile

+3
@@ -6,6 +6,9 @@ verify_ssl = true
 [packages]
 loguru = "==0.6.*"
 pandas = "==1.5.*"
+scikit-learn = "*"
+tqdm = "*"
+jupyter = "*"
 
 [dev-packages]
 black = "==23.*"

Pipfile.lock

+1,024-3
Some generated files are not rendered by default.

README.md

+22-5
@@ -1,12 +1,29 @@
-# lazy-learn
 
----
+<img width="500" src="doc/logo/transparent_small.png">
 
-## About
-
-lazy-learn is a high-level Python interface for automated machine learning (AutoML). While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system.
+**lazy-learn** is a high-level Python interface for automated machine learning (AutoML). While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system.
 
 The aim of lazy-learn is exactly that. Given a dataset, lazy-learn will analyse types and distributions of attributes, preprocess, feature-engineer and ultimately train models to be used for further evaluation or inference.
 
 ## Usage
 
+Using lazy-learn revolves around the `LazyLearner` class. You can think of it as a kind of project, and it is the wrapper for any experiment within lazy-learn.
+
+## Installation
+
+### Dependencies
+
+lazy-learn requires:
+
+- pandas
+- scikit-learn
+
+### User Installation
+```
+pip install lazy-learn
+```
+
+## Help and Support
+### Documentation
+
+### Citation

doc/logo/grayscale_transparent.png (58.1 KB)

doc/logo/original.png (61.2 KB)

doc/logo/transparent.png (65 KB)

doc/logo/transparent_small.png (70.2 KB)

pyproject.toml

+2-2
@@ -8,7 +8,7 @@ version = "0.0.1"
 authors = [
     { name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
 ]
-description = "A small example package"
+description = "lazy-learn is a high-level Python interface for automated machine learning (AutoML) for the lazy data scientist. While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system. lazy-learn aims at providing the most approachable and fastest access to building baseline models."
 readme = "README.md"
 requires-python = ">=3.7"
 classifiers = [
@@ -18,4 +18,4 @@ classifiers = [
 ]
 
 [project.urls]
-"Homepage" = "https://github.com/pypa/sampleproject"
+"Homepage" = "https://github.com/frederikhoengaard/lazy-learn"

python/src/lazylearn/errors/__init__.py

Whitespace-only changes.

python/src/lazylearn/errors/errors.py

+2
@@ -0,0 +1,2 @@
+class DataSourceError(Exception):
+    """Raised when an incompatible argument is passed as a data source"""
python/src/lazylearn/ingestion/ingestion_pipeline.py

+32

@@ -0,0 +1,32 @@
+from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser  # noqa
+from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
+    ColumnTypeInterpreter,
+)
+from ingestion.ingestion_pipeline_steps.summary_stats_step import (  # noqa
+    SummaryStatistics,
+)
+from pipeline.pipeline import IngestionPipeline
+
+
+class Ingestion:
+    def __init__(self):
+        pass
+
+    def run(self, data):
+        """
+        Run the ingestion steps on a raw data source.
+
+        :param data: raw data source (currently a pandas DataFrame)
+        :return: Dataset built from the ingested data
+        """
+        pipeline = IngestionPipeline()
+        pipeline.raw_data = data
+
+        pipeline.add(DataSourceParser())
+
+        pipeline.add(ColumnTypeInterpreter())
+
+        pipeline.add(SummaryStatistics())
+
+        pipeline.run()
+
+        return pipeline.response()
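A sketch of how this entry point is meant to be called, assuming the CI's PYTHONPATH; the DataFrame is illustrative:

```
import pandas as pd

from ingestion.ingestion_pipeline import Ingestion

# Illustrative input frame.
df = pd.DataFrame({"age": [23.0, 31.0, 47.0], "city": ["A", "B", "A"]})

dataset = Ingestion().run(df)
# The response is a models.models.Dataset carrying the frame and the
# inferred column types, e.g. {'age': 'numeric', 'city': 'object'}.
print(dataset.column_type_map)
```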

python/src/lazylearn/ingestion/ingestion_pipeline_steps/__init__.py

Whitespace-only changes.
python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py

+21

@@ -0,0 +1,21 @@
+from errors.errors import DataSourceError
+from pandas import DataFrame
+from pipeline.pipeline import IngestionPipeline, PipelineStep
+
+
+class DataSourceParser(PipelineStep):
+    def apply(self, pipeline: IngestionPipeline):
+        """
+        This method is responsible for parsing the raw data
+        source from its parent pipeline into a DataFrame
+        object.
+
+        :param pipeline: parent IngestionPipeline
+        :return:
+        """
+        assert pipeline.raw_data is not None
+
+        if isinstance(pipeline.raw_data, DataFrame):
+            pipeline.df = pipeline.raw_data
+        else:
+            raise DataSourceError
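For illustration, the rejection path as the step defines it (assumes the CI PYTHONPATH; the list input is arbitrary):

```
from errors.errors import DataSourceError
from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser
from pipeline.pipeline import IngestionPipeline

pipeline = IngestionPipeline()
pipeline.raw_data = [[1, 2], [3, 4]]  # anything that is not a DataFrame

try:
    DataSourceParser().apply(pipeline)
except DataSourceError:
    print("only pandas DataFrames are accepted for now")
```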
python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py

+86

@@ -0,0 +1,86 @@
+import pandas as pd
+from pandas import Series
+from pipeline.pipeline import IngestionPipeline
+from tqdm import tqdm
+
+
+class ColumnTypeInterpreter:
+    def apply(self, pipeline: IngestionPipeline):
+        """
+        This method is responsible for inferring the
+        types of the columns of the project dataset
+
+        :param pipeline: parent IngestionPipeline
+        :return:
+        """
+        self.df = pipeline.df
+        columns = pipeline.df.columns
+        column_types = {}
+
+        for column_name in tqdm(columns):
+            column_types[column_name] = self.analyze_column(
+                pipeline.df[column_name]
+            )  # noqa
+
+        pipeline.column_type_map = column_types
+
+    def analyze_column(self, column: Series):
+        """
+        Infer the type of a single column.
+
+        :param column: column to analyze
+        :return: one of "categorical", "numeric", "datetime" or "object"
+        """
+        values = column.tolist()
+        types = [type(value) for value in values]
+
+        if self.categorical_test(values):
+            return "categorical"
+
+        elif self.numeric_test(types):
+            return "numeric"
+
+        elif self.datetime_check(column):
+            return "datetime"
+        else:
+            return "object"
+
+    @staticmethod
+    def categorical_test(values: list):
+        """
+        Tests whether a column is of categorical type.
+        This is decided as the case if the number of unique values is
+        less than 5% of the total number of values in the column.
+
+        :param values: list of values of any type
+        :return: True if column is categorical, False otherwise
+        """
+        n_total = len(values)
+        n_unique = len(set(values))
+        percentage_unique = n_unique / n_total
+
+        if percentage_unique < 0.05:
+            return True
+        return False
+
+    @staticmethod
+    def numeric_test(types: list):
+        """
+        Tests whether a column is of numeric type.
+        This is decided as the case if all values
+        of a column are either float or int.
+
+        :param types: list of type objects
+        :return: True if column is numeric, False otherwise
+        """
+        return all([item == float or item == int for item in set(types)])
+
+    @staticmethod
+    def string_test(types: set):
+        raise NotImplementedError
+
+    def datetime_check(self, column: Series):
+        try:
+            self.df[column.name] = pd.to_datetime(column)
+            return True
+        except Exception as e:  # noqa
+            return False
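A quick sketch of these heuristics on a toy frame (illustrative data; assumes the CI PYTHONPATH):

```
import pandas as pd

from ingestion.ingestion_pipeline_steps.interpreter_step import ColumnTypeInterpreter
from pipeline.pipeline import IngestionPipeline

pipeline = IngestionPipeline()
# 100 rows: "flag" has 2 unique values (2% < 5% -> categorical),
# "value" is 100 distinct floats (-> numeric).
pipeline.df = pd.DataFrame(
    {"flag": ["a", "b"] * 50, "value": [float(i) for i in range(100)]}
)
ColumnTypeInterpreter().apply(pipeline)
print(pipeline.column_type_map)  # {'flag': 'categorical', 'value': 'numeric'}
```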
python/src/lazylearn/ingestion/ingestion_pipeline_steps/summary_stats_step.py

+22

@@ -0,0 +1,22 @@
+from pipeline.pipeline import IngestionPipeline, PipelineStep
+
+
+class SummaryStatistics(PipelineStep):
+    def apply(self, pipeline: IngestionPipeline):
+        """
+        This step computes summary statistics for
+        numeric attributes in the dataset.
+
+        :param pipeline: parent IngestionPipeline
+        :return:
+        """
+        numeric_attributes = [
+            column
+            for column in pipeline.column_type_map
+            if pipeline.column_type_map[column] == "numeric"
+        ]
+
+        for attr in numeric_attributes:
+            pipeline.summary_stats[attr] = (
+                pipeline.df[attr].describe().to_dict()
+            )  # noqa
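For reference, `Series.describe().to_dict()` yields a plain dict of count, mean, std, min, quartiles and max, so `pipeline.summary_stats` ends up holding one such dict per numeric column (toy data for illustration):

```
import pandas as pd

stats = pd.Series([1.0, 2.0, 3.0, 4.0]).describe().to_dict()
print(stats)
# {'count': 4.0, 'mean': 2.5, 'std': 1.29..., 'min': 1.0,
#  '25%': 1.75, '50%': 2.5, '75%': 3.25, 'max': 4.0}
```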

python/src/lazylearn/lazylearn.py

+18
@@ -0,0 +1,18 @@
+from ingestion.ingestion_pipeline import Ingestion
+
+
+class LazyLearner:
+    def __init__(self):
+        self.dataset = None
+
+    def create_project(self, data, target, task="infer"):
+        # ingest data
+        ingestion_response = Ingestion().run(data)  # noqa
+
+        # preprocess
+
+        # set modelling configurations
+
+        # train
+
+        # eval
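As the new README hints, `LazyLearner` is the intended user-facing wrapper. A sketch of a call under the assumptions of this commit (only the ingestion stage is wired up; the iris frame and target are illustrative):

```
from lazylearn import LazyLearner
from sklearn.datasets import load_iris

df = load_iris(return_X_y=True, as_frame=True)[0]

learner = LazyLearner()
# Runs ingestion today; preprocessing, configuration, training and
# evaluation are still placeholder comments.
learner.create_project(data=df, target="petal width (cm)", task="infer")
```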

python/src/lazylearn/models/models.py

+9-1
@@ -1,7 +1,15 @@
+from pandas import DataFrame
+
+
 class Dataset:
-    def __init__(self):
+    def __init__(self, df: DataFrame, column_type_map: dict):
         self.name = None
         self.description = None
+        self.df = df
+        self.column_type_map = column_type_map
+
+    def save(self):
+        raise NotImplementedError
 
 
 class Model:
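A sketch of constructing the ingestion response object directly (assumes the CI PYTHONPATH; values are illustrative):

```
import pandas as pd

from models.models import Dataset

ds = Dataset(
    df=pd.DataFrame({"x": [1.0, 2.0]}),
    column_type_map={"x": "numeric"},
)
ds.name = "demo"  # name and description still default to None
print(ds.column_type_map)
```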
python/src/lazylearn/pipeline/pipeline.py

+34

@@ -0,0 +1,34 @@
+from typing import List
+
+from models.models import Dataset
+from pandas import DataFrame
+
+
+class Pipeline:
+    def __init__(self):
+        self._has_run: bool = False
+        self._steps: List[PipelineStep] = []
+
+    def add(self, pipeline_step):
+        self._steps.append(pipeline_step)
+
+    def run(self):
+        [step.apply(self) for step in self._steps]
+        self._has_run = True
+
+
+class PipelineStep:
+    def apply(self, pipeline: Pipeline):
+        pass
+
+
+class IngestionPipeline(Pipeline):
+    def __init__(self):
+        super().__init__()
+        self.raw_data = None
+        self.df: DataFrame = None
+        self.column_type_map: dict = None
+        self.summary_stats: dict = {}
+
+    def response(self):
+        return Dataset(df=self.df, column_type_map=self.column_type_map)
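The `Pipeline`/`PipelineStep` pair makes custom stages cheap to add. A sketch with an invented step (`RowCounter` and its `n_rows` attribute are hypothetical, not part of this commit):

```
import pandas as pd

from pipeline.pipeline import IngestionPipeline, PipelineStep


class RowCounter(PipelineStep):
    """Hypothetical step that records the row count on the pipeline."""

    def apply(self, pipeline: IngestionPipeline):
        pipeline.n_rows = len(pipeline.df)  # made-up attribute


pipeline = IngestionPipeline()
pipeline.df = pd.DataFrame({"x": [1, 2, 3]})
pipeline.add(RowCounter())
pipeline.run()
print(pipeline.n_rows)  # 3
```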

python/src/lazylearn/preprocessing/__init__.py

Whitespace-only changes.

python/src/lazylearn/preprocessing/encoding/__init__.py

Whitespace-only changes.
@@ -0,0 +1,15 @@
+class OrdinalConverter:
+    def __init__(
+        self,
+        max_cardinality: int = None,
+        min_support: int = 5,
+        other_category: bool = True,
+        method: str = "freq",
+    ):
+        self.card_max = max_cardinality
+        self.min_support = min_support
+        self.other_category = other_category
+        self.method = method
+
+    def convert(self, df, col):
+        pass
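`convert` is still a stub. Given the `method="freq"` default, one plausible reading is frequency-based ordinal encoding with an "other" bucket below `min_support`; a sketch of that idea under those assumptions (not the author's implementation):

```
import pandas as pd


def freq_ordinal_encode(df: pd.DataFrame, col: str, min_support: int = 5) -> pd.Series:
    counts = df[col].value_counts()
    # Collapse categories seen fewer than min_support times into one bucket.
    keep = counts[counts >= min_support].index
    collapsed = df[col].where(df[col].isin(keep), other="__other__")
    # Rank by frequency: most frequent category -> 0, next -> 1, ...
    ranks = {cat: rank for rank, cat in enumerate(collapsed.value_counts().index)}
    return collapsed.map(ranks)
```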
@@ -0,0 +1,12 @@
+from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser  # noqa
+from pipeline.pipeline import IngestionPipeline
+from sklearn.datasets import load_iris
+
+
+def test_iris_okay():
+    pipeline = IngestionPipeline()
+    pipeline.raw_data = load_iris(return_X_y=True, as_frame=True)[0]
+    pipeline.add(DataSourceParser())
+    pipeline.run()
+
+    assert pipeline.raw_data.equals(pipeline.df)
@@ -0,0 +1,19 @@
+from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
+    ColumnTypeInterpreter,
+)
+from pipeline.pipeline import IngestionPipeline
+from sklearn.datasets import load_iris
+
+
+def test_iris_types_numeric():
+    pipeline = IngestionPipeline()
+    pipeline.df = load_iris(return_X_y=True, as_frame=True)[0]
+    pipeline.add(ColumnTypeInterpreter())
+    pipeline.run()
+
+    assert pipeline.column_type_map == {
+        "sepal length (cm)": "numeric",
+        "sepal width (cm)": "numeric",
+        "petal length (cm)": "numeric",
+        "petal width (cm)": "numeric",
+    }
