Merge pull request #6 from frederikhoengaard/ingestion · frederikhoengaard/lazy-learn@5d80083

Commit 5d80083

Merge pull request #6 from frederikhoengaard/ingestion
Initial commit Ingestion - non-functional
2 parents 6024221 + 9feb2e6 commit 5d80083

27 files changed: +1365 -17 lines

.github/workflows/verify.yaml

+4-4
@@ -4,7 +4,7 @@ on:
   pull_request:
 
 env:
-  PYTHONPATH: ./python/src/main/
+  PYTHONPATH: ./python/src/lazylearn/
 
 jobs:
   testing:
@@ -54,12 +54,12 @@ jobs:
 
       - name: black
         run: |
-          python -m black --check python/src/main/
+          python -m black --check python/src/lazylearn/
 
       - name: isort
         run: |
-          python -m isort python/src/main/ --multi-line 3 --profile black --check
+          python -m isort python/src/lazylearn/ --multi-line 3 --profile black --check
 
       - name: flake8
         run: |
-          python -m flake8 python/src/main/
+          python -m flake8 python/src/lazylearn/
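The PYTHONPATH change above is what lets the new modules use bare, source-root-relative imports. A minimal sketch of the same effect from the repo root, assuming the layout introduced in this commit:

```
import sys

# Same effect as the workflow's PYTHONPATH=./python/src/lazylearn/
sys.path.insert(0, "./python/src/lazylearn/")

# With the source root on sys.path, the top-level packages added in this
# commit (pipeline, ingestion, errors, models) import without a prefix:
from pipeline.pipeline import IngestionPipeline  # noqa: E402

print(IngestionPipeline)
```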

.gitignore

+3
@@ -13,6 +13,9 @@ __pycache__/
 # JetBrains
 .idea
 
+# local
+notebooks/
+
 # Distribution / packaging
 .Python
 build/

Pipfile

+3
@@ -6,6 +6,9 @@ verify_ssl = true
 [packages]
 loguru = "==0.6.*"
 pandas = "==1.5.*"
+scikit-learn = "*"
+tqdm = "*"
+jupyter = "*"
 
 [dev-packages]
 black = "==23.*"

Pipfile.lock

+1,024-3
Some generated files are not rendered by default.

README.md

+22-5
@@ -1,12 +1,29 @@
-# lazy-learn
 
----
+<img width="500" src="doc/logo/transparent_small.png">
 
-## About
-
-lazy-learn is a high-level Python interface for automated machine learning (AutoML). While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system.
+**lazy-learn** is a high-level Python interface for automated machine learning (AutoML). While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system.
 
 The aim of lazy-learn is exactly that. Given a dataset, lazy-learn will analyse types and distributions of attributes, preprocess, feature-engineer and ultimately train models to be used for further evaluation or inference.
 
 ## Usage
 
+Using lazy-learn revolves around the `LazyLearner` class. You can think of it as a kind of project, and it is the wrapper for any experiment within lazy-learn.
+
+## Installation
+
+### Dependencies
+
+lazy-learn requires:
+
+- pandas
+- scikit-learn
+
+### User Installation
+```
+pip install lazy-learn
+```
+
+## Help and Support
+### Documentation
+
+### Citation

doc/logo/grayscale_transparent.png (58.1 KB)

doc/logo/original.png (61.2 KB)

doc/logo/transparent.png (65 KB)

doc/logo/transparent_small.png (70.2 KB)

pyproject.toml

+2-2
@@ -8,7 +8,7 @@ version = "0.0.1"
 authors = [
     { name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
 ]
-description = "A small example package"
+description = "lazy-learn is a high-level Python interface for automated machine learning (AutoML) for the lazy data scientist. While there are many AutoML libraries available, each typically solves a niche area of the overall ML pipeline without providing a comprehensive and approachable end-to-end system. lazy-learn aims at providing the most approachable and fastest access to building baseline models."
 readme = "README.md"
 requires-python = ">=3.7"
 classifiers = [
@@ -18,4 +18,4 @@ classifiers = [
 ]
 
 [project.urls]
-"Homepage" = "https://github.com/pypa/sampleproject"
+"Homepage" = "https://github.com/frederikhoengaard/lazy-learn"

python/src/lazylearn/errors/__init__.py

Whitespace-only changes.

python/src/lazylearn/errors/errors.py

+2
@@ -0,0 +1,2 @@
+class DataSourceError(Exception):
+    """Raised when an incompatible argument is passed as a data source"""
python/src/lazylearn/ingestion/ingestion_pipeline.py

+32

@@ -0,0 +1,32 @@
+from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser  # noqa
+from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
+    ColumnTypeInterpreter,
+)
+from ingestion.ingestion_pipeline_steps.summary_stats_step import (  # noqa
+    SummaryStatistics,
+)
+from pipeline.pipeline import IngestionPipeline
+
+
+class Ingestion:
+    def __init__(self):
+        pass
+
+    def run(self, data):
+        """
+        Run the ingestion steps on a raw data source.
+
+        :param data: raw data source (currently a pandas DataFrame)
+        :return: Dataset built from the ingested data
+        """
+        pipeline = IngestionPipeline()
+        pipeline.raw_data = data
+
+        pipeline.add(DataSourceParser())
+
+        pipeline.add(ColumnTypeInterpreter())
+
+        pipeline.add(SummaryStatistics())
+
+        pipeline.run()
+
+        return pipeline.response()
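A sketch of how this entry point is meant to be called, assuming the CI's PYTHONPATH; the DataFrame is illustrative:

```
import pandas as pd

from ingestion.ingestion_pipeline import Ingestion

# Illustrative input frame.
df = pd.DataFrame({"age": [23.0, 31.0, 47.0], "city": ["A", "B", "A"]})

dataset = Ingestion().run(df)
# The response is a models.models.Dataset carrying the frame and the
# inferred column types, e.g. {'age': 'numeric', 'city': 'object'}.
print(dataset.column_type_map)
```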

python/src/lazylearn/ingestion/ingestion_pipeline_steps/__init__.py

Whitespace-only changes.
python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py

+21

@@ -0,0 +1,21 @@
+from errors.errors import DataSourceError
+from pandas import DataFrame
+from pipeline.pipeline import IngestionPipeline, PipelineStep
+
+
+class DataSourceParser(PipelineStep):
+    def apply(self, pipeline: IngestionPipeline):
+        """
+        This method is responsible for parsing the raw data
+        source from its parent pipeline into a DataFrame
+        object.
+
+        :param pipeline: parent IngestionPipeline
+        :return:
+        """
+        assert pipeline.raw_data is not None
+
+        if isinstance(pipeline.raw_data, DataFrame):
+            pipeline.df = pipeline.raw_data
+        else:
+            raise DataSourceError
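For illustration, the rejection path as the step defines it (assumes the CI PYTHONPATH; the list input is arbitrary):

```
from errors.errors import DataSourceError
from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser
from pipeline.pipeline import IngestionPipeline

pipeline = IngestionPipeline()
pipeline.raw_data = [[1, 2], [3, 4]]  # anything that is not a DataFrame

try:
    DataSourceParser().apply(pipeline)
except DataSourceError:
    print("only pandas DataFrames are accepted for now")
```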
python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py

+86

@@ -0,0 +1,86 @@
+import pandas as pd
+from pandas import Series
+from pipeline.pipeline import IngestionPipeline
+from tqdm import tqdm
+
+
+class ColumnTypeInterpreter:
+    def apply(self, pipeline: IngestionPipeline):
+        """
+        This method is responsible for inferring the
+        types of the columns of the project dataset
+
+        :param pipeline: parent IngestionPipeline
+        :return:
+        """
+        self.df = pipeline.df
+        columns = pipeline.df.columns
+        column_types = {}
+
+        for column_name in tqdm(columns):
+            column_types[column_name] = self.analyze_column(
+                pipeline.df[column_name]
+            )  # noqa
+
+        pipeline.column_type_map = column_types
+
+    def analyze_column(self, column: Series):
+        """
+        Infer the type of a single column.
+
+        :param column: column to analyze
+        :return: one of "categorical", "numeric", "datetime" or "object"
+        """
+        values = column.tolist()
+        types = [type(value) for value in values]
+
+        if self.categorical_test(values):
+            return "categorical"
+
+        elif self.numeric_test(types):
+            return "numeric"
+
+        elif self.datetime_check(column):
+            return "datetime"
+        else:
+            return "object"
+
+    @staticmethod
+    def categorical_test(values: list):
+        """
+        Tests whether a column is of categorical type.
+        This is decided as the case if the number of unique values is
+        less than 5% of the total number of values in the column.
+
+        :param values: list of values of any type
+        :return: True if column is categorical, False otherwise
+        """
+        n_total = len(values)
+        n_unique = len(set(values))
+        percentage_unique = n_unique / n_total
+
+        if percentage_unique < 0.05:
+            return True
+        return False
+
+    @staticmethod
+    def numeric_test(types: list):
+        """
+        Tests whether a column is of numeric type.
+        This is decided as the case if all values
+        of a column are either float or int.
+
+        :param types: list of type objects
+        :return: True if column is numeric, False otherwise
+        """
+        return all([item == float or item == int for item in set(types)])
+
+    @staticmethod
+    def string_test(types: set):
+        raise NotImplementedError
+
+    def datetime_check(self, column: Series):
+        try:
+            self.df[column.name] = pd.to_datetime(column)
+            return True
+        except Exception as e:  # noqa
+            return False
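A quick sketch of these heuristics on a toy frame (illustrative data; assumes the CI PYTHONPATH):

```
import pandas as pd

from ingestion.ingestion_pipeline_steps.interpreter_step import ColumnTypeInterpreter
from pipeline.pipeline import IngestionPipeline

pipeline = IngestionPipeline()
# 100 rows: "flag" has 2 unique values (2% < 5% -> categorical),
# "value" is 100 distinct floats (-> numeric).
pipeline.df = pd.DataFrame(
    {"flag": ["a", "b"] * 50, "value": [float(i) for i in range(100)]}
)
ColumnTypeInterpreter().apply(pipeline)
print(pipeline.column_type_map)  # {'flag': 'categorical', 'value': 'numeric'}
```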
python/src/lazylearn/ingestion/ingestion_pipeline_steps/summary_stats_step.py

+22

@@ -0,0 +1,22 @@
+from pipeline.pipeline import IngestionPipeline, PipelineStep
+
+
+class SummaryStatistics(PipelineStep):
+    def apply(self, pipeline: IngestionPipeline):
+        """
+        This step computes summary statistics for
+        numeric attributes in the dataset.
+
+        :param pipeline: parent IngestionPipeline
+        :return:
+        """
+        numeric_attributes = [
+            column
+            for column in pipeline.column_type_map
+            if pipeline.column_type_map[column] == "numeric"
+        ]
+
+        for attr in numeric_attributes:
+            pipeline.summary_stats[attr] = (
+                pipeline.df[attr].describe().to_dict()
+            )  # noqa
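For reference, `Series.describe().to_dict()` yields a plain dict of count, mean, std, min, quartiles and max, so `pipeline.summary_stats` ends up holding one such dict per numeric column (toy data for illustration):

```
import pandas as pd

stats = pd.Series([1.0, 2.0, 3.0, 4.0]).describe().to_dict()
print(stats)
# {'count': 4.0, 'mean': 2.5, 'std': 1.29..., 'min': 1.0,
#  '25%': 1.75, '50%': 2.5, '75%': 3.25, 'max': 4.0}
```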

python/src/lazylearn/lazylearn.py

+18
@@ -0,0 +1,18 @@
+from ingestion.ingestion_pipeline import Ingestion
+
+
+class LazyLearner:
+    def __init__(self):
+        self.dataset = None
+
+    def create_project(self, data, target, task="infer"):
+        # ingest data
+        ingestion_response = Ingestion().run(data)  # noqa
+
+        # preprocess
+
+        # set modelling configurations
+
+        # train
+
+        # eval
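As the new README hints, `LazyLearner` is the intended user-facing wrapper. A sketch of a call under the assumptions of this commit (only the ingestion stage is wired up; the iris frame and target are illustrative):

```
from lazylearn import LazyLearner
from sklearn.datasets import load_iris

df = load_iris(return_X_y=True, as_frame=True)[0]

learner = LazyLearner()
# Runs ingestion today; preprocessing, configuration, training and
# evaluation are still placeholder comments.
learner.create_project(data=df, target="petal width (cm)", task="infer")
```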

python/src/lazylearn/models/models.py

+9-1
@@ -1,7 +1,15 @@
+from pandas import DataFrame
+
+
 class Dataset:
-    def __init__(self):
+    def __init__(self, df: DataFrame, column_type_map: dict):
         self.name = None
         self.description = None
+        self.df = df
+        self.column_type_map = column_type_map
+
+    def save(self):
+        raise NotImplementedError
 
 
 class Model:
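A sketch of constructing the ingestion response object directly (assumes the CI PYTHONPATH; values are illustrative):

```
import pandas as pd

from models.models import Dataset

ds = Dataset(
    df=pd.DataFrame({"x": [1.0, 2.0]}),
    column_type_map={"x": "numeric"},
)
ds.name = "demo"  # name and description still default to None
print(ds.column_type_map)
```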
python/src/lazylearn/pipeline/pipeline.py

+34

@@ -0,0 +1,34 @@
+from typing import List
+
+from models.models import Dataset
+from pandas import DataFrame
+
+
+class Pipeline:
+    def __init__(self):
+        self._has_run: bool = False
+        self._steps: List[PipelineStep] = []
+
+    def add(self, pipeline_step):
+        self._steps.append(pipeline_step)
+
+    def run(self):
+        [step.apply(self) for step in self._steps]
+        self._has_run = True
+
+
+class PipelineStep:
+    def apply(self, pipeline: Pipeline):
+        pass
+
+
+class IngestionPipeline(Pipeline):
+    def __init__(self):
+        super().__init__()
+        self.raw_data = None
+        self.df: DataFrame = None
+        self.column_type_map: dict = None
+        self.summary_stats: dict = {}
+
+    def response(self):
+        return Dataset(df=self.df, column_type_map=self.column_type_map)
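The `Pipeline`/`PipelineStep` pair makes custom stages cheap to add. A sketch with an invented step (`RowCounter` and its `n_rows` attribute are hypothetical, not part of this commit):

```
import pandas as pd

from pipeline.pipeline import IngestionPipeline, PipelineStep


class RowCounter(PipelineStep):
    """Hypothetical step that records the row count on the pipeline."""

    def apply(self, pipeline: IngestionPipeline):
        pipeline.n_rows = len(pipeline.df)  # made-up attribute


pipeline = IngestionPipeline()
pipeline.df = pd.DataFrame({"x": [1, 2, 3]})
pipeline.add(RowCounter())
pipeline.run()
print(pipeline.n_rows)  # 3
```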

python/src/lazylearn/preprocessing/__init__.py

Whitespace-only changes.

python/src/lazylearn/preprocessing/encoding/__init__.py

Whitespace-only changes.
@@ -0,0 +1,15 @@
+class OrdinalConverter:
+    def __init__(
+        self,
+        max_cardinality: int = None,
+        min_support: int = 5,
+        other_category: bool = True,
+        method: str = "freq",
+    ):
+        self.card_max = max_cardinality
+        self.min_support = min_support
+        self.other_category = other_category
+        self.method = method
+
+    def convert(self, df, col):
+        pass
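`convert` is still a stub. Given the `method="freq"` default, one plausible reading is frequency-based ordinal encoding with an "other" bucket below `min_support`; a sketch of that idea under those assumptions (not the author's implementation):

```
import pandas as pd


def freq_ordinal_encode(df: pd.DataFrame, col: str, min_support: int = 5) -> pd.Series:
    counts = df[col].value_counts()
    # Collapse categories seen fewer than min_support times into one bucket.
    keep = counts[counts >= min_support].index
    collapsed = df[col].where(df[col].isin(keep), other="__other__")
    # Rank by frequency: most frequent category -> 0, next -> 1, ...
    ranks = {cat: rank for rank, cat in enumerate(collapsed.value_counts().index)}
    return collapsed.map(ranks)
```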
@@ -0,0 +1,12 @@
+from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser  # noqa
+from pipeline.pipeline import IngestionPipeline
+from sklearn.datasets import load_iris
+
+
+def test_iris_okay():
+    pipeline = IngestionPipeline()
+    pipeline.raw_data = load_iris(return_X_y=True, as_frame=True)[0]
+    pipeline.add(DataSourceParser())
+    pipeline.run()
+
+    assert pipeline.raw_data.equals(pipeline.df)
@@ -0,0 +1,19 @@
+from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
+    ColumnTypeInterpreter,
+)
+from pipeline.pipeline import IngestionPipeline
+from sklearn.datasets import load_iris
+
+
+def test_iris_types_numeric():
+    pipeline = IngestionPipeline()
+    pipeline.df = load_iris(return_X_y=True, as_frame=True)[0]
+    pipeline.add(ColumnTypeInterpreter())
+    pipeline.run()
+
+    assert pipeline.column_type_map == {
+        "sepal length (cm)": "numeric",
+        "sepal width (cm)": "numeric",
+        "petal length (cm)": "numeric",
+        "petal width (cm)": "numeric",
+    }
