1
- from models . models import Dataset
1
+ from pandas import DataFrame
2
2
from pipeline .pipeline import ModelPipeline
3
3
4
4
@@ -21,15 +21,25 @@ def __init__(
21
21
22
22
def fit(self, pipeline: ModelPipeline):
    """
    Encodes every categorical column of the pipeline's training data.

    For each name in ``self.cat_vars`` the training frame is passed
    through ``self.convert`` and the result is stored back on
    ``pipeline.train_features_df``. The column name is then appended
    to ``pipeline.feature_list``.

    :param pipeline: pipeline object holding ``train_features_df``
        and ``feature_list``
    """
    for column in self.cat_vars:
        encoded = self.convert(pipeline.train_features_df, column)
        pipeline.train_features_df = encoded
        pipeline.feature_list.append(column)
26
28
27
- def convert (self , df , col_name ) :
29
+ def convert (self , df : DataFrame , col_name : str ) -> DataFrame :
28
30
"""
31
+ Encodes a categorical column ordinally.
32
+ Currently only the "freq" method is supported,
33
+ and it encodes a value with an integer id by
34
+ increasing frequency i.e. more frequent values
35
+ receive a higher encoding
29
36
30
- :param df:
31
- :param col_name:
32
- :return:
37
+ Note that this should only be done on the training
38
+ data!
39
+
40
+ :param df: pandas DataFrame of features
41
+ :param col_name: column to consider
42
+ :return: transformed DataFrame
33
43
"""
34
44
if self .method == "freq" :
35
45
self .cat_freqs [col_name ] = {}
@@ -43,7 +53,7 @@ def convert(self, df, col_name):
43
53
[(key , val ) for key , val in self .cat_freqs [col_name ].items ()],
44
54
key = lambda x : x [1 ],
45
55
)
46
- print ( freq_pairs )
56
+
47
57
self .cat_maps [col_name ] = {key : val for key , val in freq_pairs }
48
58
49
59
df [col_name ] = df [col_name ].apply (
@@ -54,3 +64,15 @@ def convert(self, df, col_name):
54
64
return df
55
65
else :
56
66
raise ValueError ("Unsupported encoding method, try [freq]" )
67
+
68
def predict(self, pipeline: ModelPipeline):
    """
    Applies the stored category encodings to the test data.

    Each column named in ``self.cat_vars`` of ``pipeline.tmp_test``
    is replaced by its integer code from ``self.cat_maps``. Values
    that have no entry in the column's map are encoded as -2.

    The frame is modified in place and then assigned back to
    ``pipeline.tmp_test``; nothing is returned.

    :param pipeline: pipeline object holding the ``tmp_test`` DataFrame
    :raises KeyError: if a column in ``self.cat_vars`` has no entry
        in ``self.cat_maps``
    """
    unseen_code = -2  # code given to values missing from the map
    df = pipeline.tmp_test

    for var in self.cat_vars:
        # Fetch the column's map once instead of twice per row.
        mapping = self.cat_maps[var]
        df[var] = df[var].apply(lambda x: mapping.get(x, unseen_code))

    pipeline.tmp_test = df
0 commit comments