FeatureManager.py
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from Cleanser import Cleanser
from tokenizers.LemmaTokenizer import LemmaTokenizer


class FeatureManager:
    """Builds a stacked feature matrix from the text columns in `combination`."""

    def __init__(self, data, combination, logger, cleanse, embeddings):
        self.logger = logger
        self.logger.info("PREPARING FEATURES {}".format(combination))
        self.logger.info("***********************")
        self.clf = None
        self.data = data
        if cleanse:
            self.data = Cleanser(self.data).data
        self.features = self.create_features(self.data, combination, embeddings)

    def stack_features(self, features):
        """Horizontally stack a list of sparse matrices into one feature matrix."""
        if len(features) < 1:
            raise ValueError("No features found")
        X = features[0]
        # Stack the remaining matrices onto the first; iterating over
        # features[1:] avoids stacking the first matrix twice.
        for feature in features[1:]:
            X = sparse.hstack((X, feature))
        return X

    def create_features(self, data, combination, embeddings):
        features_tfidf = []
        for feature in combination:
            self.logger.info("Processing feature {}".format(feature))
            if feature == 'd2v':
                # Precomputed document embeddings are appended as-is, if provided.
                if embeddings is not None:
                    features_tfidf.append(embeddings)
                else:
                    continue
            else:
                # Bag-of-words counts over lemmatized tokens, reweighted by TF-IDF.
                count_vect = CountVectorizer(tokenizer=LemmaTokenizer())
                tfidf_transformer = TfidfTransformer()
                X_train_counts = count_vect.fit_transform(data[feature])
                X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
                features_tfidf.append(X_train_tfidf)
        if len(features_tfidf) == 0:
            return None
        else:
            return self.stack_features(features_tfidf)
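

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original file. It assumes `data`
    # only needs to support `data[column]` returning an iterable of strings,
    # so a plain dict of lists is used here; a pandas DataFrame with matching
    # columns would work the same way. The column names and texts below are
    # hypothetical.
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("FeatureManager")

    data = {
        "title": ["Fix crash on startup", "Add dark mode toggle"],
        "description": ["The app crashes when opening.", "Users asked for a dark theme."],
    }

    # Build one TF-IDF matrix per column and stack them horizontally.
    fm = FeatureManager(data, combination=["title", "description"],
                        logger=logger, cleanse=False, embeddings=None)
    print(fm.features.shape)  # sparse matrix: 2 documents x combined vocabulary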