[MRG+1] Ngram Performance (#7567) · raghavrv/scikit-learn@d8e54d9 · GitHub
[go: up one dir, main page]

Skip to content

Commit d8e54d9

Browse files
jtdoepke authored and jnothman committed
[MRG+1] Ngram Performance (scikit-learn#7567)
1 parent 25917ba commit d8e54d9

File tree

1 file changed

+32
-6
lines changed

1 file changed

+32
-6
lines changed

sklearn/feature_extraction/text.py

Lines changed: 32 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -131,12 +131,24 @@ def _word_ngrams(self, tokens, stop_words=None):
131131
min_n, max_n = self.ngram_range
132132
if max_n != 1:
133133
original_tokens = tokens
134-
tokens = []
134+
if min_n == 1:
135+
# no need to do any slicing for unigrams
136+
# just iterate through the original tokens
137+
tokens = list(original_tokens)
138+
min_n += 1
139+
else:
140+
tokens = []
141+
135142
n_original_tokens = len(original_tokens)
143+
144+
# bind method outside of loop to reduce overhead
145+
tokens_append = tokens.append
146+
space_join = " ".join
147+
136148
for n in xrange(min_n,
137149
min(max_n + 1, n_original_tokens + 1)):
138150
for i in xrange(n_original_tokens - n + 1):
139-
tokens.append(" ".join(original_tokens[i: i + n]))
151+
tokens_append(space_join(original_tokens[i: i + n]))
140152

141153
return tokens
142154

@@ -146,11 +158,21 @@ def _char_ngrams(self, text_document):
146158
text_document = self._white_spaces.sub(" ", text_document)
147159

148160
text_len = len(text_document)
149-
ngrams = []
150161
min_n, max_n = self.ngram_range
162+
if min_n == 1:
163+
# no need to do any slicing for unigrams
164+
# iterate through the string
165+
ngrams = list(text_document)
166+
min_n += 1
167+
else:
168+
ngrams = []
169+
170+
# bind method outside of loop to reduce overhead
171+
ngrams_append = ngrams.append
172+
151173
for n in xrange(min_n, min(max_n + 1, text_len + 1)):
152174
for i in xrange(text_len - n + 1):
153-
ngrams.append(text_document[i: i + n])
175+
ngrams_append(text_document[i: i + n])
154176
return ngrams
155177

156178
def _char_wb_ngrams(self, text_document):
@@ -164,15 +186,19 @@ def _char_wb_ngrams(self, text_document):
164186

165187
min_n, max_n = self.ngram_range
166188
ngrams = []
189+
190+
# bind method outside of loop to reduce overhead
191+
ngrams_append = ngrams.append
192+
167193
for w in text_document.split():
168194
w = ' ' + w + ' '
169195
w_len = len(w)
170196
for n in xrange(min_n, max_n + 1):
171197
offset = 0
172-
ngrams.append(w[offset:offset + n])
198+
ngrams_append(w[offset:offset + n])
173199
while offset + n < w_len:
174200
offset += 1
175-
ngrams.append(w[offset:offset + n])
201+
ngrams_append(w[offset:offset + n])
176202
if offset == 0: # count a short word (w_len < n) only once
177203
break
178204
return ngrams

0 commit comments

Comments
 (0)
0