@@ -131,12 +131,24 @@ def _word_ngrams(self, tokens, stop_words=None):
131
131
min_n , max_n = self .ngram_range
132
132
if max_n != 1 :
133
133
original_tokens = tokens
134
- tokens = []
134
+ if min_n == 1 :
135
+ # no need to do any slicing for unigrams
136
+ # just iterate through the original tokens
137
+ tokens = list (original_tokens )
138
+ min_n += 1
139
+ else :
140
+ tokens = []
141
+
135
142
n_original_tokens = len (original_tokens )
143
+
144
+ # bind method outside of loop to reduce overhead
145
+ tokens_append = tokens .append
146
+ space_join = " " .join
147
+
136
148
for n in xrange (min_n ,
137
149
min (max_n + 1 , n_original_tokens + 1 )):
138
150
for i in xrange (n_original_tokens - n + 1 ):
139
- tokens . append ( " " . join (original_tokens [i : i + n ]))
151
+ tokens_append ( space_join (original_tokens [i : i + n ]))
140
152
141
153
return tokens
142
154
@@ -146,11 +158,21 @@ def _char_ngrams(self, text_document):
146
158
text_document = self ._white_spaces .sub (" " , text_document )
147
159
148
160
text_len = len (text_document )
149
- ngrams = []
150
161
min_n , max_n = self .ngram_range
162
+ if min_n == 1 :
163
+ # no need to do any slicing for unigrams
164
+ # iterate through the string
165
+ ngrams = list (text_document )
166
+ min_n += 1
167
+ else :
168
+ ngrams = []
169
+
170
+ # bind method outside of loop to reduce overhead
171
+ ngrams_append = ngrams .append
172
+
151
173
for n in xrange (min_n , min (max_n + 1 , text_len + 1 )):
152
174
for i in xrange (text_len - n + 1 ):
153
- ngrams . append (text_document [i : i + n ])
175
+ ngrams_append (text_document [i : i + n ])
154
176
return ngrams
155
177
156
178
def _char_wb_ngrams (self , text_document ):
@@ -164,15 +186,19 @@ def _char_wb_ngrams(self, text_document):
164
186
165
187
min_n , max_n = self .ngram_range
166
188
ngrams = []
189
+
190
+ # bind method outside of loop to reduce overhead
191
+ ngrams_append = ngrams .append
192
+
167
193
for w in text_document .split ():
168
194
w = ' ' + w + ' '
169
195
w_len = len (w )
170
196
for n in xrange (min_n , max_n + 1 ):
171
197
offset = 0
172
- ngrams . append (w [offset :offset + n ])
198
+ ngrams_append (w [offset :offset + n ])
173
199
while offset + n < w_len :
174
200
offset += 1
175
- ngrams . append (w [offset :offset + n ])
201
+ ngrams_append (w [offset :offset + n ])
176
202
if offset == 0 : # count a short word (w_len < n) only once
177
203
break
178
204
return ngrams
0 commit comments