27
27
from nltk .stem .api import StemmerI
28
28
from nltk .compat import python_2_unicode_compatible
29
29
30
- class _CannotReplaceSuffix (Exception ):
31
- pass
32
-
33
30
@python_2_unicode_compatible
34
31
class PorterStemmer (StemmerI ):
35
32
"""
@@ -185,6 +182,16 @@ def _contains_vowel(self, stem):
185
182
if not self ._cons (stem , i ):
186
183
return True
187
184
return False
185
+
186
+ def _ends_double_consonant (self , word ):
187
+ """Implements condition *d from the paper
188
+
189
+ Returns True if word ends with a double consonant
190
+ """
191
+ return (
192
+ word [- 1 ] == word [- 2 ] and
193
+ self ._cons (word , len (word )- 1 )
194
+ )
188
195
189
196
def _ends_cvc (self , word ):
190
197
"""Implements condition *o from the paper
@@ -210,24 +217,12 @@ def _ends_cvc(self, word):
210
217
def _replace_suffix (self , word , suffix , replacement ):
211
218
"""Replaces `suffix` of `word` with `replacement"""
212
219
assert word .endswith (suffix ), "Given word doesn't end with given suffix"
213
- return word [:- len (suffix )] + replacement
214
-
215
- def _replace_suffix_if (self , word , suffix , replacement , condition ):
216
- """If `condition`, replace suffix with replacement, else raise
217
-
218
- `condition` should be a lambda that takes the word and stem as
219
- arguments and returns True or False.
220
- """
221
- if not word .endswith (suffix ):
222
- raise _CannotReplaceSuffix ("word does not end with suffix" )
220
+ if suffix == '' :
221
+ return word + replacement
223
222
else :
224
- stem = self ._replace_suffix (word , suffix , replacement )
225
- if condition is None or condition (stem ):
226
- return stem
227
- else :
228
- raise _CannotReplaceSuffix ("condition not met" )
223
+ return word [:- len (suffix )] + replacement
229
224
230
- def _apply_first_possible_rule (self , word , rules ):
225
+ def _apply_rule_list (self , word , rules ):
231
226
"""Applies the first applicable suffix-removal rule to the word
232
227
233
228
Takes a word and a list of suffix-removal rules represented as
@@ -237,10 +232,21 @@ def _apply_first_possible_rule(self, word, rules):
237
232
or None if the rule is unconditional.
238
233
"""
239
234
for rule in rules :
240
- try :
241
- return self ._replace_suffix_if (word , * rule )
242
- except _CannotReplaceSuffix :
243
- pass
235
+ suffix , replacement , condition = rule
236
+ if suffix == '*d' and self ._ends_double_consonant (word ):
237
+ stem = word [:- 2 ]
238
+ if condition is None or condition (stem ):
239
+ return stem + replacement
240
+ else :
241
+ # Don't try any further rules
242
+ return word
243
+ if word .endswith (suffix ):
244
+ stem = self ._replace_suffix (word , suffix , '' )
245
+ if condition is None or condition (stem ):
246
+ return stem + replacement
247
+ else :
248
+ # Don't try any further rules
249
+ return word
244
250
245
251
return word
246
252
@@ -255,18 +261,14 @@ def _step1a(self, word):
255
261
SS -> SS caress -> caress
256
262
S -> cats -> cat
257
263
"""
258
- return self ._apply_first_possible_rule (word , [
259
- ('sses' , 'ss' , None ), # SSES -> SS
260
-
261
- # this NLTK-only rule extends the original algorithm, so
262
- # that 'flies'->'fli' but 'dies'->'die' etc
263
- (
264
- 'ies' ,
265
- 'ie' ,
266
- lambda stem : (self .mode == self .NLTK_EXTENSIONS and
267
- len (word ) == 4 )
268
- ),
264
+ # this NLTK-only rule extends the original algorithm, so
265
+ # that 'flies'->'fli' but 'dies'->'die' etc
266
+ if self .mode == self .NLTK_EXTENSIONS :
267
+ if word .endswith ('ies' ) and len (word ) == 4 :
268
+ return self ._replace_suffix (word , 'ies' , 'ie' )
269
269
270
+ return self ._apply_rule_list (word , [
271
+ ('sses' , 'ss' , None ), # SSES -> SS
270
272
('ies' , 'i' , None ), # IES -> I
271
273
('ss' , 'ss' , None ), # SS -> SS
272
274
('s' , '' , None ), # S ->
@@ -308,47 +310,42 @@ def _step1b(self, word):
308
310
# this NLTK-only block extends the original algorithm, so that
309
311
# 'spied'->'spi' but 'died'->'die' etc
310
312
if self .mode == self .NLTK_EXTENSIONS :
311
- try :
312
- return self ._replace_suffix_if (
313
- word , 'ied' , 'ie' , lambda stem : len (word ) == 4
314
- )
315
- except _CannotReplaceSuffix :
316
- pass
317
-
318
- try :
319
- # (m>0) EED -> EE
320
- return self ._replace_suffix_if (
321
- word , 'eed' , 'ee' , lambda stem : self ._measure (stem ) > 0
322
- )
323
- except _CannotReplaceSuffix :
324
- pass
313
+ if word .endswith ('ied' ):
314
+ if len (word ) == 4 :
315
+ return self ._replace_suffix (word , 'ied' , 'ie' )
316
+ else :
317
+ return self ._replace_suffix (word , 'ied' , 'i' )
318
+
319
+ # (m>0) EED -> EE
320
+ if word .endswith ('eed' ):
321
+ stem = self ._replace_suffix (word , 'eed' , '' )
322
+ if self ._measure (stem ) > 0 :
323
+ return stem + 'ee'
324
+ else :
325
+ return word
325
326
326
327
rule_2_or_3_succeeded = False
327
- for rule in [
328
- ('ed' , '' , self ._contains_vowel ), # (*v*) ED ->
329
- ('ing' , '' , self ._contains_vowel ), # (*v*) ING ->
330
- ]:
331
- try :
332
- intermediate_stem = self ._replace_suffix_if (word , * rule )
333
- rule_2_or_3_succeeded = True
334
- break
335
- except _CannotReplaceSuffix :
336
- pass
328
+
329
+ for suffix in ['ed' , 'ing' ]:
330
+ if word .endswith (suffix ):
331
+ intermediate_stem = self ._replace_suffix (word , suffix , '' )
332
+ if self ._contains_vowel (intermediate_stem ):
333
+ rule_2_or_3_succeeded = True
334
+ break
337
335
338
336
if not rule_2_or_3_succeeded :
339
337
return word
340
-
341
- final_letter = intermediate_stem [- 1 ]
342
- return self ._apply_first_possible_rule (intermediate_stem , [
338
+
339
+ return self ._apply_rule_list (intermediate_stem , [
343
340
('at' , 'ate' , None ), # AT -> ATE
344
341
('bl' , 'ble' , None ), # BL -> BLE
345
342
('iz' , 'ize' , None ), # IZ -> IZE
346
343
# (*d and not (*L or *S or *Z))
347
344
# -> single letter
348
345
(
349
- final_letter * 2 ,
350
- final_letter ,
351
- lambda stem : final_letter not in ('l' , 's' , 'z' )
346
+ '*d' ,
347
+ intermediate_stem [ - 1 ] ,
348
+ lambda stem : intermediate_stem [ - 1 ] not in ('l' , 's' , 'z' )
352
349
),
353
350
# (m=1 and *o) -> E
354
351
(
@@ -393,16 +390,14 @@ def nltk_condition(stem):
393
390
def original_condition (stem ):
394
391
return self ._contains_vowel (stem )
395
392
396
- try :
397
- return self ._replace_suffix_if (
398
- word ,
393
+ return self ._apply_rule_list (word , [
394
+ (
399
395
'y' ,
400
396
'i' ,
401
397
nltk_condition if self .mode == self .NLTK_EXTENSIONS
402
398
else original_condition
403
399
)
404
- except _CannotReplaceSuffix :
405
- return word
400
+ ])
406
401
407
402
def _step2 (self , word ):
408
403
"""Implements Step 2 from "An algorithm for suffix stripping"
@@ -438,16 +433,15 @@ def _step2(self, word):
438
433
# Instead of applying the ALLI -> AL rule after '(a)bli' per
439
434
# the published algorithm, instead we apply it first, and,
440
435
# if it succeeds, run the result through step2 again.
441
- try :
442
- stem = self ._replace_suffix_if (
443
- word ,
444
- 'alli' ,
445
- 'al' ,
446
- self ._has_positive_measure
436
+ if (
437
+ word .endswith ('alli' ) and
438
+ self ._has_positive_measure (
439
+ self ._replace_suffix (word , 'alli' , '' )
440
+ )
441
+ ):
442
+ return self ._step2 (
443
+ self ._replace_suffix (word , 'alli' , 'al' )
447
444
)
448
- return self ._step2 (stem )
449
- except _CannotReplaceSuffix :
450
- pass
451
445
452
446
bli_rule = ('bli' , 'ble' , self ._has_positive_measure )
453
447
abli_rule = ('abli' , 'able' , self ._has_positive_measure )
@@ -496,7 +490,7 @@ def _step2(self, word):
496
490
("logi" , "log" , self ._has_positive_measure )
497
491
)
498
492
499
- return self ._apply_first_possible_rule (word , rules )
493
+ return self ._apply_rule_list (word , rules )
500
494
501
495
def _step3 (self , word ):
502
496
"""Implements Step 3 from "An algorithm for suffix stripping"
@@ -513,7 +507,7 @@ def _step3(self, word):
513
507
(m>0) FUL -> hopeful -> hope
514
508
(m>0) NESS -> goodness -> good
515
509
"""
516
- return self ._apply_first_possible_rule (word , [
510
+ return self ._apply_rule_list (word , [
517
511
('icate' , 'ic' , self ._has_positive_measure ),
518
512
('ative' , '' , self ._has_positive_measure ),
519
513
('alize' , 'al' , self ._has_positive_measure ),
@@ -553,7 +547,7 @@ def _step4(self, word):
553
547
"""
554
548
measure_gt_1 = lambda stem : self ._measure (stem ) > 1
555
549
556
- return self ._apply_first_possible_rule (word , [
550
+ return self ._apply_rule_list (word , [
557
551
('al' , '' , measure_gt_1 ),
558
552
('ance' , '' , measure_gt_1 ),
559
553
('ence' , '' , measure_gt_1 ),
@@ -593,17 +587,30 @@ def _step5a(self, word):
593
587
rate -> rate
594
588
(m=1 and not *o) E -> cease -> ceas
595
589
"""
596
- return self ._apply_first_possible_rule (word , [
597
- ('e' , '' , lambda stem : self ._measure (stem ) > 1 ),
598
- (
599
- 'e' ,
600
- '' ,
601
- lambda stem : (
602
- self ._measure (stem ) == 1 and
603
- not self ._ends_cvc (stem )
604
- )
605
- )
606
- ])
590
+ # Note that Martin's test vocabulary and reference
591
+ # implementations are inconsistent in how they handle the case
592
+ # where two rules both refer to a suffix that matches the word
593
+ # to be stemmed, but only the condition of the second one is
594
+ # true.
595
+ # Earlier in step1b we had the rules:
596
+ # (m>0) EED -> EE
597
+ # (*v*) ED ->
598
+ # but the examples in the paper included "feed"->"feed", even
599
+ # though (*v*) is true for "fe" and therefore the second rule
600
+ # alone would map "feed"->"fe".
601
+ # However, in THIS case, we need to handle the consecutive rules
602
+ # differently and try both conditions (obviously; the second
603
+ # rule here would be redundant otherwise). Martin's paper makes
604
+ # no explicit mention of the inconsistency; you have to infer it
605
+ # from the examples.
606
+ # For this reason, we can't use _apply_rule_list here.
607
+ if word .endswith ('e' ):
608
+ stem = self ._replace_suffix (word , 'e' , '' )
609
+ if self ._measure (stem ) > 1 :
610
+ return stem
611
+ if self ._measure (stem ) == 1 and not self ._ends_cvc (stem ):
612
+ return stem
613
+ return word
607
614
608
615
def _step5b (self , word ):
609
616
"""Implements Step 5b from "An algorithm for suffix stripping"
@@ -616,13 +623,8 @@ def _step5b(self, word):
616
623
controll -> control
617
624
roll -> roll
618
625
"""
619
- # The rule is expressed in an overcomplicated way in Porter's
620
- # paper, but all it means it that double-l should become
621
- # single-l. It could've been written more straightforwardly as:
622
- #
623
- # (m > 1) LL -> L
624
- return self ._apply_first_possible_rule (word , [
625
- ('ll' , 'l' , lambda stem : self ._measure (stem ) > 1 )
626
+ return self ._apply_rule_list (word , [
627
+ ('ll' , 'l' , lambda stem : self ._measure (word [:- 1 ]) > 1 )
626
628
])
627
629
628
630
def stem (self , word ):
0 commit comments