@@ -469,96 +469,82 @@ def _step1c(self, word):
469
469
return word
470
470
471
471
def _step2 (self , word ):
472
- """step2() maps double suffices to single ones.
473
- so -ization ( = -ize plus -ation) maps to -ize etc. note that the
474
- string before the suffix must give m() > 0.
472
+ """Implements Step 2 from "An algorithm for suffix stripping"
473
+
474
+ From the paper:
475
+
476
+ Step 2
477
+
478
+ (m>0) ATIONAL -> ATE relational -> relate
479
+ (m>0) TIONAL -> TION conditional -> condition
480
+ rational -> rational
481
+ (m>0) ENCI -> ENCE valenci -> valence
482
+ (m>0) ANCI -> ANCE hesitanci -> hesitance
483
+ (m>0) IZER -> IZE digitizer -> digitize
484
+ (m>0) ABLI -> ABLE conformabli -> conformable
485
+ (m>0) ALLI -> AL radicalli -> radical
486
+ (m>0) ENTLI -> ENT differentli -> different
487
+ (m>0) ELI -> E vileli -> vile
488
+ (m>0) OUSLI -> OUS analogousli -> analogous
489
+ (m>0) IZATION -> IZE vietnamization -> vietnamize
490
+ (m>0) ATION -> ATE predication -> predicate
491
+ (m>0) ATOR -> ATE operator -> operate
492
+ (m>0) ALISM -> AL feudalism -> feudal
493
+ (m>0) IVENESS -> IVE decisiveness -> decisive
494
+ (m>0) FULNESS -> FUL hopefulness -> hopeful
495
+ (m>0) OUSNESS -> OUS callousness -> callous
496
+ (m>0) ALITI -> AL formaliti -> formal
497
+ (m>0) IVITI -> IVE sensitiviti -> sensitive
498
+ (m>0) BILITI -> BLE sensibiliti -> sensible
475
499
"""
476
- if len (word ) <= 1 : # Only possible at this stage given unusual inputs to stem_word like 'oed'
477
- return word
478
-
479
- ch = word [- 2 ]
480
-
481
- if ch == 'a' :
482
- if word .endswith ("ational" ):
483
- return word [:- 7 ] + "ate" if self ._m (word , len (word )- 8 ) > 0 else word
484
- elif word .endswith ("tional" ):
485
- return word [:- 2 ] if self ._m (word , len (word )- 7 ) > 0 else word
486
- else :
487
- return word
488
- elif ch == 'c' :
489
- if word .endswith ("enci" ):
490
- return word [:- 4 ] + "ence" if self ._m (word , len (word )- 5 ) > 0 else word
491
- elif word .endswith ("anci" ):
492
- return word [:- 4 ] + "ance" if self ._m (word , len (word )- 5 ) > 0 else word
493
- else :
494
- return word
495
- elif ch == 'e' :
496
- if word .endswith ("izer" ):
497
- return word [:- 1 ] if self ._m (word , len (word )- 5 ) > 0 else word
498
- else :
499
- return word
500
- elif ch == 'l' :
501
- if word .endswith ("bli" ):
502
- return word [:- 3 ] + "ble" if self ._m (word , len (word )- 4 ) > 0 else word # --DEPARTURE--
503
- # To match the published algorithm, replace "bli" with "abli" and "ble" with "able"
504
- elif word .endswith ("alli" ):
505
- # --NEW--
506
- if self ._m (word , len (word )- 5 ) > 0 :
507
- word = word [:- 2 ]
508
- return self ._step2 (word )
509
- else :
510
- return word
511
- elif word .endswith ("fulli" ):
512
- return word [:- 2 ] if self ._m (word , len (word )- 6 ) else word # --NEW--
513
- elif word .endswith ("entli" ):
514
- return word [:- 2 ] if self ._m (word , len (word )- 6 ) else word
515
- elif word .endswith ("eli" ):
516
- return word [:- 2 ] if self ._m (word , len (word )- 4 ) else word
517
- elif word .endswith ("ousli" ):
518
- return word [:- 2 ] if self ._m (word , len (word )- 6 ) else word
519
- else :
520
- return word
521
- elif ch == 'o' :
522
- if word .endswith ("ization" ):
523
- return word [:- 7 ] + "ize" if self ._m (word , len (word )- 8 ) else word
524
- elif word .endswith ("ation" ):
525
- return word [:- 5 ] + "ate" if self ._m (word , len (word )- 6 ) else word
526
- elif word .endswith ("ator" ):
527
- return word [:- 4 ] + "ate" if self ._m (word , len (word )- 5 ) else word
528
- else :
529
- return word
530
- elif ch == 's' :
531
- if word .endswith ("alism" ):
532
- return word [:- 3 ] if self ._m (word , len (word )- 6 ) else word
533
- elif word .endswith ("ness" ):
534
- if word .endswith ("iveness" ):
535
- return word [:- 4 ] if self ._m (word , len (word )- 8 ) else word
536
- elif word .endswith ("fulness" ):
537
- return word [:- 4 ] if self ._m (word , len (word )- 8 ) else word
538
- elif word .endswith ("ousness" ):
539
- return word [:- 4 ] if self ._m (word , len (word )- 8 ) else word
540
- else :
541
- return word
542
- else :
543
- return word
544
- elif ch == 't' :
545
- if word .endswith ("aliti" ):
546
- return word [:- 3 ] if self ._m (word , len (word )- 6 ) else word
547
- elif word .endswith ("iviti" ):
548
- return word [:- 5 ] + "ive" if self ._m (word , len (word )- 6 ) else word
549
- elif word .endswith ("biliti" ):
550
- return word [:- 6 ] + "ble" if self ._m (word , len (word )- 7 ) else word
551
- else :
552
- return word
553
- elif ch == 'g' : # --DEPARTURE--
554
- if word .endswith ("logi" ):
555
- return word [:- 1 ] if self ._m (word , len (word ) - 4 ) else word # --NEW-- (Barry Wilkins)
556
- # To match the published algorithm, pass len(word)-5 to _m instead of len(word)-4
557
- else :
558
- return word
559
-
560
- else :
561
- return word
500
+ positive_measure = lambda stem : self ._measure (stem ) > 0
501
+
502
+ # --NEW--
503
+ # Instead of applying the ALLI -> AL rule after 'bli' per the
504
+ # published algorithm, we apply it first and, if it
505
+ # succeeds, run the result through step2 again.
506
+ try :
507
+ stem = self ._replace_suffix_if (word , 'alli' , 'al' , positive_measure )
508
+ return self ._step2 (stem )
509
+ except _CannotReplaceSuffix :
510
+ pass
511
+
512
+ return self ._apply_first_possible_rule (word , [
513
+ ('ational' , 'ate' , positive_measure ),
514
+ ('tional' , 'tion' , positive_measure ),
515
+ ('enci' , 'ence' , positive_measure ),
516
+ ('anci' , 'ance' , positive_measure ),
517
+ ('izer' , 'ize' , positive_measure ),
518
+
519
+ # --DEPARTURE--
520
+ # To match the published algorithm, replace "bli" with
521
+ # "abli" and "ble" with "able"
522
+ ('bli' , 'ble' , positive_measure ),
523
+
524
+ # -- NEW --
525
+ ('fulli' , 'ful' , positive_measure ),
526
+
527
+ ('entli' , 'ent' , positive_measure ),
528
+ ('eli' , 'e' , positive_measure ),
529
+ ('ousli' , 'ous' , positive_measure ),
530
+ ('ization' , 'ize' , positive_measure ),
531
+ ('ation' , 'ate' , positive_measure ),
532
+ ('ator' , 'ate' , positive_measure ),
533
+ ('alism' , 'al' , positive_measure ),
534
+ ('iveness' , 'ive' , positive_measure ),
535
+ ('fulness' , 'ful' , positive_measure ),
536
+ ('ousness' , 'ous' , positive_measure ),
537
+ ('aliti' , 'al' , positive_measure ),
538
+ ('iviti' , 'ive' , positive_measure ),
539
+ ('biliti' , 'ble' , positive_measure ),
540
+
541
+ # --DEPARTURE--
542
+ # To match the published algorithm, delete this rule
543
+ # --NEW-- (Barry Wilkins)
544
+ # To match the published algorithm, replace lambda below
545
+ # with just positive_measure
546
+ ("logi" , "log" , lambda stem : positive_measure (word [:- 3 ])),
547
+ ])
562
548
563
549
def _step3 (self , word ):
564
550
"""step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
0 commit comments