[PorterStemmer] Refactor step2 · ExplodingCabbage/nltk@10498f8 · GitHub

Commit 10498f8

[PorterStemmer] Refactor step2
1 parent ccd5525 commit 10498f8

File tree

1 file changed: +75 -89 lines changed


nltk/stem/porter.py

Lines changed: 75 additions & 89 deletions
@@ -469,96 +469,82 @@ def _step1c(self, word):
         return word
 
     def _step2(self, word):
-        """step2() maps double suffices to single ones.
-        so -ization ( = -ize plus -ation) maps to -ize etc. note that the
-        string before the suffix must give m() > 0.
+        """Implements Step 2 from "An algorithm for suffix stripping"
+
+        From the paper:
+
+        Step 2
+
+            (m>0) ATIONAL ->  ATE       relational     ->  relate
+            (m>0) TIONAL  ->  TION      conditional    ->  condition
+                                        rational       ->  rational
+            (m>0) ENCI    ->  ENCE      valenci        ->  valence
+            (m>0) ANCI    ->  ANCE      hesitanci      ->  hesitance
+            (m>0) IZER    ->  IZE       digitizer      ->  digitize
+            (m>0) ABLI    ->  ABLE      conformabli    ->  conformable
+            (m>0) ALLI    ->  AL        radicalli      ->  radical
+            (m>0) ENTLI   ->  ENT       differentli    ->  different
+            (m>0) ELI     ->  E         vileli         ->  vile
+            (m>0) OUSLI   ->  OUS       analogousli    ->  analogous
+            (m>0) IZATION ->  IZE       vietnamization ->  vietnamize
+            (m>0) ATION   ->  ATE       predication    ->  predicate
+            (m>0) ATOR    ->  ATE       operator       ->  operate
+            (m>0) ALISM   ->  AL        feudalism      ->  feudal
+            (m>0) IVENESS ->  IVE       decisiveness   ->  decisive
+            (m>0) FULNESS ->  FUL       hopefulness    ->  hopeful
+            (m>0) OUSNESS ->  OUS       callousness    ->  callous
+            (m>0) ALITI   ->  AL        formaliti      ->  formal
+            (m>0) IVITI   ->  IVE       sensitiviti    ->  sensitive
+            (m>0) BILITI  ->  BLE       sensibiliti    ->  sensible
         """
-        if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
-            return word
-
-        ch = word[-2]
-
-        if ch == 'a':
-            if word.endswith("ational"):
-                return word[:-7] + "ate" if self._m(word, len(word)-8) > 0 else word
-            elif word.endswith("tional"):
-                return word[:-2] if self._m(word, len(word)-7) > 0 else word
-            else:
-                return word
-        elif ch == 'c':
-            if word.endswith("enci"):
-                return word[:-4] + "ence" if self._m(word, len(word)-5) > 0 else word
-            elif word.endswith("anci"):
-                return word[:-4] + "ance" if self._m(word, len(word)-5) > 0 else word
-            else:
-                return word
-        elif ch == 'e':
-            if word.endswith("izer"):
-                return word[:-1] if self._m(word, len(word)-5) > 0 else word
-            else:
-                return word
-        elif ch == 'l':
-            if word.endswith("bli"):
-                return word[:-3] + "ble" if self._m(word, len(word)-4) > 0 else word # --DEPARTURE--
-            # To match the published algorithm, replace "bli" with "abli" and "ble" with "able"
-            elif word.endswith("alli"):
-                # --NEW--
-                if self._m(word, len(word)-5) > 0:
-                    word = word[:-2]
-                    return self._step2(word)
-                else:
-                    return word
-            elif word.endswith("fulli"):
-                return word[:-2] if self._m(word, len(word)-6) else word # --NEW--
-            elif word.endswith("entli"):
-                return word[:-2] if self._m(word, len(word)-6) else word
-            elif word.endswith("eli"):
-                return word[:-2] if self._m(word, len(word)-4) else word
-            elif word.endswith("ousli"):
-                return word[:-2] if self._m(word, len(word)-6) else word
-            else:
-                return word
-        elif ch == 'o':
-            if word.endswith("ization"):
-                return word[:-7] + "ize" if self._m(word, len(word)-8) else word
-            elif word.endswith("ation"):
-                return word[:-5] + "ate" if self._m(word, len(word)-6) else word
-            elif word.endswith("ator"):
-                return word[:-4] + "ate" if self._m(word, len(word)-5) else word
-            else:
-                return word
-        elif ch == 's':
-            if word.endswith("alism"):
-                return word[:-3] if self._m(word, len(word)-6) else word
-            elif word.endswith("ness"):
-                if word.endswith("iveness"):
-                    return word[:-4] if self._m(word, len(word)-8) else word
-                elif word.endswith("fulness"):
-                    return word[:-4] if self._m(word, len(word)-8) else word
-                elif word.endswith("ousness"):
-                    return word[:-4] if self._m(word, len(word)-8) else word
-                else:
-                    return word
-            else:
-                return word
-        elif ch == 't':
-            if word.endswith("aliti"):
-                return word[:-3] if self._m(word, len(word)-6) else word
-            elif word.endswith("iviti"):
-                return word[:-5] + "ive" if self._m(word, len(word)-6) else word
-            elif word.endswith("biliti"):
-                return word[:-6] + "ble" if self._m(word, len(word)-7) else word
-            else:
-                return word
-        elif ch == 'g': # --DEPARTURE--
-            if word.endswith("logi"):
-                return word[:-1] if self._m(word, len(word) - 4) else word # --NEW-- (Barry Wilkins)
-            # To match the published algorithm, pass len(word)-5 to _m instead of len(word)-4
-            else:
-                return word
-
-        else:
-            return word
+        positive_measure = lambda stem: self._measure(stem) > 0
+
+        # --NEW--
+        # Instead of applying the ALLI -> AL rule after 'bli', as the
+        # published algorithm does, we apply it first and, if it
+        # succeeds, run the result through step2 again.
+        try:
+            stem = self._replace_suffix_if(word, 'alli', 'al', positive_measure)
+            return self._step2(stem)
+        except _CannotReplaceSuffix:
+            pass
+
+        return self._apply_first_possible_rule(word, [
+            ('ational', 'ate', positive_measure),
+            ('tional', 'tion', positive_measure),
+            ('enci', 'ence', positive_measure),
+            ('anci', 'ance', positive_measure),
+            ('izer', 'ize', positive_measure),
+
+            # --DEPARTURE--
+            # To match the published algorithm, replace "bli" with
+            # "abli" and "ble" with "able"
+            ('bli', 'ble', positive_measure),
+
+            # -- NEW --
+            ('fulli', 'ful', positive_measure),
+
+            ('entli', 'ent', positive_measure),
+            ('eli', 'e', positive_measure),
+            ('ousli', 'ous', positive_measure),
+            ('ization', 'ize', positive_measure),
+            ('ation', 'ate', positive_measure),
+            ('ator', 'ate', positive_measure),
+            ('alism', 'al', positive_measure),
+            ('iveness', 'ive', positive_measure),
+            ('fulness', 'ful', positive_measure),
+            ('ousness', 'ous', positive_measure),
+            ('aliti', 'al', positive_measure),
+            ('iviti', 'ive', positive_measure),
+            ('biliti', 'ble', positive_measure),

+            # --DEPARTURE--
+            # To match the published algorithm, delete this phrase
+            # --NEW-- (Barry Wilkins)
+            # To match the published algorithm, replace lambda below
+            # with just positive_measure
+            ("logi", "log", lambda stem: positive_measure(word[:-3])),
+        ])
 
     def _step3(self, word):
         """step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""

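Every rule in both versions of this step is gated on Porter's measure m of the stem left once the suffix is removed (the (m>0) conditions in the docstring above); the measure is computed by _m in the old code and _measure in the new code, neither of which appears in this diff. The standalone sketch below only illustrates the definition from the paper, where a stem is viewed as [C](VC)^m[V]; it is not the commit's implementation.

    def _is_consonant(word, i):
        # Porter's definition: a consonant is any letter other than
        # a, e, i, o, u, and other than 'y' preceded by a consonant.
        if word[i] in "aeiou":
            return False
        if word[i] == "y":
            return i == 0 or not _is_consonant(word, i - 1)
        return True

    def measure(stem):
        # Viewing the stem as [C](VC)^m[V], m is the number of places
        # where a vowel is immediately followed by a consonant.
        return sum(
            1
            for i in range(len(stem) - 1)
            if not _is_consonant(stem, i) and _is_consonant(stem, i + 1)
        )

    # 'relational' ends in 'ational'; the stem 'rel' has m == 1, so the
    # (m>0) ATIONAL -> ATE rule fires and gives 'relate'.  'rational'
    # leaves only the stem 'r' (m == 0), so it is left unchanged.
    assert measure("rel") == 1
    assert measure("r") == 0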