[PorterStemmer] Refactor step 3 · ExplodingCabbage/nltk@92d845f · GitHub
[go: up one dir, main page]

Skip to content

Commit 92d845f

Browse files
[PorterStemmer] Refactor step 3
1 parent 10498f8 commit 92d845f

File tree

1 file changed

+55
-58
lines changed

1 file changed

+55
-58
lines changed

nltk/stem/porter.py

Lines changed: 55 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,9 @@ def _measure(self, stem):
228228
# the number of 'VC' occurrences in Porter's reduced form in the
229229
# docstring above, which is in turn equivalent to `m`
230230
return cv_sequence.count('vc')
231+
232+
def _has_positive_measure(self, stem):
233+
return self._measure(stem) > 0
231234

232235
def _contains_vowel(self, stem):
233236
"""_contains_vowel(stem) is TRUE <=> stem contains a vowel"""
@@ -497,89 +500,83 @@ def _step2(self, word):
497500
(m>0) IVITI -> IVE sensitiviti -> sensitive
498501
(m>0) BILITI -> BLE sensibiliti -> sensible
499502
"""
500-
positive_measure = lambda stem: self._measure(stem) > 0
501-
503+
502504
# --NEW--
503505
# Instead of applying the ALLI -> AL rule after 'bli' per the
504506
# published algorithm, we apply it first and, if it
505507
# succeeds, run the result through step2 again.
506508
try:
507-
stem = self._replace_suffix_if(word, 'alli', 'al', positive_measure)
509+
stem = self._replace_suffix_if(
510+
word,
511+
'alli',
512+
'al',
513+
self._has_positive_measure
514+
)
508515
return self._step2(stem)
509516
except _CannotReplaceSuffix:
510517
pass
511518

512519
return self._apply_first_possible_rule(word, [
513-
('ational', 'ate', positive_measure),
514-
('tional', 'tion', positive_measure),
515-
('enci', 'ence', positive_measure),
516-
('anci', 'ance', positive_measure),
517-
('izer', 'ize', positive_measure),
520+
('ational', 'ate', self._has_positive_measure),
521+
('tional', 'tion', self._has_positive_measure),
522+
('enci', 'ence', self._has_positive_measure),
523+
('anci', 'ance', self._has_positive_measure),
524+
('izer', 'ize', self._has_positive_measure),
518525

519526
# --DEPARTURE--
520527
# To match the published algorithm, replace "bli" with
521528
# "abli" and "ble" with "able"
522-
('bli', 'ble', positive_measure),
529+
('bli', 'ble', self._has_positive_measure),
523530

524531
# -- NEW --
525-
('fulli', 'ful', positive_measure),
532+
('fulli', 'ful', self._has_positive_measure),
526533

527-
('entli', 'ent', positive_measure),
528-
('eli', 'e', positive_measure),
529-
('ousli', 'ous', positive_measure),
530-
('ization', 'ize', positive_measure),
531-
('ation', 'ate', positive_measure),
532-
('ator', 'ate', positive_measure),
533-
('alism', 'al', positive_measure),
534-
('iveness', 'ive', positive_measure),
535-
('fulness', 'ful', positive_measure),
536-
('ousness', 'ous', positive_measure),
537-
('aliti', 'al', positive_measure),
538-
('iviti', 'ive', positive_measure),
539-
('biliti', 'ble', positive_measure),
534+
('entli', 'ent', self._has_positive_measure),
535+
('eli', 'e', self._has_positive_measure),
536+
('ousli', 'ous', self._has_positive_measure),
537+
('ization', 'ize', self._has_positive_measure),
538+
('ation', 'ate', self._has_positive_measure),
539+
('ator', 'ate', self._has_positive_measure),
540+
('alism', 'al', self._has_positive_measure),
541+
('iveness', 'ive', self._has_positive_measure),
542+
('fulness', 'ful', self._has_positive_measure),
543+
('ousness', 'ous', self._has_positive_measure),
544+
('aliti', 'al', self._has_positive_measure),
545+
('iviti', 'ive', self._has_positive_measure),
546+
('biliti', 'ble', self._has_positive_measure),
540547

541548
# --DEPARTURE--
542549
# To match the published algorithm, delete this phrase
543550
# --NEW-- (Barry Wilkins)
544551
# To match the published algorithm, replace lambda below
545-
# with just positive_measure
546-
("logi", "log", lambda stem: positive_measure(word[:-3])),
552+
# with just self._has_positive_measure
553+
("logi", "log", lambda stem: self._has_positive_measure(word[:-3])),
547554
])
548555

549556
def _step3(self, word):
550-
"""step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
551-
552-
ch = word[-1]
553-
554-
if ch == 'e':
555-
if word.endswith("icate"):
556-
return word[:-3] if self._m(word, len(word)-6) else word
557-
elif word.endswith("ative"):
558-
return word[:-5] if self._m(word, len(word)-6) else word
559-
elif word.endswith("alize"):
560-
return word[:-3] if self._m(word, len(word)-6) else word
561-
else:
562-
return word
563-
elif ch == 'i':
564-
if word.endswith("iciti"):
565-
return word[:-3] if self._m(word, len(word)-6) else word
566-
else:
567-
return word
568-
elif ch == 'l':
569-
if word.endswith("ical"):
570-
return word[:-2] if self._m(word, len(word)-5) else word
571-
elif word.endswith("ful"):
572-
return word[:-3] if self._m(word, len(word)-4) else word
573-
else:
574-
return word
575-
elif ch == 's':
576-
if word.endswith("ness"):
577-
return word[:-4] if self._m(word, len(word)-5) else word
578-
else:
579-
return word
580-
581-
else:
582-
return word
557+
"""Implements Step 3 from "An algorithm for suffix stripping"
558+
559+
From the paper:
560+
561+
Step 3
562+
563+
(m>0) ICATE -> IC triplicate -> triplic
564+
(m>0) ATIVE -> formative -> form
565+
(m>0) ALIZE -> AL formalize -> formal
566+
(m>0) ICITI -> IC electriciti -> electric
567+
(m>0) ICAL -> IC electrical -> electric
568+
(m>0) FUL -> hopeful -> hope
569+
(m>0) NESS -> goodness -> good
570+
"""
571+
return self._apply_first_possible_rule(word, [
572+
('icate', 'ic', self._has_positive_measure),
573+
('ative', '', self._has_positive_measure),
574+
('alize', 'al', self._has_positive_measure),
575+
('iciti', 'ic', self._has_positive_measure),
576+
('ical', 'ic', self._has_positive_measure),
577+
('ful', '', self._has_positive_measure),
578+
('ness', '', self._has_positive_measure),
579+
])
583580

584581
def _step4(self, word):
585582
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""

0 commit comments

Comments
 (0)
0