8000 Add tests; get them passing in MARTIN mode · ExplodingCabbage/nltk@625a9cf · GitHub
[go: up one dir, main page]

Skip to content

Commit 625a9cf

Browse files
Add tests; get them passing in MARTIN mode
1 parent 558eebe commit 625a9cf

File tree

4 files changed

+47189
-99
lines changed

4 files changed

+47189
-99
lines changed

nltk/stem/porter.py

Lines changed: 101 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,6 @@
2727
from nltk.stem.api import StemmerI
2828
from nltk.compat import python_2_unicode_compatible
2929

30-
class _CannotReplaceSuffix(Exception):
31-
pass
32-
3330
@python_2_unicode_compatible
3431
class PorterStemmer(StemmerI):
3532
"""
@@ -185,6 +182,16 @@ def _contains_vowel(self, stem):
185182
if not self._cons(stem, i):
186183
return True
187184
return False
185+
186+
def _ends_double_consonant(self, word):
    """Implements condition *d from the paper

    Returns True if word ends with a double consonant.

    The length guard prevents an IndexError for zero- or one-letter
    words, where ``word[-1]`` / ``word[-2]`` would be out of range.
    """
    return (
        len(word) >= 2 and
        word[-1] == word[-2] and
        self._cons(word, len(word) - 1)
    )
188195

189196
def _ends_cvc(self, word):
190197
"""Implements condition *o from the paper
@@ -210,24 +217,12 @@ def _ends_cvc(self, word):
210217
def _replace_suffix(self, word, suffix, replacement):
211218
"""Replaces `suffix` of `word` with `replacement"""
212219
assert word.endswith(suffix), "Given word doesn't end with given suffix"
213-
return word[:-len(suffix)] + replacement
214-
215-
def _replace_suffix_if(self, word, suffix, replacement, condition):
216-
"""If `condition`, replace suffix with replacement, else raise
217-
218-
`condition` should be a lambda that takes the word and stem as
219-
arguments and returns True or False.
220-
"""
221-
if not word.endswith(suffix):
222-
raise _CannotReplaceSuffix("word does not end with suffix")
220+
if suffix == '':
221+
return word + replacement
223222
else:
224-
stem = self._replace_suffix(word, suffix, replacement)
225-
if condition is None or condition(stem):
226-
return stem
227-
else:
228-
raise _CannotReplaceSuffix("condition not met")
223+
return word[:-len(suffix)] + replacement
229224

230-
def _apply_rule_list(self, word, rules):
    """Applies the first applicable suffix-removal rule to the word

    Each rule is a (suffix, replacement, condition) triple, where
    condition is a callable that takes the stem (the word minus the
    suffix) and returns a bool, or None for an unconditional rule.
    The special suffix '*d' matches any double consonant at the end
    of the word.  The first rule whose suffix matches is decisive:
    the rewritten word is returned if its condition holds, otherwise
    the word is returned unchanged and no later rule is tried.
    """
    for suffix, replacement, condition in rules:
        if suffix == '*d' and self._ends_double_consonant(word):
            stem = word[:-2]
        elif word.endswith(suffix):
            stem = self._replace_suffix(word, suffix, '')
        else:
            # This rule's suffix doesn't match; try the next one.
            continue
        if condition is None or condition(stem):
            return stem + replacement
        # A matching suffix whose condition fails stops the search.
        return word

    return word
246252

@@ -255,18 +261,14 @@ def _step1a(self, word):
255261
SS -> SS caress -> caress
256262
S -> cats -> cat
257263
"""
258-
return self._apply_first_possible_rule(word, [
259-
('sses', 'ss', None), # SSES -> SS
260-
261-
# this NLTK-only rule extends the original algorithm, so
262-
# that 'flies'->'fli' but 'dies'->'die' etc
263-
(
264-
'ies',
265-
'ie',
266-
lambda stem: (self.mode == self.NLTK_EXTENSIONS and
267-
len(word) == 4)
268-
),
264+
# this NLTK-only rule extends the original algorithm, so
265+
# that 'flies'->'fli' but 'dies'->'die' etc
266+
if self.mode == self.NLTK_EXTENSIONS:
267+
if word.endswith('ies') and len(word) == 4:
268+
return self._replace_suffix(word, 'ies', 'ie')
269269

270+
return self._apply_rule_list(word, [
271+
('sses', 'ss', None), # SSES -> SS
270272
('ies', 'i', None), # IES -> I
271273
('ss', 'ss', None), # SS -> SS
272274
('s', '', None), # S ->
@@ -308,47 +310,42 @@ def _step1b(self, word):
308310
# this NLTK-only block extends the original algorithm, so that
309311
# 'spied'->'spi' but 'died'->'die' etc
310312
if self.mode == self.NLTK_EXTENSIONS:
311-
try:
312-
return self._replace_suffix_if(
313-
word, 'ied', 'ie', lambda stem: len(word) == 4
314-
)
315-
except _CannotReplaceSuffix:
316-
pass
317-
318-
try:
319-
# (m>0) EED -> EE
320-
return self._replace_suffix_if(
321-
word, 'eed', 'ee', lambda stem: self._measure(stem) > 0
322-
)
323-
except _CannotReplaceSuffix:
324-
pass
313+
if word.endswith('ied'):
314+
if len(word) == 4:
315+
return self._replace_suffix(word, 'ied', 'ie')
316+
else:
317+
return self._replace_suffix(word, 'ied', 'i')
318+
319+
# (m>0) EED -> EE
320+
if word.endswith('eed'):
321+
stem = self._replace_suffix(word, 'eed', '')
322+
if self._measure(stem) > 0:
323+
return stem + 'ee'
324+
else:
325+
return word
325326

326327
rule_2_or_3_succeeded = False
327-
for rule in [
328-
('ed', '', self._contains_vowel), # (*v*) ED ->
329-
('ing', '', self._contains_vowel), # (*v*) ING ->
330-
]:
331-
try:
332-
intermediate_stem = self._replace_suffix_if(word, *rule)
333-
rule_2_or_3_succeeded = True
334-
break
335-
except _CannotReplaceSuffix:
336-
pass
328+
329+
for suffix in ['ed', 'ing']:
330+
if word.endswith(suffix):
331+
intermediate_stem = self._replace_suffix(word, suffix, '')
332+
if self._contains_vowel(intermediate_stem):
333+
rule_2_or_3_succeeded = True
334+
break
337335

338336
if not rule_2_or_3_succeeded:
339337
return word
340-
341-
final_letter = intermediate_stem[-1]
342-
return self._apply_first_possible_rule(intermediate_stem, [
338+
339+
return self._apply_rule_list(intermediate_stem, [
343340
('at', 'ate', None), # AT -> ATE
344341
('bl', 'ble', None), # BL -> BLE
345342
('iz', 'ize', None), # IZ -> IZE
346343
# (*d and not (*L or *S or *Z))
347344
# -> single letter
348345
(
349-
final_letter*2,
350-
final_letter,
351-
lambda stem: final_letter not in ('l', 's', 'z')
346+
'*d',
347+
intermediate_stem[-1],
348+
lambda stem: intermediate_stem[-1] not in ('l', 's', 'z')
352349
),
353350
# (m=1 and *o) -> E
354351
(
@@ -393,16 +390,14 @@ def nltk_condition(stem):
393390
def original_condition(stem):
394391
return self._contains_vowel(stem)
395392

396-
try:
397-
return self._replace_suffix_if(
398-
word,
393+
return self._apply_rule_list(word, [
394+
(
399395
'y',
400396
'i',
401397
nltk_condition if self.mode == self.NLTK_EXTENSIONS
402398
else original_condition
403399
)
404-
except _CannotReplaceSuffix:
405-
return word
400+
])
406401

407402
def _step2(self, word):
408403
"""Implements Step 2 from "An algorithm for suffix stripping"
@@ -438,16 +433,15 @@ def _step2(self, word):
438433
# Instead of applying the ALLI -> AL rule after '(a)bli' per
439434
# the published algorithm, instead we apply it first, and,
440435
# if it succeeds, run the result through step2 again.
441-
try:
442-
stem = self._replace_suffix_if(
443-
word,
444-
'alli',
445-
'al',
446-
self._has_positive_measure
436+
if (
437+
word.endswith('alli') and
438+
self._has_positive_measure(
439+
self._replace_suffix(word, 'alli', '')
440+
)
441+
):
442+
return self._step2(
443+
self._replace_suffix(word, 'alli', 'al')
447444
)
448-
return self._step2(stem)
449-
except _CannotReplaceSuffix:
450-
pass
451445

452446
bli_rule = ('bli', 'ble', self._has_positive_measure)
453447
abli_rule = ('abli', 'able', self._has_positive_measure)
@@ -496,7 +490,7 @@ def _step2(self, word):
496490
("logi", "log", self._has_positive_measure)
497491
)
498492

499-
return self._apply_first_possible_rule(word, rules)
493+
return self._apply_rule_list(word, rules)
500494

501495
def _step3(self, word):
502496
"""Implements Step 3 from "An algorithm for suffix stripping"
@@ -513,7 +507,7 @@ def _step3(self, word):
513507
(m>0) FUL -> hopeful -> hope
514508
(m>0) NESS -> goodness -> good
515509
"""
516-
return self._apply_first_possible_rule(word, [
510+
return self._apply_rule_list(word, [
517511
('icate', 'ic', self._has_positive_measure),
518512
('ative', '', self._has_positive_measure),
519513
('alize', 'al', self._has_positive_measure),
@@ -553,7 +547,7 @@ def _step4(self, word):
553547
"""
554548
measure_gt_1 = lambda stem: self._measure(stem) > 1
555549

556-
return self._apply_first_possible_rule(word, [
550+
return self._apply_rule_list(word, [
557551
('al', '', measure_gt_1),
558552
('ance', '', measure_gt_1),
559553
('ence', '', measure_gt_1),
@@ -593,17 +587,30 @@ def _step5a(self, word):
593587
rate -> rate
594588
(m=1 and not *o) E -> cease -> ceas
595589
"""
596-
return self._apply_first_possible_rule(word, [
597-
('e', '', lambda stem: self._measure(stem) > 1),
598-
(
599-
'e',
600-
'',
601-
lambda stem: (
602-
self._measure(stem) == 1 and
603-
not self._ends_cvc(stem)
604-
)
605-
)
606-
])
590+
# Note that Martin's test vocabulary and reference
591+
# implementations are inconsistent in how they handle the case
592+
# where two rules both refer to a suffix that matches the word
593+
# to be stemmed, but only the condition of the second one is
594+
# true.
595+
# Earlier in step2b we had the rules:
596+
# (m>0) EED -> EE
597+
# (*v*) ED ->
598+
# but the examples in the paper included "feed"->"feed", even
599+
# though (*v*) is true for "fe" and therefore the second rule
600+
# alone would map "feed"->"fe".
601+
# However, in THIS case, we need to handle the consecutive rules
602+
# differently and try both conditions (obviously; the second
603+
# rule here would be redundant otherwise). Martin's paper makes
604+
# no explicit mention of the inconsistency; you have to infer it
605+
# from the examples.
606+
# For this reason, we can't use _apply_rule_list here.
607+
if word.endswith('e'):
608+
stem = self._replace_suffix(word, 'e', '')
609+
if self._measure(stem) > 1:
610+
return stem
611+
if self._measure(stem) == 1 and not self._ends_cvc(stem):
612+
return stem
613+
return word
607614

608615
def _step5b(self, word):
609616
"""Implements Step 5a from "An algorithm for suffix stripping"
@@ -616,13 +623,8 @@ def _step5b(self, word):
616623
controll -> control
617624
roll -> roll
618625
"""
619-
# The rule is expressed in an overcomplicated way in Porter's
620-
# paper, but all it means it that double-l should become
621-
# single-l. It could've been written more straightforwardly as:
622-
#
623-
# (m > 1) LL -> L
624-
return self._apply_first_possible_rule(word, [
625-
('ll', 'l', lambda stem: self._measure(stem) > 1)
626+
return self._apply_rule_list(word, [
627+
('ll', 'l', lambda stem: self._measure(word[:-1]) > 1)
626628
])
627629

628630
def stem(self, word):

0 commit comments

Comments
 (0)
0