[PorterStemmer] Remove stem_word from PorterStemmer. Breaks backwards compatibility!

ExplodingCabbage · ExplodingCabbage · commit 20005549f7d5 · 2016-01-10T18:41:17.000Z
Prior to this change, the public API of the PorterStemmer was a mess. NLTK's version was based off Vivake Gupta's implementation at http://tartarus.org/~martin/PorterStemmer/python.txt, endorsed by Martin himself at http://tartarus.org/~martin/PorterStemmer/. However, Gupta's implementation is a shoddy port of Martin Porter's own implementation in C, and has several vestigial quirks lying around. These include the claim that the stem() method takes a "char pointer" as an argument (no such thing in Python) and the need to pass in start and end indexes between which stem() should read the word from the given char array. At some point in nltk's history, during or prior to the 2006 commit that added porter.py to the current Git repository (nltk@edf4677), this was "solved" by renaming Vivake's stem() method to stem_word() and creating a wrapper for it called stem() that conformed to the StemmerI interface. This was completely pointless; the right thing to do would've been to remove the unnecessary parts of Vivake's stem() method and thereby achieve conformity to StemmerI. This commit does this, but at the cost of breaking backwards compatibility for anyone who was using stem_word(word) instead of stem(word); those people will need to adjust their application code when updating to the latest version of NLTK.
diff --git a/nltk/stem/porter.py b/nltk/stem/porter.py
@@ -529,29 +529,6 @@ def _step5(self, word):
 
         return word
 
-    def stem_word(self, p, i=0, j=None):
-        """
-        Returns the stem of p, or, if i and j are given, the stem of p[i:j+1].
-        """
-        ## --NLTK--
-        if j is None and i == 0:
-            word = p
-        else:
-            if j is None:
-                j = len(p) - 1
-            word = p[i:j+1]
-
-        if word in self.pool:
-            return self.pool[word]
-
-        word = self._step1ab(word)
-        word = self._step1c(word)
-        word = self._step2(word)
-        word = self._step3(word)
-        word = self._step4(word)
-        word = self._step5(word)
-        return word
-
     def _adjust_case(self, word, stem):
         lower = word.lower()
 
@@ -583,10 +560,27 @@ def _adjust_case(self, word, stem):
     #        ret = ret + separator
     #    return ret
 
-    ## --NLTK--
-    ## Define a stem() method that implements the StemmerI interface.
     def stem(self, word):
-        stem = self.stem_word(word.lower(), 0, len(word) - 1)
+        stem = word.lower()
+        
+        # --NLTK--
+        if word in self.pool:
+            return self.pool[word]
+
+        if len(word) <= 2:
+            return word # --DEPARTURE--
+        # With this line, strings of length 1 or 2 don't go through the
+        # stemming process, although no mention is made of this in the
+        # published algorithm. Remove the line to match the published
+        # algorithm.
+        
+        stem = self._step1ab(stem)
+        stem = self._step1c(stem)
+        stem = self._step2(stem)
+        stem = self._step3(stem)
+        stem = self._step4(stem)
+        stem = self._step5(stem)
+        
         return self._adjust_case(word, stem)
 
     def __repr__(self):