  #
  # Copyright (C) 2001-2015 NLTK Project
  # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
- # Contributors: Dmitrijs Milajevs
+ # Contributors: Dmitrijs Milajevs, Liling Tan
  # URL: <http://nltk.org/>
  # For license information, see LICENSE.TXT
  """BLEU score implementation."""

  from __future__ import division

  import math
+ from fractions import Fraction
+ from collections import Counter

- from nltk.tokenize import word_tokenize
- from nltk.compat import Counter
  from nltk.util import ngrams

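The import changes above set up the corpus-level scoring added below: Counter now comes from the standard library's collections rather than nltk.compat, and fractions.Fraction lets _modified_precision() return an exact numerator/denominator pair instead of a plain float. A small standalone sketch (illustration only, not part of the patch) of why keeping the pair matters:

    from fractions import Fraction

    # Two hypothetical per-sentence unigram precisions.
    p1 = Fraction(5, 7)   # 5 matching unigrams out of 7
    p2 = Fraction(3, 11)  # 3 matching unigrams out of 11

    # Micro-average (what corpus_bleu does below): pool the numerators
    # and denominators across sentences, then divide once.
    micro = Fraction(p1.numerator + p2.numerator,
                     p1.denominator + p2.denominator)

    # Macro-average: average the per-sentence ratios instead.
    macro = (p1 + p2) / 2

    print(float(micro), float(macro))  # 0.444... vs. 0.493...
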
- def bleu(references, hypothesis, weights):
+ def sentence_bleu(references, hypothesis, weights=[0.25, 0.25, 0.25, 0.25]):
      """
      Calculate BLEU score (Bilingual Evaluation Understudy) from
      Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
      "BLEU: a method for automatic evaluation of machine translation."
      In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf

-
-     >>> weights = [0.25, 0.25, 0.25, 0.25]
      >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
      ...                'ensures', 'that', 'the', 'military', 'always',
      ...                'obeys', 'the', 'commands', 'of', 'the', 'party']

@@ -47,41 +45,148 @@ def bleu(references, hypothesis, weights):
      ...               'army', 'always', 'to', 'heed', 'the', 'directions',
      ...               'of', 'the', 'party']

-     >>> bleu([reference1, reference2, reference3], hypothesis1, weights)
+     >>> sentence_bleu([reference1, reference2, reference3], hypothesis1)
      0.5045666840058485

-     >>> bleu([reference1, reference2, reference3], hypothesis2, weights)
+     >>> sentence_bleu([reference1, reference2, reference3], hypothesis2)
      0

+     The default BLEU calculates a score for up to 4-grams using uniform
+     weights. To evaluate your translations with higher/lower order ngrams,
+     use customized weights. E.g. when accounting for up to 5-grams with
+     uniform weights:
+
+     >>> weights = [0.1666, 0.1666, 0.1666, 0.1666, 0.1666]
+     >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights)
+     0.45838627164939455
+
      :param references: reference sentences
      :type references: list(list(str))
      :param hypothesis: a hypothesis sentence
      :type hypothesis: list(str)
      :param weights: weights for unigrams, bigrams, trigrams and so on
      :type weights: list(float)
+     :return: The sentence-level BLEU score.
+     :rtype: float
      """
-     p_ns = (
-         _modified_precision(references, hypothesis, i)
-         for i, _ in enumerate(weights, start=1)
-     )
+     # Calculate the modified precision *p_n* for each order of ngram.
+     p_ns = []
+     for i, _ in enumerate(weights, start=1):
+         p_n = float(_modified_precision(references, hypothesis, i))
+         p_ns.append(p_n)

      try:
+         # Calculate the overall modified precision for all ngrams by
+         # taking the product of the weights and the respective *p_n*.
          s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns))
      except ValueError:
          # some p_ns is 0
          return 0

-     bp = _brevity_penalty(references, hypothesis)
+     # Calculate the brevity penalty.
+     # *hyp_len* is referred to as *c* in Papineni et al. (2002).
+     hyp_len = len(hypothesis)
+     # *closest_ref_len* is referred to as *r* in Papineni et al. (2002).
+     closest_ref_len = _closest_ref_length(references, hyp_len)
+     bp = _brevity_penalty(closest_ref_len, hyp_len)
      return bp * math.exp(s)

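Putting the pieces together, sentence_bleu() computes BLEU = BP * exp(sum of w_n * log(p_n)). A standalone sketch of just that combination step (the precision values below are made up for illustration; not part of the patch):

    import math

    weights = [0.25, 0.25, 0.25, 0.25]
    p_ns = [0.9444, 0.5882, 0.4375, 0.3333]  # made-up modified precisions
    bp = 1.0                                 # no brevity penalty applied

    score = bp * math.exp(math.fsum(w * math.log(p_n)
                                    for w, p_n in zip(weights, p_ns)))
    print(score)  # ~0.533 for these made-up numbers
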
+ def corpus_bleu(list_of_references, hypotheses, weights=[0.25, 0.25, 0.25, 0.25]):
+     """
+     Calculate a single corpus-level BLEU score (a.k.a. system-level BLEU) for
+     all the hypotheses and their respective references.
+
+     Instead of averaging the sentence-level BLEU scores (i.e. macro-average
+     precision), the original BLEU metric (Papineni et al. 2002) accounts for
+     the micro-average precision (i.e. summing the numerators and denominators
+     for each hypothesis-reference(s) pair before the division).
+
+     >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+     ...         'ensures', 'that', 'the', 'military', 'always',
+     ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
+     >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+     ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
+     ...          'heed', 'Party', 'commands']
+     >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+     ...          'guarantees', 'the', 'military', 'forces', 'always',
+     ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
+     >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+     ...          'army', 'always', 'to', 'heed', 'the', 'directions',
+     ...          'of', 'the', 'party']
+
+     >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+     ...         'interested', 'in', 'world', 'history']
+     >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+     ...          'because', 'he', 'read', 'the', 'book']
+
+     >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
+     >>> hypotheses = [hyp1, hyp2]
+     >>> corpus_bleu(list_of_references, hypotheses)
+     0.5520516129306314
+
+     The example below shows that corpus_bleu() is different from averaging
+     sentence_bleu() for hypotheses:
+
+     >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
+     >>> score2 = sentence_bleu([ref2a], hyp2)
+     >>> (score1 + score2) / 2
+     0.6223247442490669
+
+     :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+     :type list_of_references: list(list(list(str)))
+     :param hypotheses: a list of hypothesis sentences
+     :type hypotheses: list(list(str))
+     :param weights: weights for unigrams, bigrams, trigrams and so on
+     :type weights: list(float)
+     :return: The corpus-level BLEU score.
+     :rtype: float
+     """
+     p_numerators = Counter()    # Key = ngram order, value = no. of ngram matches.
+     p_denominators = Counter()  # Key = ngram order, value = no. of ngrams in hyp.
+     hyp_lengths, ref_lengths = 0, 0
+
+     assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+     # Iterate through each hypothesis and its corresponding references.
+     for references, hypothesis in zip(list_of_references, hypotheses):
+         # For each order of ngram, calculate the numerator and
+         # denominator for the corpus-level modified precision.
+         for i, _ in enumerate(weights, start=1):
+             p_i = _modified_precision(references, hypothesis, i)
+             p_numerators[i] += p_i.numerator
+             p_denominators[i] += p_i.denominator
+
+         # Calculate the hypothesis length and the closest reference length.
+         # Add them to the corpus-level hypothesis and reference counts.
+         hyp_len = len(hypothesis)
+         hyp_lengths += hyp_len
+         ref_lengths += _closest_ref_length(references, hyp_len)
+
+     # Calculate the corpus-level brevity penalty.
+     bp = _brevity_penalty(ref_lengths, hyp_lengths)
+
+     # Calculate the corpus-level modified precision.
+     p_n = []
+     for i, w in enumerate(weights, start=1):
+         pn = p_numerators[i] / p_denominators[i]
+         p_n.append(w * math.log(pn))
+
+     return bp * math.exp(math.fsum(p_n))
+
+
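A property worth noting: for a one-sentence corpus the pooled numerators and denominators reduce to the per-sentence counts, so corpus_bleu() and sentence_bleu() agree. (Unlike sentence_bleu(), though, corpus_bleu() has no try/except around math.log(), so a hypothesis with no higher-order ngram matches would raise a ValueError.) A hedged usage sketch; the import path is an assumption, since this module has lived under both nltk.align and nltk.translate depending on the NLTK version:

    from nltk.align.bleu_score import sentence_bleu, corpus_bleu  # assumed path

    hypothesis = 'the quick brown fox jumps over the lazy dog'.split()
    reference = 'the quick brown fox leaps over the lazy dog'.split()

    # One hypothesis with one reference list: micro- and macro-average
    # precisions coincide, so the two entry points return the same score.
    print(sentence_bleu([reference], hypothesis) ==
          corpus_bleu([[reference]], [hypothesis]))  # True
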
  def _modified_precision(references, hypothesis, n):
      """
      Calculate modified ngram precision.

      The normal precision method may lead to some wrong translations with
      high-precision, e.g., the translation, in which a word of reference
-     repeats several times, has very high precision.
+     repeats several times, has very high precision.
+
+     This function only returns the Fraction object that contains the numerator
+     and denominator necessary to calculate the corpus-level precision.
+     To calculate the modified precision for a single pair of hypothesis and
+     references, cast the Fraction object into a float.

      The famous "the the the ... " example shows that you can get BLEU precision
      by duplicating high frequency words.

@@ -90,7 +195,7 @@ def _modified_precision(references, hypothesis, n):
      >>> reference2 = 'there is a cat on the mat'.split()
      >>> hypothesis1 = 'the the the the the the the'.split()
      >>> references = [reference1, reference2]
-     >>> _modified_precision(references, hypothesis1, n=1)
+     >>> float(_modified_precision(references, hypothesis1, n=1))
      0.2857142857142857

      In the modified n-gram precision, a reference word will be considered

@@ -108,9 +213,9 @@ def _modified_precision(references, hypothesis, n):
      ...               'of', 'the', 'party']
      >>> hypothesis = 'of the'.split()
      >>> references = [reference1, reference2, reference3]
-     >>> _modified_precision(references, hypothesis, n=1)
+     >>> float(_modified_precision(references, hypothesis, n=1))
      1.0
-     >>> _modified_precision(references, hypothesis, n=2)
+     >>> float(_modified_precision(references, hypothesis, n=2))
      1.0

      An example of a normal machine translation hypothesis:

@@ -136,39 +241,64 @@ def _modified_precision(references, hypothesis, n):
      ...               'army', 'always', 'to', 'heed', 'the', 'directions',
      ...               'of', 'the', 'party']
      >>> references = [reference1, reference2, reference3]
-     >>> _modified_precision(references, hypothesis1, n=1)
+     >>> float(_modified_precision(references, hypothesis1, n=1))
      0.9444444444444444
-     >>> _modified_precision(references, hypothesis2, n=1)
+     >>> float(_modified_precision(references, hypothesis2, n=1))
      0.5714285714285714
-     >>> _modified_precision(references, hypothesis1, n=2)
+     >>> float(_modified_precision(references, hypothesis1, n=2))
      0.5882352941176471
-     >>> _modified_precision(references, hypothesis2, n=2)
+     >>> float(_modified_precision(references, hypothesis2, n=2))
      0.07692307692307693
-
+
+
      :param references: A list of reference translations.
      :type references: list(list(str))
      :param hypothesis: A hypothesis translation.
      :type hypothesis: list(str)
      :param n: The ngram order.
      :type n: int
+     :return: BLEU's modified precision for the nth order ngram.
+     :rtype: Fraction
      """
      counts = Counter(ngrams(hypothesis, n))

      if not counts:
-         return 0
+         return Fraction(0)

      max_counts = {}
      for reference in references:
          reference_counts = Counter(ngrams(reference, n))
          for ngram in counts:
              max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

-     clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())
-
-     return sum(clipped_counts.values()) / sum(counts.values())
+     clipped_counts = dict((ngram, min(count, max_counts[ngram]))
+                           for ngram, count in counts.items())
+
+     numerator = sum(clipped_counts.values())
+     denominator = sum(counts.values())
+
+     return Fraction(numerator, denominator)
+
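To make the clipping step concrete, here is a minimal self-contained sketch (illustration only, not part of the patch) of the unigram case of the "the the the" example, using the same Counter-and-clip pattern as the function above:

    from collections import Counter
    from fractions import Fraction

    hypothesis = 'the the the the the the the'.split()
    reference = 'the cat is on the mat'.split()

    counts = Counter(hypothesis)        # {'the': 7}
    max_counts = Counter(reference)     # 'the' occurs twice in the reference
    clipped = dict((w, min(c, max_counts[w])) for w, c in counts.items())

    precision = Fraction(sum(clipped.values()), sum(counts.values()))
    print(precision, float(precision))  # 2/7 0.2857142857142857
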
+ def _closest_ref_length(references, hyp_len):
+     """
+     This function finds the reference that is the closest length to the
+     hypothesis. The closest reference length is referred to as *r* in
+     the brevity penalty formula in Papineni et al. (2002).
+
+     :param references: A list of reference translations.
+     :type references: list(list(str))
+     :param hyp_len: The length of the hypothesis.
+     :type hyp_len: int
+     :return: The length of the reference that's closest to the hypothesis.
+     :rtype: int
+     """
+     ref_lens = (len(reference) for reference in references)
+     closest_ref_len = min(ref_lens, key=lambda ref_len:
+                           (abs(ref_len - hyp_len), ref_len))
+     return closest_ref_len

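The two-element sort key is the subtle part: min() compares tuples lexicographically, so equal distances fall through to the second element and the shorter reference wins the tie, which is what makes the reference-order doctests in _brevity_penalty() below pass. A standalone illustration (not part of the patch):

    hyp_len = 12
    ref_lens = [13, 11]  # both references are distance 1 from the hypothesis

    # Equal first elements (distances), so the smaller length wins.
    closest = min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len))
    print(closest)  # 11 -- the shorter reference breaks the tie
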
- def _brevity_penalty(references, hypothesis):
+ def _brevity_penalty(closest_ref_len, hyp_len):
      """
      Calculate brevity penalty.

@@ -184,15 +314,19 @@ def _brevity_penalty(references, hypothesis):
      >>> reference3 = list('aaaaaaaaaaaaaaaaa')    # i.e. ['a'] * 17
      >>> hypothesis = list('aaaaaaaaaaaa')         # i.e. ['a'] * 12
      >>> references = [reference1, reference2, reference3]
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      1.0

      In case a hypothesis translation is shorter than the references, penalty is
      applied.

      >>> references = [['a'] * 28, ['a'] * 28]
      >>> hypothesis = ['a'] * 12
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      0.2635971381157267

      The length of the closest reference is used to compute the penalty. If the

@@ -202,7 +336,9 @@ def _brevity_penalty(references, hypothesis):
      >>> references = [['a'] * 13, ['a'] * 2]
      >>> hypothesis = ['a'] * 12
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      0.9200444146293233

      The brevity penalty doesn't depend on reference order. More importantly,

@@ -211,34 +347,42 @@ def _brevity_penalty(references, hypothesis):
      >>> references = [['a'] * 13, ['a'] * 11]
      >>> hypothesis = ['a'] * 12
-     >>> bp1 = _brevity_penalty(references, hypothesis)
-     >>> bp2 = _brevity_penalty(reversed(references), hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> bp1 = _brevity_penalty(closest_ref_len, hyp_len)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(reversed(references), hyp_len)
+     >>> bp2 = _brevity_penalty(closest_ref_len, hyp_len)
      >>> bp1 == bp2 == 1
      True

      A test example from mteval-v13a.pl (starting from line 705):

      >>> references = [['a'] * 11, ['a'] * 8]
      >>> hypothesis = ['a'] * 7
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      0.8668778997501817

      >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
      >>> hypothesis = ['a'] * 7
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      1.0

-     :param references: A list of reference translations.
-     :type references: list(list(str))
-     :param hypothesis: A hypothesis translation.
-     :type hypothesis: list(str)
+     :param hyp_len: The length of the hypothesis for a single sentence OR the
+         sum of all the hypotheses' lengths for a corpus.
+     :type hyp_len: int
+     :param closest_ref_len: The length of the closest reference for a single
+         hypothesis OR the sum of all the closest references for every hypothesis.
+     :type closest_ref_len: int
+     :return: BLEU's brevity penalty.
+     :rtype: float
      """
-     c = len(hypothesis)
-     ref_lens = (len(reference) for reference in references)
-     r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))
-
-     if c > r:
+     if hyp_len > closest_ref_len:
          return 1
      else:
-         return math.exp(1 - r / c)
+         return math.exp(1 - closest_ref_len / hyp_len)

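The branch above is exactly the brevity penalty from Papineni et al. (2002): BP = 1 if c > r, else exp(1 - r/c), with c the hypothesis length and r the closest reference length. Plugging in the numbers from the second doctest reproduces its output (standalone sketch, not part of the patch):

    import math

    hyp_len, closest_ref_len = 12, 28  # the [['a'] * 28, ['a'] * 28] doctest

    bp = 1 if hyp_len > closest_ref_len else math.exp(1 - closest_ref_len / hyp_len)
    print(bp)  # 0.2635971381157267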