  #
  # Copyright (C) 2001-2015 NLTK Project
  # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
- # Contributors: Dmitrijs Milajevs
+ # Contributors: Dmitrijs Milajevs, Liling Tan
  # URL: <http://nltk.org/>
  # For license information, see LICENSE.TXT
  """BLEU score implementation."""

  from __future__ import division

  import math
+ from fractions import Fraction
+ from collections import Counter

- from nltk.tokenize import word_tokenize
- from nltk.compat import Counter
  from nltk.util import ngrams

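The import changes above set up the corpus-level scoring added below: Counter now comes from the standard library's collections rather than nltk.compat, and fractions.Fraction lets _modified_precision() return an exact numerator/denominator pair instead of a plain float. A small standalone sketch (illustration only, not part of the patch) of why keeping the pair matters:

    from fractions import Fraction

    # Two hypothetical per-sentence unigram precisions.
    p1 = Fraction(5, 7)   # 5 matching unigrams out of 7
    p2 = Fraction(3, 11)  # 3 matching unigrams out of 11

    # Micro-average (what corpus_bleu does below): pool the numerators
    # and denominators across sentences, then divide once.
    micro = Fraction(p1.numerator + p2.numerator,
                     p1.denominator + p2.denominator)

    # Macro-average: average the per-sentence ratios instead.
    macro = (p1 + p2) / 2

    print(float(micro), float(macro))  # 0.444... vs. 0.493...
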
- def bleu(references, hypothesis, weights):
+ def sentence_bleu(references, hypothesis, weights=[0.25, 0.25, 0.25, 0.25]):
      """
      Calculate BLEU score (Bilingual Evaluation Understudy) from
      Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
      "BLEU: a method for automatic evaluation of machine translation."
      In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf

-
-     >>> weights = [0.25, 0.25, 0.25, 0.25]
      >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
      ...                'ensures', 'that', 'the', 'military', 'always',
      ...                'obeys', 'the', 'commands', 'of', 'the', 'party']

@@ -47,41 +45,148 @@ def bleu(references, hypothesis, weights):
      ...               'army', 'always', 'to', 'heed', 'the', 'directions',
      ...               'of', 'the', 'party']

-     >>> bleu([reference1, reference2, reference3], hypothesis1, weights)
+     >>> sentence_bleu([reference1, reference2, reference3], hypothesis1)
      0.5045666840058485

-     >>> bleu([reference1, reference2, reference3], hypothesis2, weights)
+     >>> sentence_bleu([reference1, reference2, reference3], hypothesis2)
      0

+     The default BLEU calculates a score for up to 4-grams using uniform
+     weights. To evaluate your translations with higher/lower order ngrams,
+     use customized weights. E.g. when accounting for up to 5-grams with
+     uniform weights:
+
+     >>> weights = [0.1666, 0.1666, 0.1666, 0.1666, 0.1666]
+     >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights)
+     0.45838627164939455
+
      :param references: reference sentences
      :type references: list(list(str))
      :param hypothesis: a hypothesis sentence
      :type hypothesis: list(str)
      :param weights: weights for unigrams, bigrams, trigrams and so on
      :type weights: list(float)
+     :return: The sentence-level BLEU score.
+     :rtype: float
      """
-     p_ns = (
-         _modified_precision(references, hypothesis, i)
-         for i, _ in enumerate(weights, start=1)
-     )
+     # Calculate the modified precision *p_n* for each order of ngram.
+     p_ns = []
+     for i, _ in enumerate(weights, start=1):
+         p_n = float(_modified_precision(references, hypothesis, i))
+         p_ns.append(p_n)

      try:
+         # Calculate the overall modified precision for all ngrams by
+         # taking the product of the weights and the respective *p_n*.
          s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns))
      except ValueError:
          # some p_ns is 0
          return 0

-     bp = _brevity_penalty(references, hypothesis)
+     # Calculate the brevity penalty.
+     # *hyp_len* is referred to as *c* in Papineni et al. (2002).
+     hyp_len = len(hypothesis)
+     # *closest_ref_len* is referred to as *r* in Papineni et al. (2002).
+     closest_ref_len = _closest_ref_length(references, hyp_len)
+     bp = _brevity_penalty(closest_ref_len, hyp_len)
      return bp * math.exp(s)

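Putting the pieces together, sentence_bleu() computes BLEU = BP * exp(sum of w_n * log(p_n)). A standalone sketch of just that combination step (the precision values below are made up for illustration; not part of the patch):

    import math

    weights = [0.25, 0.25, 0.25, 0.25]
    p_ns = [0.9444, 0.5882, 0.4375, 0.3333]  # made-up modified precisions
    bp = 1.0                                 # no brevity penalty applied

    score = bp * math.exp(math.fsum(w * math.log(p_n)
                                    for w, p_n in zip(weights, p_ns)))
    print(score)  # ~0.533 for these made-up numbers
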
+ def corpus_bleu(list_of_references, hypotheses, weights=[0.25, 0.25, 0.25, 0.25]):
+     """
+     Calculate a single corpus-level BLEU score (a.k.a. system-level BLEU) for
+     all the hypotheses and their respective references.
+
+     Instead of averaging the sentence-level BLEU scores (i.e. macro-average
+     precision), the original BLEU metric (Papineni et al. 2002) accounts for
+     the micro-average precision (i.e. summing the numerators and denominators
+     for each hypothesis-reference(s) pair before the division).
+
+     >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+     ...         'ensures', 'that', 'the', 'military', 'always',
+     ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
+     >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+     ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
+     ...          'heed', 'Party', 'commands']
+     >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+     ...          'guarantees', 'the', 'military', 'forces', 'always',
+     ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
+     >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+     ...          'army', 'always', 'to', 'heed', 'the', 'directions',
+     ...          'of', 'the', 'party']
+
+     >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+     ...         'interested', 'in', 'world', 'history']
+     >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+     ...          'because', 'he', 'read', 'the', 'book']
+
+     >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
+     >>> hypotheses = [hyp1, hyp2]
+     >>> corpus_bleu(list_of_references, hypotheses)
+     0.5520516129306314
+
+     The example below shows that corpus_bleu() is different from averaging
+     sentence_bleu() for hypotheses:
+
+     >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
+     >>> score2 = sentence_bleu([ref2a], hyp2)
+     >>> (score1 + score2) / 2
+     0.6223247442490669
+
+     :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+     :type list_of_references: list(list(list(str)))
+     :param hypotheses: a list of hypothesis sentences
+     :type hypotheses: list(list(str))
+     :param weights: weights for unigrams, bigrams, trigrams and so on
+     :type weights: list(float)
+     :return: The corpus-level BLEU score.
+     :rtype: float
+     """
+     p_numerators = Counter()    # Key = ngram order, value = no. of ngram matches.
+     p_denominators = Counter()  # Key = ngram order, value = no. of ngrams in hyp.
+     hyp_lengths, ref_lengths = 0, 0
+
+     assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+     # Iterate through each hypothesis and its corresponding references.
+     for references, hypothesis in zip(list_of_references, hypotheses):
+         # For each order of ngram, calculate the numerator and
+         # denominator for the corpus-level modified precision.
+         for i, _ in enumerate(weights, start=1):
+             p_i = _modified_precision(references, hypothesis, i)
+             p_numerators[i] += p_i.numerator
+             p_denominators[i] += p_i.denominator
+
+         # Calculate the hypothesis length and the closest reference length.
+         # Add them to the corpus-level hypothesis and reference counts.
+         hyp_len = len(hypothesis)
+         hyp_lengths += hyp_len
+         ref_lengths += _closest_ref_length(references, hyp_len)
+
+     # Calculate the corpus-level brevity penalty.
+     bp = _brevity_penalty(ref_lengths, hyp_lengths)
+
+     # Calculate the corpus-level modified precision.
+     p_n = []
+     for i, w in enumerate(weights, start=1):
+         pn = p_numerators[i] / p_denominators[i]
+         p_n.append(w * math.log(pn))
+
+     return bp * math.exp(math.fsum(p_n))
+
+
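A property worth noting: for a one-sentence corpus the pooled numerators and denominators reduce to the per-sentence counts, so corpus_bleu() and sentence_bleu() agree. (Unlike sentence_bleu(), though, corpus_bleu() has no try/except around math.log(), so a hypothesis with no higher-order ngram matches would raise a ValueError.) A hedged usage sketch; the import path is an assumption, since this module has lived under both nltk.align and nltk.translate depending on the NLTK version:

    from nltk.align.bleu_score import sentence_bleu, corpus_bleu  # assumed path

    hypothesis = 'the quick brown fox jumps over the lazy dog'.split()
    reference = 'the quick brown fox leaps over the lazy dog'.split()

    # One hypothesis with one reference list: micro- and macro-average
    # precisions coincide, so the two entry points return the same score.
    print(sentence_bleu([reference], hypothesis) ==
          corpus_bleu([[reference]], [hypothesis]))  # True
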
  def _modified_precision(references, hypothesis, n):
      """
      Calculate modified ngram precision.

      The normal precision method may lead to some wrong translations with
      high-precision, e.g., the translation, in which a word of reference
-     repeats several times, has very high precision.
+     repeats several times, has very high precision.
+
+     This function only returns the Fraction object that contains the numerator
+     and denominator necessary to calculate the corpus-level precision.
+     To calculate the modified precision for a single pair of hypothesis and
+     references, cast the Fraction object into a float.

      The famous "the the the ... " example shows that you can get BLEU precision
      by duplicating high frequency words.

@@ -90,7 +195,7 @@ def _modified_precision(references, hypothesis, n):
      >>> reference2 = 'there is a cat on the mat'.split()
      >>> hypothesis1 = 'the the the the the the the'.split()
      >>> references = [reference1, reference2]
-     >>> _modified_precision(references, hypothesis1, n=1)
+     >>> float(_modified_precision(references, hypothesis1, n=1))
      0.2857142857142857

      In the modified n-gram precision, a reference word will be considered

@@ -108,9 +213,9 @@ def _modified_precision(references, hypothesis, n):
      ...               'of', 'the', 'party']
      >>> hypothesis = 'of the'.split()
      >>> references = [reference1, reference2, reference3]
-     >>> _modified_precision(references, hypothesis, n=1)
+     >>> float(_modified_precision(references, hypothesis, n=1))
      1.0
-     >>> _modified_precision(references, hypothesis, n=2)
+     >>> float(_modified_precision(references, hypothesis, n=2))
      1.0

      An example of a normal machine translation hypothesis:

@@ -136,39 +241,64 @@ def _modified_precision(references, hypothesis, n):
      ...               'army', 'always', 'to', 'heed', 'the', 'directions',
      ...               'of', 'the', 'party']
      >>> references = [reference1, reference2, reference3]
-     >>> _modified_precision(references, hypothesis1, n=1)
+     >>> float(_modified_precision(references, hypothesis1, n=1))
      0.9444444444444444
-     >>> _modified_precision(references, hypothesis2, n=1)
+     >>> float(_modified_precision(references, hypothesis2, n=1))
      0.5714285714285714
-     >>> _modified_precision(references, hypothesis1, n=2)
+     >>> float(_modified_precision(references, hypothesis1, n=2))
      0.5882352941176471
-     >>> _modified_precision(references, hypothesis2, n=2)
+     >>> float(_modified_precision(references, hypothesis2, n=2))
      0.07692307692307693
-
+
+
      :param references: A list of reference translations.
      :type references: list(list(str))
      :param hypothesis: A hypothesis translation.
      :type hypothesis: list(str)
      :param n: The ngram order.
      :type n: int
+     :return: BLEU's modified precision for the nth order ngram.
+     :rtype: Fraction
      """
      counts = Counter(ngrams(hypothesis, n))

      if not counts:
-         return 0
+         return Fraction(0)

      max_counts = {}
      for reference in references:
          reference_counts = Counter(ngrams(reference, n))
          for ngram in counts:
              max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

-     clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())
-
-     return sum(clipped_counts.values()) / sum(counts.values())
+     clipped_counts = dict((ngram, min(count, max_counts[ngram]))
+                           for ngram, count in counts.items())
+
+     numerator = sum(clipped_counts.values())
+     denominator = sum(counts.values())
+
+     return Fraction(numerator, denominator)
+
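To make the clipping step concrete, here is a minimal self-contained sketch (illustration only, not part of the patch) of the unigram case of the "the the the" example, using the same Counter-and-clip pattern as the function above:

    from collections import Counter
    from fractions import Fraction

    hypothesis = 'the the the the the the the'.split()
    reference = 'the cat is on the mat'.split()

    counts = Counter(hypothesis)        # {'the': 7}
    max_counts = Counter(reference)     # 'the' occurs twice in the reference
    clipped = dict((w, min(c, max_counts[w])) for w, c in counts.items())

    precision = Fraction(sum(clipped.values()), sum(counts.values()))
    print(precision, float(precision))  # 2/7 0.2857142857142857
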
+ def _closest_ref_length(references, hyp_len):
+     """
+     This function finds the reference that is the closest length to the
+     hypothesis. The closest reference length is referred to as *r* in
+     the brevity penalty formula in Papineni et al. (2002).
+
+     :param references: A list of reference translations.
+     :type references: list(list(str))
+     :param hyp_len: The length of the hypothesis.
+     :type hyp_len: int
+     :return: The length of the reference that's closest to the hypothesis.
+     :rtype: int
+     """
+     ref_lens = (len(reference) for reference in references)
+     closest_ref_len = min(ref_lens, key=lambda ref_len:
+                           (abs(ref_len - hyp_len), ref_len))
+     return closest_ref_len

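The two-element sort key is the subtle part: min() compares tuples lexicographically, so equal distances fall through to the second element and the shorter reference wins the tie, which is what makes the reference-order doctests in _brevity_penalty() below pass. A standalone illustration (not part of the patch):

    hyp_len = 12
    ref_lens = [13, 11]  # both references are distance 1 from the hypothesis

    # Equal first elements (distances), so the smaller length wins.
    closest = min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len))
    print(closest)  # 11 -- the shorter reference breaks the tie
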
- def _brevity_penalty(references, hypothesis):
+ def _brevity_penalty(closest_ref_len, hyp_len):
      """
      Calculate brevity penalty.

@@ -184,15 +314,19 @@ def _brevity_penalty(references, hypothesis):
      >>> reference3 = list('aaaaaaaaaaaaaaaaa')    # i.e. ['a'] * 17
      >>> hypothesis = list('aaaaaaaaaaaa')         # i.e. ['a'] * 12
      >>> references = [reference1, reference2, reference3]
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      1.0

      In case a hypothesis translation is shorter than the references, penalty is
      applied.

      >>> references = [['a'] * 28, ['a'] * 28]
      >>> hypothesis = ['a'] * 12
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      0.2635971381157267

      The length of the closest reference is used to compute the penalty. If the

@@ -202,7 +336,9 @@ def _brevity_penalty(references, hypothesis):
      >>> references = [['a'] * 13, ['a'] * 2]
      >>> hypothesis = ['a'] * 12
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      0.9200444146293233

      The brevity penalty doesn't depend on reference order. More importantly,

@@ -211,34 +347,42 @@ def _brevity_penalty(references, hypothesis):
      >>> references = [['a'] * 13, ['a'] * 11]
      >>> hypothesis = ['a'] * 12
-     >>> bp1 = _brevity_penalty(references, hypothesis)
-     >>> bp2 = _brevity_penalty(reversed(references), hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> bp1 = _brevity_penalty(closest_ref_len, hyp_len)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(reversed(references), hyp_len)
+     >>> bp2 = _brevity_penalty(closest_ref_len, hyp_len)
      >>> bp1 == bp2 == 1
      True

      A test example from mteval-v13a.pl (starting from line 705):

      >>> references = [['a'] * 11, ['a'] * 8]
      >>> hypothesis = ['a'] * 7
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      0.8668778997501817

      >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
      >>> hypothesis = ['a'] * 7
-     >>> _brevity_penalty(references, hypothesis)
+     >>> hyp_len = len(hypothesis)
+     >>> closest_ref_len = _closest_ref_length(references, hyp_len)
+     >>> _brevity_penalty(closest_ref_len, hyp_len)
      1.0

-     :param references: A list of reference translations.
-     :type references: list(list(str))
-     :param hypothesis: A hypothesis translation.
-     :type hypothesis: list(str)
+     :param hyp_len: The length of the hypothesis for a single sentence OR the
+         sum of all the hypotheses' lengths for a corpus.
+     :type hyp_len: int
+     :param closest_ref_len: The length of the closest reference for a single
+         hypothesis OR the sum of all the closest references for every hypothesis.
+     :type closest_ref_len: int
+     :return: BLEU's brevity penalty.
+     :rtype: float
      """
-     c = len(hypothesis)
-     ref_lens = (len(reference) for reference in references)
-     r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))
-
-     if c > r:
+     if hyp_len > closest_ref_len:
          return 1
      else:
-         return math.exp(1 - r / c)
+         return math.exp(1 - closest_ref_len / hyp_len)

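The branch above is exactly the brevity penalty from Papineni et al. (2002): BP = 1 if c > r, else exp(1 - r/c), with c the hypothesis length and r the closest reference length. Plugging in the numbers from the second doctest reproduces its output (standalone sketch, not part of the patch):

    import math

    hyp_len, closest_ref_len = 12, 28  # the [['a'] * 28, ['a'] * 28] doctest

    bp = 1 if hyp_len > closest_ref_len else math.exp(1 - closest_ref_len / hyp_len)
    print(bp)  # 0.2635971381157267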