🐛 Fix rounding to nearest · srcarroll/flexfloat@188ae76 · GitHub
[go: up one dir, main page]

Skip to content

Commit 188ae76

Browse files
author
Stefan Mach
committed
🐛 Fix rounding to nearest
1 parent 28be2d4 commit 188ae76

File tree

4 files changed

+132
-43
lines changed

4 files changed

+132
-43
lines changed

src/flexfloat.c

Lines changed: 44 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,8 @@ uint_t flexfloat_get_bits(flexfloat_t *a)
105105

106106
#ifdef FLEXFLOAT_ROUNDING
107107

108-
// check if rounding to nearest is required (the most significant bit of the discarded ones is 1)
109-
bool flexfloat_nearest_rounding(const flexfloat_t *a, int_fast16_t exp)
108+
// get rounding bit from backend value (first bit after represented LSB)
109+
bool flexfloat_round_bit(const flexfloat_t *a, int_fast16_t exp)
110110
{
111111
if(exp <= 0 && EXPONENT(CAST_TO_INT(a->value)) != 0)
112112
{
@@ -122,27 +122,46 @@ bool flexfloat_nearest_rounding(const flexfloat_t *a, int_fast16_t exp)
122122
}
123123
}
124124

125-
// check if rounding to +inf/-inf is required (at least one bit of the discarded ones is 1)
126-
bool flexfloat_inf_rounding(const flexfloat_t *a, int_fast16_t exp, bool sign, bool plus)
125+
// get sticky bit from backend value (logical OR of all bits after represented LSB except the round bit)
126+
bool flexfloat_sticky_bit(const flexfloat_t *a, int_fast16_t exp)
127127
{
128-
if((plus && !sign) || (!plus && sign))
128+
if(exp <= 0 && EXPONENT(CAST_TO_INT(a->value)) != 0)
129129
{
130-
if(exp <= 0 && EXPONENT(CAST_TO_INT(a->value)) != 0)
130+
int shift = (- exp + 1);
131+
uint_t denorm = 0;
132+
if(shift < NUM_BITS)
133+
denorm = ((CAST_TO_INT(a->value) & MASK_FRAC) | MASK_FRAC_MSB) >> shift;
134+
return (denorm & (MASK_FRAC >> (a->desc.frac_bits + 1))) ||
135+
( ((denorm & MASK_FRAC) == 0) && (CAST_TO_INT(a->value)!=0) );
136+
}
137+
else
138+
{
139+
return CAST_TO_INT(a->value) & (MASK_FRAC >> (a->desc.frac_bits + 1));
140+
}
141+
}
142+
143+
// check if rounding to nearest requires rounding up: round bit set and either the sticky bit set (> ulp/2) or, on a tie, the mantissa LSB set (ties to even)
144+
bool flexfloat_nearest_rounding(const flexfloat_t *a, int_fast16_t exp)
145+
{
146+
if (flexfloat_round_bit(a, exp))
147+
if (flexfloat_sticky_bit(a, exp)) // > ulp/2 away
131148
{
132-
int shift = (- exp + 1);
133-
uint_t denorm = 0;
134-
if(shift < NUM_BITS)
135-
denorm = ( ((CAST_TO_INT(a->value) & MASK_FRAC)
136-
| MASK_FRAC_MSB)
137-
) >> shift;
138-
return (denorm & (MASK_FRAC >> (a->desc.frac_bits))) ||
139-
( ((denorm & MASK_FRAC) == 0) && (CAST_TO_INT(a->value)!=0) );
149+
return 1;
140150
}
141-
else
151+
else // = ulp/2 away, round towards even result, decided by LSB of mantissa
142152
{
143-
return CAST_TO_INT(a->value) & (MASK_FRAC >> (a->desc.frac_bits));
153+
if (exp <= 0) // denormal
154+
return flexfloat_denorm_frac(a, exp) & 0x1;
155+
return flexfloat_frac(a) & 0x1;
144156
}
145-
}
157+
return 0; // < ulp/2 away
158+
}
159+
160+
// check if rounding to +inf/-inf is required (at least one bit of the discarded ones is 1)
161+
bool flexfloat_inf_rounding(const flexfloat_t *a, int_fast16_t exp, bool sign, bool plus)
162+
{
163+
if (flexfloat_round_bit(a, exp) || flexfloat_sticky_bit(a, exp))
164+
return (plus ^ sign);
146165
return 0;
147166
}
148167

@@ -181,25 +200,9 @@ void flexfloat_sanitize(flexfloat_t *a)
181200
// Sign
182201
sign = flexfloat_sign(a);
183202

184-
// Denormalized backend value
185-
if(EXPONENT(CAST_TO_INT(a->value)) == 0)
186-
{
187-
// Set to the smallest normalized value
188-
if(a->desc.exp_bits < NUM_BITS_EXP)
189-
{
190-
191-
CAST_TO_INT(a->value) = (sign == 0? SMALLEST_NORM_POS:
192-
SMALLEST_NORM_NEG);
193-
}
194-
}
195-
196203
// Exponent
197204
exp = flexfloat_exp(a);
198205

199-
200-
// Exponent of NaN and Inf (target format)
201-
inf_exp = flexfloat_inf_exp(a->desc);
202-
203206
#ifdef FLEXFLOAT_ROUNDING
204207
// In these cases no rounding is needed
205208
if (!(exp == INF_EXP || a->desc.frac_bits == NUM_BITS_FRAC))
@@ -229,11 +232,17 @@ void flexfloat_sanitize(flexfloat_t *a)
229232
}
230233
#endif
231234

235+
// Exponent of NaN and Inf (target format)
236+
inf_exp = flexfloat_inf_exp(a->desc);
237+
232238
// Mantissa
233239
frac = flexfloat_frac(a);
234240

235-
if(EXPONENT(CAST_TO_INT(a->value)) == 0) // Denorm backend format
241+
if(EXPONENT(CAST_TO_INT(a->value)) == 0) // Denorm backend format - represented format also denormal
242+
{
243+
CAST_TO_INT(a->value) = flexfloat_denorm_pack(a->desc, sign, frac);
236244
return;
245+
}
237246

238247
if(exp <= 0) // Denormalized value in the target format (saved in normalized format in the backend value)
239248
{
@@ -635,4 +644,4 @@ void ff_print_stats() {
635644
}
636645
}
637646

638-
#endif /* FLEXFLOAT_STATS */
647+
#endif /* FLEXFLOAT_STATS */

test/downward_rounding.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,30 @@ TEST(FlexFloatDownwardRoundingTest, Denormal3Bits) {
5656
EXPECT_EQ("0-000-111", bitstring(ff_val));
5757
}
5858

59+
TEST(FlexFloatDownwardRoundingTest, DenormalQuarter) {
60+
fesetround(FE_DOWNWARD);
61+
const double val = 0.0703125; // 1/16 + 1/128
62+
flexfloat<3, 3> ff_val;
63+
ff_val = val;
64+
EXPECT_EQ("0-000-010", bitstring(ff_val));
65+
}
66+
67+
TEST(FlexFloatDownwardRoundingTest, DenormalHalfway) {
68+
fesetround(FE_DOWNWARD);
69+
const double val = 0.078125; // 1/16 + 1/64
70+
flexfloat<3, 3> ff_val;
71+
ff_val = val;
72+
EXPECT_EQ("0-000-010", bitstring(ff_val));
73+
}
74+
75+
TEST(FlexFloatDownwardRoundingTest, DenormalThreeQuarters) {
76+
fesetround(FE_DOWNWARD);
77+
const double val = 0.0859375; // 1/16 + 3/128
78+
flexfloat<3, 3> ff_val;
79+
ff_val = val;
80+
EXPECT_EQ("0-000-010", bitstring(ff_val));
81+
}
82+
5983
TEST(FlexFloatDownwardRoundingTest, DenormalSmallest) {
6084
fesetround(FE_DOWNWARD);
6185
const double val = 0.03125;
@@ -66,15 +90,15 @@ TEST(FlexFloatDownwardRoundingTest, DenormalSmallest) {
6690

6791
TEST(FlexFloatDownwardRoundingTest, LessThanDenormalSmallest1) {
6892
fesetround(FE_DOWNWARD);
69-
const double val = 0.015625;
93+
const double val = 0.0234375; // 1/32 - 1/128 = 3/128
7094
flexfloat<3, 3> ff_val;
7195
ff_val = val;
7296
EXPECT_EQ("0-000-000", bitstring(ff_val));
7397
}
7498

7599
TEST(FlexFloatDownwardRoundingTest, LessThanDenormalSmallest2) {
76100
fesetround(FE_DOWNWARD);
77-
const double val = 1.5625e-05;
101+
const double val = 0.015625; // 1/64
78102
flexfloat<3, 3> ff_val;
79103
ff_val = val;
80104
EXPECT_EQ("0-000-000", bitstring(ff_val));

test/nearest_rounding.cpp

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,25 +56,49 @@ TEST(FlexFloatNearestRoundingTest, Denormal3Bits) {
5656
EXPECT_EQ("0-000-111", bitstring(ff_val));
5757
}
5858

59+
TEST(FlexFloatNearestRoundingTest, DenormalQuarter) {
60+
fesetround(FE_TONEAREST);
61+
const double val = 0.0703125; // 1/16 + 1/128
62+
flexfloat<3, 3> ff_val;
63+
ff_val = val;
64+
EXPECT_EQ("0-000-010", bitstring(ff_val));
65+
}
66+
67+
TEST(FlexFloatNearestRoundingTest, DenormalHalfway) {
68+
fesetround(FE_TONEAREST);
69+
const double val = 0.078125; // 1/16 + 1/64
70+
flexfloat<3, 3> ff_val;
71+
ff_val = val;
72+
EXPECT_EQ("0-000-010", bitstring(ff_val));
73+
}
74+
75+
TEST(FlexFloatNearestRoundingTest, DenormalThreeQuarters) {
76+
fesetround(FE_TONEAREST);
77+
const double val = 0.0859375; // 1/16 + 3/128
78+
flexfloat<3, 3> ff_val;
79+
ff_val = val;
80+
EXPECT_EQ("0-000-011", bitstring(ff_val));
81+
}
82+
5983
TEST(FlexFloatNearestRoundingTest, DenormalSmallest) {
6084
fesetround(FE_TONEAREST);
61-
const double val = 0.03125;
85+
const double val = 0.03125; // 1/32
6286
flexfloat<3, 3> ff_val;
6387
ff_val = val;
6488
EXPECT_EQ("0-000-001", bitstring(ff_val));
6589
}
6690

6791
TEST(FlexFloatNearestRoundingTest, LessThanDenormalSmallest1) {
6892
fesetround(FE_TONEAREST);
69-
const double val = 0.015625;
93+
const double val = 0.0234375; // 1/32 - 1/128 = 3/128
7094
flexfloat<3, 3> ff_val;
7195
ff_val = val;
7296
EXPECT_EQ("0-000-001", bitstring(ff_val));
7397
}
7498

7599
TEST(FlexFloatNearestRoundingTest, LessThanDenormalSmallest2) {
76100
fesetround(FE_TONEAREST);
77-
const double val = 1.5625e-05;
101+
const double val = 0.015625; // 1/64
78102
flexfloat<3, 3> ff_val;
79103
ff_val = val;
80104
EXPECT_EQ("0-000-000", bitstring(ff_val));
@@ -133,7 +157,7 @@ TEST(FlexFloatNearestRoundingTest, DoubleToApproximate3) {
133157
const double val = 12.5;
134158
flexfloat<3, 3> ff_val;
135159
ff_val = val;
136-
EXPECT_EQ("0-110-101", bitstring(ff_val));
160+
EXPECT_EQ("0-110-100", bitstring(ff_val));
137161
}
138162

139163
TEST(FlexFloatNearestRoundingTest, DoubleNeg) {

test/upward_rounding.cpp

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,38 @@ TEST(FlexFloatUpwardRoundingTest, Denormal3Bits) {
5656
EXPECT_EQ("0-000-111", bitstring(ff_val));
5757
}
5858

59+
TEST(FlexFloatUpwardRoundingTest, DenormalEighth) {
60+
fesetround(FE_UPWARD);
61+
const double val = 0.06640625; // 1/16 + 1/256
62+
flexfloat<3, 3> ff_val;
63+
ff_val = val;
64+
EXPECT_EQ("0-000-011", bitstring(ff_val));
65+
}
66+
67+
TEST(FlexFloatUpwardRoundingTest, DenormalQuarter) {
68+
fesetround(FE_UPWARD);
69+
const double val = 0.0703125; // 1/16 + 1/128
70+
flexfloat<3, 3> ff_val;
71+
ff_val = val;
72+
EXPECT_EQ("0-000-011", bitstring(ff_val));
73+
}
74+
75+
TEST(FlexFloatUpwardRoundingTest, DenormalHalfway) {
76+
fesetround(FE_UPWARD);
77+
const double val = 0.078125; // 1/16 + 1/64
78+
flexfloat<3, 3> ff_val;
79+
ff_val = val;
80+
EXPECT_EQ("0-000-011", bitstring(ff_val));
81+
}
82+
83+
TEST(FlexFloatUpwardRoundingTest, DenormalThreeQuarters) {
84+
fesetround(FE_UPWARD);
85+
const double val = 0.0859375; // 1/16 + 3/128
86+
flexfloat<3, 3> ff_val;
87+
ff_val = val;
88+
EXPECT_EQ("0-000-011", bitstring(ff_val));
89+
}
90+
5991
TEST(FlexFloatUpwardRoundingTest, DenormalSmallest) {
6092
fesetround(FE_UPWARD);
6193
const double val = 0.03125;
@@ -66,15 +98,15 @@ TEST(FlexFloatUpwardRoundingTest, DenormalSmallest) {
6698

6799
TEST(FlexFloatUpwardRoundingTest, LessThanDenormalSmallest1) {
68100
fesetround(FE_UPWARD);
69-
const double val = 0.015625;
101+
const double val = 0.0234375; // 1/32 - 1/128 = 3/128
70102
flexfloat<3, 3> ff_val;
71103
ff_val = val;
72104
EXPECT_EQ("0-000-001", bitstring(ff_val));
73105
}
74106

75107
TEST(FlexFloatUpwardRoundingTest, LessThanDenormalSmallest2) {
76108
fesetround(FE_UPWARD);
77-
const double val = 1.5625e-05;
109+
const double val = 0.015625; // 1/64
78110
flexfloat<3, 3> ff_val;
79111
ff_val = val;
80112
EXPECT_EQ("0-000-001", bitstring(ff_val));

0 commit comments

Comments
 (0)
0