-
Notifications
You must be signed in to change notification settings - Fork 0
/
floating_point.hpp
190 lines (163 loc) · 6.74 KB
/
floating_point.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
// This template class represents an IEEE floating-point number
// (either single-precision or double-precision, depending on the
// template parameters).
//
// The purpose of this class is to do more sophisticated number
// comparison. (Due to round-off error, etc, it's very unlikely that
// two floating-points will be equal exactly. Hence a naive
// comparison by the == operation often doesn't work.)
//
// Format of IEEE floating-point:
//
// The most-significant bit being the leftmost, an IEEE
// floating-point looks like
//
// sign_bit exponent_bits fraction_bits
//
// Here, sign_bit is a single bit that designates the sign of the
// number.
//
// For float, there are 8 exponent bits and 23 fraction bits.
//
// For double, there are 11 exponent bits and 52 fraction bits.
//
// More details can be found at
// http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
//
// Template parameter:
//
// RawType: the raw floating-point type (either float or double)
template <typename RawType>
class FloatingPoint {
public:
// Defines the unsigned integer type that has the same size as the
// floating point number.
typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
// Constants.
// # of bits in a number.
static const size_t kBitCount = 8*sizeof(RawType);
// # of fraction bits in a number.
static const size_t kFractionBitCount =
std::numeric_limits<RawType>::digits - 1;
// # of exponent bits in a number.
static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
// The mask for the sign bit.
static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
// The mask for the fraction bits.
static const Bits kFractionBitMask =
~static_cast<Bits>(0) >> (kExponentBitCount + 1);
// The mask for the exponent bits.
static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
// How many ULP's (Units in the Last Place) we want to tolerate when
// comparing two numbers. The larger the value, the more error we
// allow. A 0 value means that two numbers must be exactly the same
// to be considered equal.
//
// The maximum error of a single floating-point operation is 0.5
// units in the last place. On Intel CPU's, all floating-point
// calculations are done with 80-bit precision, while double has 64
// bits. Therefore, 4 should be enough for ordinary use.
//
// See the following article for more details on ULP:
// http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
static const size_t kMaxUlps = 4;
// Constructs a FloatingPoint from a raw floating-point number.
//
// On an Intel CPU, passing a non-normalized NAN (Not a Number)
// around may change its bits, although the new value is guaranteed
// to be also a NAN. Therefore, don't expect this constructor to
// preserve the bits in x when x is a NAN.
explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
// Static methods
// Reinterprets a bit pattern as a floating-point number.
//
// This function is needed to test the AlmostEquals() method.
static RawType ReinterpretBits(const Bits bits) {
FloatingPoint fp(0);
fp.u_.bits_ = bits;
return fp.u_.value_;
}
// Returns the floating-point number that represent positive infinity.
static RawType Infinity() {
return ReinterpretBits(kExponentBitMask);
}
// Returns the maximum representable finite floating-point number.
static RawType Max();
// Non-static methods
// Returns the bits that represents this number.
const Bits &bits() const { return u_.bits_; }
// Returns the exponent bits of this number.
Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
// Returns the fraction bits of this number.
Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
// Returns the sign bit of this number.
Bits sign_bit() const { return kSignBitMask & u_.bits_; }
// Returns true iff this is NAN (not a number).
bool is_nan() const {
// It's a NAN if the exponent bits are all ones and the fraction
// bits are not entirely zeros.
return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
}
// Returns true iff this number is at most kMaxUlps ULP's away from
// rhs. In particular, this function:
//
// - returns false if either number is (or both are) NAN.
// - treats really large numbers as almost equal to infinity.
// - thinks +0.0 and -0.0 are 0 DLP's apart.
bool AlmostEquals(const FloatingPoint& rhs) const {
// The IEEE standard says that any comparison operation involving
// a NAN must return false.
if (is_nan() || rhs.is_nan()) return false;
return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
<= kMaxUlps;
}
private:
// The data type used to store the actual floating-point number.
union FloatingPointUnion {
RawType value_; // The raw floating-point number.
Bits bits_; // The bits that represent the number.
};
// Converts an integer from the sign-and-magnitude representation to
// the biased representation. More precisely, let N be 2 to the
// power of (kBitCount - 1), an integer x is represented by the
// unsigned number x + N.
//
// For instance,
//
// -N + 1 (the most negative number representable using
// sign-and-magnitude) is represented by 1;
// 0 is represented by N; and
// N - 1 (the biggest number representable using
// sign-and-magnitude) is represented by 2N - 1.
//
// Read http://en.wikipedia.org/wiki/Signed_number_representations
// for more details on signed number representations.
static Bits SignAndMagnitudeToBiased(const Bits &sam) {
if (kSignBitMask & sam) {
// sam represents a negative number.
return ~sam + 1;
} else {
// sam represents a positive number.
return kSignBitMask | sam;
}
}
// Given two numbers in the sign-and-magnitude representation,
// returns the distance between them as an unsigned number.
static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
const Bits &sam2) {
const Bits biased1 = SignAndMagnitudeToBiased(sam1);
const Bits biased2 = SignAndMagnitudeToBiased(sam2);
return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
}
FloatingPointUnion u_;
};
// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
// macro defined by <windows.h>.
template <>
inline float FloatingPoint<float>::Max() { return FLT_MAX; }
template <>
inline double FloatingPoint<double>::Max() { return DBL_MAX; }
// Typedefs the instances of the FloatingPoint template class that we
// care to use.
typedef FloatingPoint<float> Float;
typedef FloatingPoint<double> Double;