SingleSource/Regression/C/uint64_to_float.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

#include <stdio.h>
#include <stdint.h>
#include <fenv.h>
#include <float.h>

// tests uint64_t --> float conversions.  Not an exhaustive test, but sufficent
// to identify all reasonable bugs in such routines that I have yet encountered.
//
// Specifically, we walk up to four bits through all possible bit positions.
// This suffices to catch double-rounding errors from pretty much every
// "reasonable" algorithm one might pick to implement this conversion.  (It
// will miss lots of errors in "unreasonable" algorithms, but we trust that
// the code under test at least passes a sniff test).
//
// We test in all four basic rounding modes, to further flush out any
// double-rounding issues, or behavior at zero.

typedef union
{
    uint32_t u;
    float f;
} float_bits;


float
floatundisf(uint64_t a)
{
    if (a == 0)
        return 0.0F;
    const unsigned N = sizeof(uint64_t) * 8;
    int sd = N - __builtin_clzll(a);  /* number of significant digits */
    int e = sd - 1;             /* 8 exponent */
    if (sd > FLT_MANT_DIG)
    {
        /*  start:  0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
         *  finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
         *                                                12345678901234567890123456
         *  1 = msb 1 bit
         *  P = bit FLT_MANT_DIG-1 bits to the right of 1
         *  Q = bit FLT_MANT_DIG bits to the right of 1
         *  R = "or" of all bits to the right of Q
         */
        switch (sd)
        {
        case FLT_MANT_DIG + 1:
            a <<= 1;
            break;
        case FLT_MANT_DIG + 2:
            break;
        default:
            a = (a >> (sd - (FLT_MANT_DIG+2))) |
                ((a & ((uint64_t)(-1) >> ((N + FLT_MANT_DIG+2) - sd))) != 0);
        };
        /* finish: */
        a |= (a & 4) != 0;  /* Or P into R */
        ++a;  /* round - this step may add a significant bit */
        a >>= 2;  /* dump Q and R */
        /* a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits */
        if (a & ((uint64_t)1 << FLT_MANT_DIG))
        {
            a >>= 1;
            ++e;
        }
        /* a is now rounded to FLT_MANT_DIG bits */
    }
    else
    {
        a <<= (FLT_MANT_DIG - sd);
        /* a is now rounded to FLT_MANT_DIG bits */
    }
    float_bits fb;
    fb.u = ((e + 127) << 23)       |  /* exponent */
           ((int32_t)a & 0x007FFFFF);  /* mantissa */
    return fb.f;
}

void test(uint64_t x) {
	const float_bits expected = { .f = floatundisf(x) };
	const float_bits observed = { .f = x };
    
	if (expected.u != observed.u) {
		printf("Error detected @ 0x%016llx\n", x);
		printf("\tExpected result: %a (0x%08x)\n", expected.f, expected.u);
		printf("\tObserved result: %a (0x%08x)\n", observed.f, observed.u);
	}
}

int main(int argc, char *argv[]) {
  int i, j, k, l, m;
	const uint64_t one = 1;
	const uint64_t mone = -1;
    
    // FIXME: Other rounding modes are temporarily disabled until we have
    // a canonical source to compare against.
    const int roundingModes[4] = { FE_TONEAREST };
    const char *modeNames[4] = {"to nearest", "down", "up", "towards zero"};
    
    for ( m=0; m<4; ++m) {
        fesetround(roundingModes[0]);
        printf("Testing uint64_t --> float conversions in round %s...\n",
               modeNames[m]);
    
        test(0);
	for ( i=0; i<64; ++i) {
		test( one << i);
		test(mone << i); 
	for ( j=0; j<i; ++j) {
		test(( one << i) + ( one << j));
		test(( one << i) + (mone << j));
		test((mone << i) + ( one << j));
		test((mone << i) + (mone << j));
	for ( k=0; k<j; ++k) {
		test(( one << i) + ( one << j) + ( one << k));
		test(( one << i) + ( one << j) + (mone << k));
		test(( one << i) + (mone << j) + ( one << k));
		test(( one << i) + (mone << j) + (mone << k));
		test((mone << i) + ( one << j) + ( one << k));
		test((mone << i) + ( one << j) + (mone << k));
		test((mone << i) + (mone << j) + ( one << k));
		test((mone << i) + (mone << j) + (mone << k));
	for ( l=0; l<k; ++l) {
		test(( one << i) + ( one << j) + ( one << k) + ( one << l));
		test(( one << i) + ( one << j) + ( one << k) + (mone << l));
		test(( one << i) + ( one << j) + (mone << k) + ( one << l));
		test(( one << i) + ( one << j) + (mone << k) + (mone << l));
		test(( one << i) + (mone << j) + ( one << k) + ( one << l));
		test(( one << i) + (mone << j) + ( one << k) + (mone << l));
		test(( one << i) + (mone << j) + (mone << k) + ( one << l));
		test(( one << i) + (mone << j) + (mone << k) + (mone << l));
		test((mone << i) + ( one << j) + ( one << k) + ( one << l));
		test((mone << i) + ( one << j) + ( one << k) + (mone << l));
		test((mone << i) + ( one << j) + (mone << k) + ( one << l));
		test((mone << i) + ( one << j) + (mone << k) + (mone << l));
		test((mone << i) + (mone << j) + ( one << k) + ( one << l));
		test((mone << i) + (mone << j) + ( one << k) + (mone << l));
		test((mone << i) + (mone << j) + (mone << k) + ( one << l));
		test((mone << i) + (mone << j) + (mone << k) + (mone << l));
	}
	}
	}
	}
    }
	printf("Finished Testing.\n");
  return 0;
}