parent
031eb13341
commit
3420d691f5
6 changed files with 870 additions and 75 deletions
@ -0,0 +1,443 @@ |
|||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// OpenGL Mathematics Copyright (c) 2005 - 2014 G-Truc Creation (www.g-truc.net)
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Created : 2014-10-27
|
||||||
|
// Updated : 2014-10-27
|
||||||
|
// Licence : This source is under MIT licence
|
||||||
|
// File : test/core/func_integer_find_lsb.cpp
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// This has the programs for computing the number of leading zeros
|
||||||
|
// in a word.
|
||||||
|
// Max line length is 57, to fit in hacker.book.
|
||||||
|
// Compile with g++, not gcc.
|
||||||
|
#include <cstdio> |
||||||
|
#include <cstdlib> // To define "exit", req'd by XLC. |
||||||
|
#include <ctime> // To define "exit", req'd by XLC. |
||||||
|
|
||||||
|
#define LE 1 // 1 for little-endian, 0 for big-endian.
|
||||||
|
|
||||||
|
int pop(unsigned x) { |
||||||
|
x = x - ((x >> 1) & 0x55555555); |
||||||
|
x = (x & 0x33333333) + ((x >> 2) & 0x33333333); |
||||||
|
x = (x + (x >> 4)) & 0x0F0F0F0F; |
||||||
|
x = x + (x << 8); |
||||||
|
x = x + (x << 16); |
||||||
|
return x >> 24; |
||||||
|
} |
||||||
|
|
||||||
|
int nlz1(unsigned x) { |
||||||
|
int n; |
||||||
|
|
||||||
|
if (x == 0) return(32); |
||||||
|
n = 0; |
||||||
|
if (x <= 0x0000FFFF) {n = n +16; x = x <<16;} |
||||||
|
if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;} |
||||||
|
if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;} |
||||||
|
if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;} |
||||||
|
if (x <= 0x7FFFFFFF) {n = n + 1;} |
||||||
|
return n; |
||||||
|
} |
||||||
|
|
||||||
|
int nlz1a(unsigned x) { |
||||||
|
int n; |
||||||
|
|
||||||
|
/* if (x == 0) return(32); */ |
||||||
|
if ((int)x <= 0) return (~x >> 26) & 32; |
||||||
|
n = 1; |
||||||
|
if ((x >> 16) == 0) {n = n +16; x = x <<16;} |
||||||
|
if ((x >> 24) == 0) {n = n + 8; x = x << 8;} |
||||||
|
if ((x >> 28) == 0) {n = n + 4; x = x << 4;} |
||||||
|
if ((x >> 30) == 0) {n = n + 2; x = x << 2;} |
||||||
|
n = n - (x >> 31); |
||||||
|
return n; |
||||||
|
} |
||||||
|
// On basic Risc, 12 to 20 instructions.
|
||||||
|
|
||||||
|
int nlz2(unsigned x) { |
||||||
|
unsigned y; |
||||||
|
int n; |
||||||
|
|
||||||
|
n = 32; |
||||||
|
y = x >>16; if (y != 0) {n = n -16; x = y;} |
||||||
|
y = x >> 8; if (y != 0) {n = n - 8; x = y;} |
||||||
|
y = x >> 4; if (y != 0) {n = n - 4; x = y;} |
||||||
|
y = x >> 2; if (y != 0) {n = n - 2; x = y;} |
||||||
|
y = x >> 1; if (y != 0) return n - 2; |
||||||
|
return n - x; |
||||||
|
} |
||||||
|
|
||||||
|
// As above but coded as a loop for compactness:
|
||||||
|
// 23 to 33 basic Risc instructions.
|
||||||
|
int nlz2a(unsigned x) { |
||||||
|
unsigned y; |
||||||
|
int n, c; |
||||||
|
|
||||||
|
n = 32; |
||||||
|
c = 16; |
||||||
|
do { |
||||||
|
y = x >> c; if (y != 0) {n = n - c; x = y;} |
||||||
|
c = c >> 1; |
||||||
|
} while (c != 0); |
||||||
|
return n - x; |
||||||
|
} |
||||||
|
|
||||||
|
int nlz3(int x) { |
||||||
|
int y, n; |
||||||
|
|
||||||
|
n = 0; |
||||||
|
y = x; |
||||||
|
L: if (x < 0) return n; |
||||||
|
if (y == 0) return 32 - n; |
||||||
|
n = n + 1; |
||||||
|
x = x << 1; |
||||||
|
y = y >> 1; |
||||||
|
goto L; |
||||||
|
} |
||||||
|
|
||||||
|
int nlz4(unsigned x) { |
||||||
|
int y, m, n; |
||||||
|
|
||||||
|
y = -(x >> 16); // If left half of x is 0,
|
||||||
|
m = (y >> 16) & 16; // set n = 16. If left half
|
||||||
|
n = 16 - m; // is nonzero, set n = 0 and
|
||||||
|
x = x >> m; // shift x right 16.
|
||||||
|
// Now x is of the form 0000xxxx.
|
||||||
|
y = x - 0x100; // If positions 8-15 are 0,
|
||||||
|
m = (y >> 16) & 8; // add 8 to n and shift x left 8.
|
||||||
|
n = n + m; |
||||||
|
x = x << m; |
||||||
|
|
||||||
|
y = x - 0x1000; // If positions 12-15 are 0,
|
||||||
|
m = (y >> 16) & 4; // add 4 to n and shift x left 4.
|
||||||
|
n = n + m; |
||||||
|
x = x << m; |
||||||
|
|
||||||
|
y = x - 0x4000; // If positions 14-15 are 0,
|
||||||
|
m = (y >> 16) & 2; // add 2 to n and shift x left 2.
|
||||||
|
n = n + m; |
||||||
|
x = x << m; |
||||||
|
|
||||||
|
y = x >> 14; // Set y = 0, 1, 2, or 3.
|
||||||
|
m = y & ~(y >> 1); // Set m = 0, 1, 2, or 2 resp.
|
||||||
|
return n + 2 - m; |
||||||
|
} |
||||||
|
|
||||||
|
int nlz5(unsigned x) { |
||||||
|
int pop(unsigned x); |
||||||
|
|
||||||
|
x = x | (x >> 1); |
||||||
|
x = x | (x >> 2); |
||||||
|
x = x | (x >> 4); |
||||||
|
x = x | (x >> 8); |
||||||
|
x = x | (x >>16); |
||||||
|
return pop(~x); |
||||||
|
} |
||||||
|
|
||||||
|
/* The four programs below are not valid ANSI C programs. This is
|
||||||
|
because they refer to the same storage locations as two different types. |
||||||
|
However, they work with xlc/AIX, gcc/AIX, and gcc/NT. If you try to |
||||||
|
code them more compactly by declaring a variable xx to be "double," and |
||||||
|
then using |
||||||
|
|
||||||
|
n = 1054 - (*((unsigned *)&xx + LE) >> 20); |
||||||
|
|
||||||
|
then you are violating not only the rule above, but also the ANSI C |
||||||
|
rule that pointer arithmetic can be performed only on pointers to |
||||||
|
array elements. |
||||||
|
When coded with the above statement, the program fails with xlc, |
||||||
|
gcc/AIX, and gcc/NT, at some optimization levels. |
||||||
|
BTW, these programs use the "anonymous union" feature of C++, not |
||||||
|
available in C. */ |
||||||
|
|
||||||
|
int nlz6(unsigned k) { |
||||||
|
union { |
||||||
|
unsigned asInt[2]; |
||||||
|
double asDouble; |
||||||
|
}; |
||||||
|
int n; |
||||||
|
|
||||||
|
asDouble = (double)k + 0.5; |
||||||
|
n = 1054 - (asInt[LE] >> 20); |
||||||
|
return n; |
||||||
|
} |
||||||
|
|
||||||
|
int nlz7(unsigned k) { |
||||||
|
union { |
||||||
|
unsigned asInt[2]; |
||||||
|
double asDouble; |
||||||
|
}; |
||||||
|
int n; |
||||||
|
|
||||||
|
asDouble = (double)k; |
||||||
|
n = 1054 - (asInt[LE] >> 20); |
||||||
|
n = (n & 31) + (n >> 9); |
||||||
|
return n; |
||||||
|
} |
||||||
|
|
||||||
|
/* In single precision, round-to-nearest mode, the basic method fails for:
|
||||||
|
k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF, |
||||||
|
07FFFFFC <= k <= 07FFFFFF, |
||||||
|
0FFFFFF8 <= k <= 0FFFFFFF, |
||||||
|
... |
||||||
|
7FFFFFC0 <= k <= 7FFFFFFF. |
||||||
|
FFFFFF80 <= k <= FFFFFFFF. |
||||||
|
For k = 0 it gives 158, and for the other values it is too low by 1. */ |
||||||
|
|
||||||
|
int nlz8(unsigned k) { |
||||||
|
union { |
||||||
|
unsigned asInt; |
||||||
|
float asFloat; |
||||||
|
}; |
||||||
|
int n; |
||||||
|
|
||||||
|
k = k & ~(k >> 1); /* Fix problem with rounding. */ |
||||||
|
asFloat = (float)k + 0.5f; |
||||||
|
n = 158 - (asInt >> 23); |
||||||
|
return n; |
||||||
|
} |
||||||
|
|
||||||
|
/* The example below shows how to make a macro for nlz. It uses an
|
||||||
|
extension to the C and C++ languages that is provided by the GNU C/C++ |
||||||
|
compiler, namely, that of allowing statements and declarations in |
||||||
|
expressions (see "Using and Porting GNU CC", by Richard M. Stallman |
||||||
|
(1998). The underscores are necessary to protect against the |
||||||
|
possibility that the macro argument will conflict with one of its local |
||||||
|
variables, e.g., NLZ(k). */ |
||||||
|
|
||||||
|
int nlz9(unsigned k) { |
||||||
|
union { |
||||||
|
unsigned asInt; |
||||||
|
float asFloat; |
||||||
|
}; |
||||||
|
int n; |
||||||
|
|
||||||
|
k = k & ~(k >> 1); /* Fix problem with rounding. */ |
||||||
|
asFloat = (float)k; |
||||||
|
n = 158 - (asInt >> 23); |
||||||
|
n = (n & 31) + (n >> 6); /* Fix problem with k = 0. */ |
||||||
|
return n; |
||||||
|
} |
||||||
|
|
||||||
|
/* Below are three nearly equivalent programs for computing the number
|
||||||
|
of leading zeros in a word. This material is not in HD, but may be in a |
||||||
|
future edition. |
||||||
|
Immediately below is Robert Harley's algorithm, found at the |
||||||
|
comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert |
||||||
|
Juffa. |
||||||
|
Table entries marked "u" are unused. 14 ops including a multiply, |
||||||
|
plus an indexed load. |
||||||
|
The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all |
||||||
|
of form 2**k + 1). There are no multipliers of three terms of the form |
||||||
|
2**k +- 1 that work, with a table size of 64 or 128. There are some, |
||||||
|
with a table size of 64, if you precede the multiplication with x = x - |
||||||
|
(x >> 1), but that seems less elegant. There are also some if you use a |
||||||
|
table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would |
||||||
|
save two instructions in the form of this algorithm with the |
||||||
|
multiplication expanded into shifts and adds, but the table size is |
||||||
|
getting a bit large). */ |
||||||
|
|
||||||
|
#define u 99 |
||||||
|
int nlz10(unsigned x) { |
||||||
|
|
||||||
|
static char table[64] = |
||||||
|
{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, |
||||||
|
u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, |
||||||
|
17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, |
||||||
|
5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u}; |
||||||
|
|
||||||
|
x = x | (x >> 1); // Propagate leftmost
|
||||||
|
x = x | (x >> 2); // 1-bit to the right.
|
||||||
|
x = x | (x >> 4); |
||||||
|
x = x | (x >> 8); |
||||||
|
x = x | (x >>16); |
||||||
|
x = x*0x06EB14F9; // Multiplier is 7*255**3.
|
||||||
|
return table[x >> 26]; |
||||||
|
} |
||||||
|
|
||||||
|
/* Harley's algorithm with multiply expanded.
|
||||||
|
19 elementary ops plus an indexed load. */ |
||||||
|
|
||||||
|
int nlz10a(unsigned x) { |
||||||
|
|
||||||
|
static char table[64] = |
||||||
|
{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, |
||||||
|
u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, |
||||||
|
17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, |
||||||
|
5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u}; |
||||||
|
|
||||||
|
x = x | (x >> 1); // Propagate leftmost
|
||||||
|
x = x | (x >> 2); // 1-bit to the right.
|
||||||
|
x = x | (x >> 4); |
||||||
|
x = x | (x >> 8); |
||||||
|
x = x | (x >> 16); |
||||||
|
x = (x << 3) - x; // Multiply by 7.
|
||||||
|
x = (x << 8) - x; // Multiply by 255.
|
||||||
|
x = (x << 8) - x; // Again.
|
||||||
|
x = (x << 8) - x; // Again.
|
||||||
|
return table[x >> 26]; |
||||||
|
} |
||||||
|
|
||||||
|
/* Julius Goryavsky's version of Harley's algorithm.
|
||||||
|
17 elementary ops plus an indexed load, if the machine |
||||||
|
has "and not." */ |
||||||
|
|
||||||
|
int nlz10b(unsigned x) { |
||||||
|
|
||||||
|
static char table[64] = |
||||||
|
{32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u, |
||||||
|
u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u, |
||||||
|
u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u, |
||||||
|
22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31}; |
||||||
|
|
||||||
|
x = x | (x >> 1); // Propagate leftmost
|
||||||
|
x = x | (x >> 2); // 1-bit to the right.
|
||||||
|
x = x | (x >> 4); |
||||||
|
x = x | (x >> 8); |
||||||
|
x = x & ~(x >> 16); |
||||||
|
x = x*0xFD7049FF; // Activate this line or the following 3.
|
||||||
|
// x = (x << 9) - x; // Multiply by 511.
|
||||||
|
// x = (x << 11) - x; // Multiply by 2047.
|
||||||
|
// x = (x << 14) - x; // Multiply by 16383.
|
||||||
|
return table[x >> 26]; |
||||||
|
} |
||||||
|
|
||||||
|
int errors; |
||||||
|
void error(int x, int y) { |
||||||
|
errors = errors + 1; |
||||||
|
printf("Error for x = %08x, got %d\n", x, y); |
||||||
|
} |
||||||
|
|
||||||
|
int main() |
||||||
|
{ |
||||||
|
int i, n; |
||||||
|
static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29, |
||||||
|
7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23, |
||||||
|
512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16, |
||||||
|
65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11, |
||||||
|
0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6, |
||||||
|
0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3, |
||||||
|
0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0, |
||||||
|
0xFFFFFFFF,0}; |
||||||
|
std::size_t const Count = 10000000; |
||||||
|
|
||||||
|
n = sizeof(test)/4; |
||||||
|
|
||||||
|
std::clock_t TimestampBeg = 0; |
||||||
|
std::clock_t TimestampEnd = 0; |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
TimestampBeg = std::clock(); |
||||||
|
for (std::size_t k = 0; k < Count; ++k) |
||||||
|
for (i = 0; i < n; i += 2) { |
||||||
|
if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));} |
||||||
|
TimestampEnd = std::clock(); |
||||||
|
|
||||||
|
printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg); |
||||||
|
|
||||||
|
if (errors == 0) |
||||||
|
printf("Passed all %d cases.\n", sizeof(test)/8); |
||||||
|
} |
Loading…
Reference in New Issue