From 0bffce4f4bbe039393371e49f25105bfaffa2f63 Mon Sep 17 00:00:00 2001 From: Christophe Riccio Date: Sat, 22 Nov 2014 20:14:48 +0100 Subject: [PATCH] Optimized findMSB and findLSB --- glm/detail/func_integer.inl | 197 +++++---- readme.txt | 6 +- test/core/CMakeLists.txt | 1 + test/core/core_func_integer.cpp | 426 +++++++++++++------ test/core/core_func_integer_find_lsb.cpp | 500 +++++++++++------------ test/core/core_func_integer_find_msb.cpp | 447 ++++++++++++++++++++ 6 files changed, 1105 insertions(+), 472 deletions(-) create mode 100644 test/core/core_func_integer_find_msb.cpp diff --git a/glm/detail/func_integer.inl b/glm/detail/func_integer.inl index 81b5d5af..ae455a83 100644 --- a/glm/detail/func_integer.inl +++ b/glm/detail/func_integer.inl @@ -87,6 +87,117 @@ namespace detail return (v & Mask) + ((v >> Shift) & Mask); } }; + + template + struct compute_findLSB + { + GLM_FUNC_QUALIFIER static int call(genIUType Value) + { + if(Value == 0) + return -1; + + return glm::bitCount(~Value & (Value - static_cast(1))); + } + }; + +# if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM)) + + template + struct compute_findLSB + { + GLM_FUNC_QUALIFIER static int call(genIUType Value) + { + unsigned long Result(0); + unsigned char IsNotNull = _BitScanForward(&Result, *reinterpret_cast(&Value)); + return IsNotNull ? int(Result) : -1; + } + }; + + template + struct compute_findLSB + { + GLM_FUNC_QUALIFIER static int call(genIUType Value) + { + unsigned long Result(0); + unsigned char IsNotNull = _BitScanForward64(&Result, *reinterpret_cast(&Value)); + return IsNotNull ? int(Result) : -1; + } + }; + +# endif + + template class vecType, bool EXEC = true> + struct compute_findMSB_step_vec + { + GLM_FUNC_QUALIFIER static vecType call(vecType const & x, T Shift) + { + return x | (x >> Shift); + } + }; + + template class vecType> + struct compute_findMSB_step_vec + { + GLM_FUNC_QUALIFIER static vecType call(vecType const & x, T) + { + return x; + } + }; + + template class vecType, std::size_t> + struct compute_findMSB_vec + { + GLM_FUNC_QUALIFIER static vecType call(vecType const & vec) + { + vecType x(vec); + x = compute_findMSB_step_vec= 8>::call(x, static_cast( 1)); + x = compute_findMSB_step_vec= 8>::call(x, static_cast( 2)); + x = compute_findMSB_step_vec= 8>::call(x, static_cast( 4)); + x = compute_findMSB_step_vec= 16>::call(x, static_cast( 8)); + x = compute_findMSB_step_vec= 32>::call(x, static_cast(16)); + x = compute_findMSB_step_vec= 64>::call(x, static_cast(32)); + return vecType(sizeof(T) * 8 - 1) - glm::bitCount(~x); + } + }; + +# if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM)) + + template + GLM_FUNC_QUALIFIER int compute_findMSB_32(genIUType Value) + { + unsigned long Result(0); + unsigned char IsNotNull = _BitScanReverse(&Result, *reinterpret_cast(&Value)); + return IsNotNull ? int(Result) : -1; + } + + template + GLM_FUNC_QUALIFIER int compute_findMSB_64(genIUType Value) + { + unsigned long Result(0); + unsigned char IsNotNull = _BitScanReverse64(&Result, *reinterpret_cast(&Value)); + return IsNotNull ? int(Result) : -1; + } + + template class vecType> + struct compute_findMSB_vec + { + GLM_FUNC_QUALIFIER static int call(vecType const & x) + { + return detail::functor1::call(compute_findMSB_32, x); + } + }; + + template class vecType> + struct compute_findMSB_vec + { + GLM_FUNC_QUALIFIER static int call(vecType const & x) + { + return detail::functor1::call(compute_findMSB_64, x); + } + }; + +# endif + }//namespace detail // uaddCarry @@ -248,12 +359,8 @@ namespace detail GLM_FUNC_QUALIFIER int findLSB(genIUType Value) { GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findLSB' only accept integer values"); - if(Value == 0) - return -1; - genIUType Bit; - for(Bit = genIUType(0); !(Value & (1 << Bit)); ++Bit){} - return Bit; + return detail::compute_findLSB::call(Value); } template class vecType> @@ -265,89 +372,19 @@ namespace detail } // findMSB -#if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & GLM_COMPILER_VC) - template - GLM_FUNC_QUALIFIER int findMSB(genIUType Value) + GLM_FUNC_QUALIFIER int findMSB(genIUType x) { GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findMSB' only accept integer values"); - if(Value == 0) - return -1; - unsigned long Result(0); - _BitScanReverse(&Result, Value); - return int(Result); + return findMSB(tvec1(x)).x; } -/* -// __builtin_clz seems to be buggy as it crasks for some values, from 0x00200000 to 80000000 -#elif((GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & GLM_COMPILER_GCC) && (GLM_COMPILER >= GLM_COMPILER_GCC40)) - - template - GLM_FUNC_QUALIFIER int findMSB - ( - genIUType const & Value - ) - { - GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findMSB' only accept integer values"); - if(Value == 0) - return -1; - - // clz returns the number or trailing 0-bits; see - // http://gcc.gnu.org/onlinedocs/gcc-4.7.1/gcc/Other-Builtins.html - // - // NoteBecause __builtin_clz only works for unsigned ints, this - // implementation will not work for 64-bit integers. - // - return 31 - __builtin_clzl(Value); - } -*/ -#else - -/* SSE implementation idea - - __m128i const Zero = _mm_set_epi32( 0, 0, 0, 0); - __m128i const One = _mm_set_epi32( 1, 1, 1, 1); - __m128i Bit = _mm_set_epi32(-1, -1, -1, -1); - __m128i Tmp = _mm_set_epi32(Value, Value, Value, Value); - __m128i Mmi = Zero; - for(int i = 0; i < 32; ++i) - { - __m128i Shilt = _mm_and_si128(_mm_cmpgt_epi32(Tmp, One), One); - Tmp = _mm_srai_epi32(Tmp, One); - Bit = _mm_add_epi32(Bit, _mm_and_si128(Shilt, i)); - Mmi = _mm_and_si128(Mmi, One); - } - return Bit; -*/ - - template - GLM_FUNC_QUALIFIER int findMSB(genIUType Value) - { - GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findMSB' only accept integer values"); - - if(Value == genIUType(0) || Value == genIUType(-1)) - return -1; - else if(Value > 0) - { - genIUType Bit = genIUType(-1); - for(genIUType tmp = Value; tmp > 0; tmp >>= 1, ++Bit){} - return Bit; - } - else //if(Value < 0) - { - int const BitCount(sizeof(genIUType) * 8); - int MostSignificantBit(-1); - for(int BitIndex(0); BitIndex < BitCount; ++BitIndex) - MostSignificantBit = (Value & (1 << BitIndex)) ? MostSignificantBit : BitIndex; - assert(MostSignificantBit >= 0); - return MostSignificantBit; - } - } -#endif//(GLM_COMPILER) template class vecType> GLM_FUNC_QUALIFIER vecType findMSB(vecType const & x) { - return detail::functor1::call(findMSB, x); + GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findMSB' only accept integer values"); + + return detail::compute_findMSB_vec::call(x); } }//namespace glm diff --git a/readme.txt b/readme.txt index 26b7c1fe..62cd12a2 100644 --- a/readme.txt +++ b/readme.txt @@ -67,16 +67,16 @@ Improvements: - Undetected C++ compiler automatically compile with GLM_FORCE_CXX98 and GLM_FORCE_PURE - Added not function (from GLSL specification) on VC12 -- Optimized bitfield operations - Optimized bitfieldReverse and bitCount functions +- Optimized findLSB and findMSB functions. - Optimized matrix-vector multiple performance with Cuda #257, #258 - Reduced integer type redifinitions #233 - Rewrited of GTX_fast_trigonometry #264 #265 - Made types trivially copyable #263 - Removed in GLM tests - Used std features within GLM without redeclaring -- Optimized glm::cot #272 -- Optimized glm::sign #272 +- Optimized cot function #272 +- Optimized sign function #272 Fixes: - Fixed std::nextafter not supported with C++11 on Android #217 diff --git a/test/core/CMakeLists.txt b/test/core/CMakeLists.txt index e986841b..c7314b1d 100644 --- a/test/core/CMakeLists.txt +++ b/test/core/CMakeLists.txt @@ -22,6 +22,7 @@ glmCreateTestGTC(core_func_geometric) glmCreateTestGTC(core_func_integer) glmCreateTestGTC(core_func_integer_bit_count) glmCreateTestGTC(core_func_integer_find_lsb) +glmCreateTestGTC(core_func_integer_find_msb) glmCreateTestGTC(core_func_matrix) glmCreateTestGTC(core_func_noise) glmCreateTestGTC(core_func_packing) diff --git a/test/core/core_func_integer.cpp b/test/core/core_func_integer.cpp index 66825b18..21b7a5e6 100644 --- a/test/core/core_func_integer.cpp +++ b/test/core/core_func_integer.cpp @@ -8,6 +8,7 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// #include +#include #include #include #include @@ -555,6 +556,19 @@ namespace findMSB genType Return; }; + template + GLM_FUNC_QUALIFIER int findMSB_intrinsic(genIUType Value) + { + GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findMSB' only accept integer values"); + + if(Value == 0) + return -1; + + unsigned long Result(0); + _BitScanReverse(&Result, Value); + return int(Result); + } + template GLM_FUNC_QUALIFIER int findMSB_095(genIUType Value) { @@ -583,27 +597,17 @@ namespace findMSB GLM_FUNC_QUALIFIER int findMSB_nlz1(genIUType x) { GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findMSB' only accept integer values"); -/* - int Result = 0; - for(std::size_t i = 0, n = sizeof(genIUType) * 8; i < n; ++i) - Result = Value & static_cast(1 << i) ? static_cast(i) : Result; - return Result; -*/ -/* - genIUType Bit = genIUType(-1); - for(genIUType tmp = Value; tmp > 0; tmp >>= 1, ++Bit){} - return Bit; -*/ - int n; - if (x == 0) return(32); - n = 0; + if (x == 0) + return -1; + + int n = 0; if (x <= 0x0000FFFF) {n = n +16; x = x <<16;} if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;} if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;} if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;} if (x <= 0x7FFFFFFF) {n = n + 1;} - return n; + return 31 - n; } int findMSB_nlz2(unsigned int x) @@ -617,14 +621,24 @@ namespace findMSB y = x >> 4; if (y != 0) {n = n - 4; x = y;} y = x >> 2; if (y != 0) {n = n - 2; x = y;} y = x >> 1; if (y != 0) return n - 2; - return n - x; + return 32 - (n - x); } - int perf_950() + int findMSB_pop(unsigned int x) { - type const Data[] = + x = x | (x >> 1); + x = x | (x >> 2); + x = x | (x >> 4); + x = x | (x >> 8); + x = x | (x >>16); + return 31 - glm::bitCount(~x); + } + + int perf_int() + { + type const Data[] = { - //{0x00000000, -1}, + {0x00000000, -1}, {0x00000001, 0}, {0x00000002, 1}, {0x00000003, 1}, @@ -662,141 +676,131 @@ namespace findMSB }; int Error(0); + std::size_t const Count(1000000); + + std::clock_t Timestamps0 = std::clock(); + + for(std::size_t k = 0; k < Count; ++k) + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + { + int Result = glm::findMSB(Data[i].Value); + Error += Data[i].Return == Result ? 0 : 1; + } std::clock_t Timestamps1 = std::clock(); - for(std::size_t k = 0; k < 1000000; ++k) + for(std::size_t k = 0; k < Count; ++k) for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) { - int Result = findMSB_095(Data[i].Value); + int Result = findMSB_nlz1(Data[i].Value); Error += Data[i].Return == Result ? 0 : 1; } std::clock_t Timestamps2 = std::clock(); - std::printf("findMSB - 0.9.5: %d clocks\n", static_cast(Timestamps2 - Timestamps1)); + for(std::size_t k = 0; k < Count; ++k) + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + { + int Result = findMSB_nlz2(Data[i].Value); + Error += Data[i].Return == Result ? 0 : 1; + } - return Error; - } + std::clock_t Timestamps3 = std::clock(); - int perf_ops() - { - type const Data[] = + for(std::size_t k = 0; k < Count; ++k) + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) { - {0x00000000, -1}, - {0x00000001, 0}, - {0x00000002, 1}, - {0x00000003, 1}, - {0x00000004, 2}, - {0x00000005, 2}, - {0x00000007, 2}, - {0x00000008, 3}, - {0x00000010, 4}, - {0x00000020, 5}, - {0x00000040, 6}, - {0x00000080, 7}, - {0x00000100, 8}, - {0x00000200, 9}, - {0x00000400, 10}, - {0x00000800, 11}, - {0x00001000, 12}, - {0x00002000, 13}, - {0x00004000, 14}, - {0x00008000, 15}, - {0x00010000, 16}, - {0x00020000, 17}, - {0x00040000, 18}, - {0x00080000, 19}, - {0x00100000, 20}, - {0x00200000, 21}, - {0x00400000, 22}, - {0x00800000, 23}, - {0x01000000, 24}, - {0x02000000, 25}, - {0x04000000, 26}, - {0x08000000, 27}, - {0x10000000, 28}, - {0x20000000, 29}, - {0x40000000, 30} - }; + int Result = findMSB_095(Data[i].Value); + Error += Data[i].Return == Result ? 0 : 1; + } - int Error(0); + std::clock_t Timestamps4 = std::clock(); - std::clock_t Timestamps1 = std::clock(); + for(std::size_t k = 0; k < Count; ++k) + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + { + int Result = findMSB_intrinsic(Data[i].Value); + Error += Data[i].Return == Result ? 0 : 1; + } + + std::clock_t Timestamps5 = std::clock(); - for(std::size_t k = 0; k < 1000000; ++k) + for(std::size_t k = 0; k < Count; ++k) for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) { - int Result = findMSB_nlz1(Data[i].Value); + int Result = findMSB_pop(Data[i].Value); Error += Data[i].Return == Result ? 0 : 1; } - std::clock_t Timestamps2 = std::clock(); + std::clock_t Timestamps6 = std::clock(); + std::printf("glm::findMSB: %d clocks\n", static_cast(Timestamps1 - Timestamps0)); std::printf("findMSB - nlz1: %d clocks\n", static_cast(Timestamps2 - Timestamps1)); + std::printf("findMSB - nlz2: %d clocks\n", static_cast(Timestamps3 - Timestamps2)); + std::printf("findMSB - 0.9.5: %d clocks\n", static_cast(Timestamps4 - Timestamps3)); + std::printf("findMSB - intrinsics: %d clocks\n", static_cast(Timestamps5 - Timestamps4)); + std::printf("findMSB - pop: %d clocks\n", static_cast(Timestamps6 - Timestamps5)); return Error; } - - int test_findMSB() + int test_ivec4() { - type const Data[] = + type const Data[] = { - //{0x00000000, -1}, - {0x00000001, 0}, - {0x00000002, 1}, - {0x00000003, 1}, - {0x00000004, 2}, - {0x00000005, 2}, - {0x00000007, 2}, - {0x00000008, 3}, - {0x00000010, 4}, - {0x00000020, 5}, - {0x00000040, 6}, - {0x00000080, 7}, - {0x00000100, 8}, - {0x00000200, 9}, - {0x00000400, 10}, - {0x00000800, 11}, - {0x00001000, 12}, - {0x00002000, 13}, - {0x00004000, 14}, - {0x00008000, 15}, - {0x00010000, 16}, - {0x00020000, 17}, - {0x00040000, 18}, - {0x00080000, 19}, - {0x00100000, 20}, - {0x00200000, 21}, - {0x00400000, 22}, - {0x00800000, 23}, - {0x01000000, 24}, - {0x02000000, 25}, - {0x04000000, 26}, - {0x08000000, 27}, - {0x10000000, 28}, - {0x20000000, 29}, - {0x40000000, 30} + {glm::ivec4(0x00000000), glm::ivec4(-1)}, + {glm::ivec4(0x00000001), glm::ivec4( 0)}, + {glm::ivec4(0x00000002), glm::ivec4( 1)}, + {glm::ivec4(0x00000003), glm::ivec4( 1)}, + {glm::ivec4(0x00000004), glm::ivec4( 2)}, + {glm::ivec4(0x00000005), glm::ivec4( 2)}, + {glm::ivec4(0x00000007), glm::ivec4( 2)}, + {glm::ivec4(0x00000008), glm::ivec4( 3)}, + {glm::ivec4(0x00000010), glm::ivec4( 4)}, + {glm::ivec4(0x00000020), glm::ivec4( 5)}, + {glm::ivec4(0x00000040), glm::ivec4( 6)}, + {glm::ivec4(0x00000080), glm::ivec4( 7)}, + {glm::ivec4(0x00000100), glm::ivec4( 8)}, + {glm::ivec4(0x00000200), glm::ivec4( 9)}, + {glm::ivec4(0x00000400), glm::ivec4(10)}, + {glm::ivec4(0x00000800), glm::ivec4(11)}, + {glm::ivec4(0x00001000), glm::ivec4(12)}, + {glm::ivec4(0x00002000), glm::ivec4(13)}, + {glm::ivec4(0x00004000), glm::ivec4(14)}, + {glm::ivec4(0x00008000), glm::ivec4(15)}, + {glm::ivec4(0x00010000), glm::ivec4(16)}, + {glm::ivec4(0x00020000), glm::ivec4(17)}, + {glm::ivec4(0x00040000), glm::ivec4(18)}, + {glm::ivec4(0x00080000), glm::ivec4(19)}, + {glm::ivec4(0x00100000), glm::ivec4(20)}, + {glm::ivec4(0x00200000), glm::ivec4(21)}, + {glm::ivec4(0x00400000), glm::ivec4(22)}, + {glm::ivec4(0x00800000), glm::ivec4(23)}, + {glm::ivec4(0x01000000), glm::ivec4(24)}, + {glm::ivec4(0x02000000), glm::ivec4(25)}, + {glm::ivec4(0x04000000), glm::ivec4(26)}, + {glm::ivec4(0x08000000), glm::ivec4(27)}, + {glm::ivec4(0x10000000), glm::ivec4(28)}, + {glm::ivec4(0x20000000), glm::ivec4(29)}, + {glm::ivec4(0x40000000), glm::ivec4(30)} }; int Error(0); - for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) { - int Result = glm::findMSB(Data[i].Value); - Error += Data[i].Return == Result ? 0 : 1; - assert(!Error); + glm::ivec4 Result0 = glm::findMSB(Data[i].Value); + Error += glm::all(glm::equal(Data[i].Return, Result0)) ? 0 : 1; } return Error; } - int test_nlz1() + int test_int() { type const Data[] = { - //{0x00000000, -1}, + {0x00000000, -1}, {0x00000001, 0}, {0x00000002, 1}, {0x00000003, 1}, @@ -837,8 +841,38 @@ namespace findMSB for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) { - int Result = findMSB_nlz2(Data[i].Value); - Error += Data[i].Return == Result ? 0 : 1; + int Result0 = glm::findMSB(Data[i].Value); + Error += Data[i].Return == Result0 ? 0 : 1; + } + + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + { + int Result0 = findMSB_nlz1(Data[i].Value); + Error += Data[i].Return == Result0 ? 0 : 1; + } +/* + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + { + int Result0 = findMSB_nlz2(Data[i].Value); + Error += Data[i].Return == Result0 ? 0 : 1; + } +*/ + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + { + int Result0 = findMSB_095(Data[i].Value); + Error += Data[i].Return == Result0 ? 0 : 1; + } + + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + { + int Result0 = findMSB_intrinsic(Data[i].Value); + Error += Data[i].Return == Result0 ? 0 : 1; + } + + for(std::size_t i = 0; i < sizeof(Data) / sizeof(type); ++i) + { + int Result0 = findMSB_pop(Data[i].Value); + Error += Data[i].Return == Result0 ? 0 : 1; } return Error; @@ -848,8 +882,8 @@ namespace findMSB { int Error(0); - Error += test_findMSB(); - //Error += test_nlz1(); + Error += test_ivec4(); + Error += test_int(); return Error; } @@ -858,8 +892,7 @@ namespace findMSB { int Error(0); - Error += perf_950(); - //Error += perf_ops(); + Error += perf_int(); return Error; } @@ -878,20 +911,172 @@ namespace findLSB { {0x00000001, 0}, {0x00000003, 0}, - {0x00000002, 1} + {0x00000002, 1}, + {0x80000000, 31}, + {0x00010000, 16}, + {0xFFFF0000, 16}, + {0xFF000000, 24}, + {0xFF00FF00, 8}, + {0x00000000, -1} }; + template + GLM_FUNC_QUALIFIER int findLSB_intrinsic(genIUType Value) + { + GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findLSB' only accept integer values"); + + if(Value == 0) + return -1; + + unsigned long Result(0); + _BitScanForward(&Result, Value); + return int(Result); + } + + template + GLM_FUNC_QUALIFIER int findLSB_095(genIUType Value) + { + GLM_STATIC_ASSERT(std::numeric_limits::is_integer, "'findLSB' only accept integer values"); + if(Value == 0) + return -1; + + genIUType Bit; + for(Bit = genIUType(0); !(Value & (1 << Bit)); ++Bit){} + return Bit; + } + + template + GLM_FUNC_QUALIFIER int findLSB_ntz2(genIUType x) + { + if(x == 0) + return -1; + + return glm::bitCount(~x & (x - static_cast(1))); + } + + template + GLM_FUNC_QUALIFIER int findLSB_branchfree(genIUType x) + { + bool IsNull(x == 0); + int const Keep(!IsNull); + int const Discard(IsNull); + + return static_cast(glm::bitCount(~x & (x - static_cast(1)))) * Keep + Discard * -1; + } + + int test_int() + { + int Error(0); + + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = glm::findLSB(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = findLSB_095(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = findLSB_intrinsic(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = findLSB_ntz2(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = findLSB_branchfree(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + return Error; + } + int test() { int Error(0); + Error += test_int(); + + return Error; + } + + int perf_int() + { + int Error(0); + std::size_t const Count(10000000); + + std::clock_t Timestamps0 = std::clock(); + + for(std::size_t k = 0; k < Count; ++k) for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) { int Result = glm::findLSB(DataI32[i].Value); Error += DataI32[i].Return == Result ? 0 : 1; - assert(!Error); } + std::clock_t Timestamps1 = std::clock(); + + for(std::size_t k = 0; k < Count; ++k) + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = findLSB_095(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + std::clock_t Timestamps2 = std::clock(); + + for(std::size_t k = 0; k < Count; ++k) + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = findLSB_intrinsic(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + std::clock_t Timestamps3 = std::clock(); + + for(std::size_t k = 0; k < Count; ++k) + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = findLSB_ntz2(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + std::clock_t Timestamps4 = std::clock(); + + for(std::size_t k = 0; k < Count; ++k) + for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type); ++i) + { + int Result = findLSB_branchfree(DataI32[i].Value); + Error += DataI32[i].Return == Result ? 0 : 1; + } + + std::clock_t Timestamps5 = std::clock(); + + std::printf("glm::findLSB: %d clocks\n", static_cast(Timestamps1 - Timestamps0)); + std::printf("findLSB - 0.9.5: %d clocks\n", static_cast(Timestamps2 - Timestamps1)); + std::printf("findLSB - intrinsics: %d clocks\n", static_cast(Timestamps3 - Timestamps2)); + std::printf("findLSB - ntz2: %d clocks\n", static_cast(Timestamps4 - Timestamps3)); + std::printf("findLSB - branchfree: %d clocks\n", static_cast(Timestamps5 - Timestamps4)); + + return Error; + } + + int perf() + { + int Error(0); + + Error += perf_int(); + return Error; } }//findLSB @@ -1324,6 +1509,7 @@ int main() Error += ::bitCount::perf(); Error += ::bitfieldReverse::perf(); Error += ::findMSB::perf(); + Error += ::findLSB::perf(); # endif return Error; diff --git a/test/core/core_func_integer_find_lsb.cpp b/test/core/core_func_integer_find_lsb.cpp index 21dd124a..89b420a5 100644 --- a/test/core/core_func_integer_find_lsb.cpp +++ b/test/core/core_func_integer_find_lsb.cpp @@ -7,15 +7,23 @@ // File : test/core/func_integer_find_lsb.cpp /////////////////////////////////////////////////////////////////////////////////////////////////// -// This has the programs for computing the number of leading zeros +// This has the programs for computing the number of trailing zeros // in a word. // Max line length is 57, to fit in hacker.book. -// Compile with g++, not gcc. #include -#include // To define "exit", req'd by XLC. +#include //To define "exit", req'd by XLC. #include -#define LE 1 // 1 for little-endian, 0 for big-endian. +int nlz(unsigned x) { + int pop(unsigned x); + + x = x | (x >> 1); + x = x | (x >> 2); + x = x | (x >> 4); + x = x | (x >> 8); + x = x | (x >>16); + return pop(~x); +} int pop(unsigned x) { x = x - ((x >> 1) & 0x55555555); @@ -26,280 +34,230 @@ int pop(unsigned x) { return x >> 24; } -int nlz1(unsigned x) { - int n; +int ntz1(unsigned x) { + return 32 - nlz(~x & (x-1)); +} - if (x == 0) return(32); - n = 0; - if (x <= 0x0000FFFF) {n = n +16; x = x <<16;} - if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;} - if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;} - if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;} - if (x <= 0x7FFFFFFF) {n = n + 1;} - return n; +int ntz2(unsigned x) { + return pop(~x & (x - 1)); } -int nlz1a(unsigned x) { +int ntz3(unsigned x) { int n; -/* if (x == 0) return(32); */ - if ((int)x <= 0) return (~x >> 26) & 32; + if (x == 0) return(32); n = 1; - if ((x >> 16) == 0) {n = n +16; x = x <<16;} - if ((x >> 24) == 0) {n = n + 8; x = x << 8;} - if ((x >> 28) == 0) {n = n + 4; x = x << 4;} - if ((x >> 30) == 0) {n = n + 2; x = x << 2;} - n = n - (x >> 31); - return n; + if ((x & 0x0000FFFF) == 0) {n = n +16; x = x >>16;} + if ((x & 0x000000FF) == 0) {n = n + 8; x = x >> 8;} + if ((x & 0x0000000F) == 0) {n = n + 4; x = x >> 4;} + if ((x & 0x00000003) == 0) {n = n + 2; x = x >> 2;} + return n - (x & 1); } -// On basic Risc, 12 to 20 instructions. -int nlz2(unsigned x) { +int ntz4(unsigned x) { unsigned y; int n; - n = 32; - y = x >>16; if (y != 0) {n = n -16; x = y;} - y = x >> 8; if (y != 0) {n = n - 8; x = y;} - y = x >> 4; if (y != 0) {n = n - 4; x = y;} - y = x >> 2; if (y != 0) {n = n - 2; x = y;} - y = x >> 1; if (y != 0) return n - 2; - return n - x; + if (x == 0) return 32; + n = 31; + y = x <<16; if (y != 0) {n = n -16; x = y;} + y = x << 8; if (y != 0) {n = n - 8; x = y;} + y = x << 4; if (y != 0) {n = n - 4; x = y;} + y = x << 2; if (y != 0) {n = n - 2; x = y;} + y = x << 1; if (y != 0) {n = n - 1;} + return n; } -// As above but coded as a loop for compactness: -// 23 to 33 basic Risc instructions. -int nlz2a(unsigned x) { +int ntz4a(unsigned x) { unsigned y; - int n, c; - - n = 32; - c = 16; - do { - y = x >> c; if (y != 0) {n = n - c; x = y;} - c = c >> 1; - } while (c != 0); - return n - x; -} + int n; -int nlz3(int x) { - int y, n; - - n = 0; - y = x; -L: if (x < 0) return n; - if (y == 0) return 32 - n; - n = n + 1; - x = x << 1; - y = y >> 1; - goto L; + if (x == 0) return 32; + n = 31; + y = x <<16; if (y != 0) {n = n -16; x = y;} + y = x << 8; if (y != 0) {n = n - 8; x = y;} + y = x << 4; if (y != 0) {n = n - 4; x = y;} + y = x << 2; if (y != 0) {n = n - 2; x = y;} + n = n - ((x << 1) >> 31); + return n; } -int nlz4(unsigned x) { - int y, m, n; - - y = -(x >> 16); // If left half of x is 0, - m = (y >> 16) & 16; // set n = 16. If left half - n = 16 - m; // is nonzero, set n = 0 and - x = x >> m; // shift x right 16. - // Now x is of the form 0000xxxx. - y = x - 0x100; // If positions 8-15 are 0, - m = (y >> 16) & 8; // add 8 to n and shift x left 8. - n = n + m; - x = x << m; - - y = x - 0x1000; // If positions 12-15 are 0, - m = (y >> 16) & 4; // add 4 to n and shift x left 4. - n = n + m; - x = x << m; - - y = x - 0x4000; // If positions 14-15 are 0, - m = (y >> 16) & 2; // add 2 to n and shift x left 2. - n = n + m; - x = x << m; - - y = x >> 14; // Set y = 0, 1, 2, or 3. - m = y & ~(y >> 1); // Set m = 0, 1, 2, or 2 resp. - return n + 2 - m; +int ntz5(char x) +{ + if (x & 15) { + if (x & 3) { + if (x & 1) return 0; + else return 1; + } + else if (x & 4) return 2; + else return 3; + } + else if (x & 0x30) { + if (x & 0x10) return 4; + else return 5; + } + else if (x & 0x40) return 6; + else if (x) return 7; + else return 8; } -int nlz5(unsigned x) { - int pop(unsigned x); +int ntz6(unsigned x) { + int n; - x = x | (x >> 1); - x = x | (x >> 2); - x = x | (x >> 4); - x = x | (x >> 8); - x = x | (x >>16); - return pop(~x); + x = ~x & (x - 1); + n = 0; // n = 32; + while(x != 0) { // while (x != 0) { + n = n + 1; // n = n - 1; + x = x >> 1; // x = x + x; + } // } + return n; // return n; } -/* The four programs below are not valid ANSI C programs. This is -because they refer to the same storage locations as two different types. -However, they work with xlc/AIX, gcc/AIX, and gcc/NT. If you try to -code them more compactly by declaring a variable xx to be "double," and -then using - - n = 1054 - (*((unsigned *)&xx + LE) >> 20); - -then you are violating not only the rule above, but also the ANSI C -rule that pointer arithmetic can be performed only on pointers to -array elements. - When coded with the above statement, the program fails with xlc, -gcc/AIX, and gcc/NT, at some optimization levels. - BTW, these programs use the "anonymous union" feature of C++, not -available in C. */ - -int nlz6(unsigned k) { - union { - unsigned asInt[2]; - double asDouble; - }; - int n; +int ntz6a(unsigned x) +{ + int n = 32; - asDouble = (double)k + 0.5; - n = 1054 - (asInt[LE] >> 20); - return n; + while (x != 0) { + n = n - 1; + x = x + x; + } + return n; } -int nlz7(unsigned k) { - union { - unsigned asInt[2]; - double asDouble; - }; - int n; +/* Dean Gaudet's algorithm. To be most useful there must be a good way +to evaluate the C "conditional expression" (a?b:c construction) without +branching. The result of a?b:c is b if a is true (nonzero), and c if a +is false (0). + For example, a compare to zero op that sets a target GPR to 1 if the +operand is 0, and to 0 if the operand is nonzero, will do it. With this +instruction, the algorithm is entirely branch-free. But the most +interesting thing about it is the high degree of parallelism. All six +lines with conditional expressions can be executed in parallel (on a +machine with sufficient computational units). + Although the instruction count is 30 measured statically, it could +execute in only 10 cycles on a machine with sufficient parallelism. + The first two uses of y can instead be x, which would increase the +useful parallelism on most machines (the assignments to y, bz, and b4 +could then all run in parallel). */ + +int ntz7(unsigned x) +{ + unsigned y, bz, b4, b3, b2, b1, b0; + + y = x & -x; // Isolate rightmost 1-bit. + bz = y ? 0 : 1; // 1 if y = 0. + b4 = (y & 0x0000FFFF) ? 0 : 16; + b3 = (y & 0x00FF00FF) ? 0 : 8; + b2 = (y & 0x0F0F0F0F) ? 0 : 4; + b1 = (y & 0x33333333) ? 0 : 2; + b0 = (y & 0x55555555) ? 0 : 1; + return bz + b4 + b3 + b2 + b1 + b0; +} - asDouble = (double)k; - n = 1054 - (asInt[LE] >> 20); - n = (n & 31) + (n >> 9); - return n; +int ntz7_christophe(unsigned x) +{ + unsigned y, bz, b4, b3, b2, b1, b0; + + y = x & -x; // Isolate rightmost 1-bit. + bz = unsigned(!bool(y)); // 1 if y = 0. + b4 = unsigned(!bool(y & 0x0000FFFF)) * 16; + b3 = unsigned(!bool(y & 0x00FF00FF)) * 8; + b2 = unsigned(!bool(y & 0x0F0F0F0F)) * 4; + b1 = unsigned(!bool(y & 0x33333333)) * 2; + b0 = unsigned(!bool(y & 0x55555555)) * 1; + return bz + b4 + b3 + b2 + b1 + b0; } - /* In single precision, round-to-nearest mode, the basic method fails for: - k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF, - 07FFFFFC <= k <= 07FFFFFF, - 0FFFFFF8 <= k <= 0FFFFFFF, - ... - 7FFFFFC0 <= k <= 7FFFFFFF. - FFFFFF80 <= k <= FFFFFFFF. - For k = 0 it gives 158, and for the other values it is too low by 1. */ - -int nlz8(unsigned k) { - union { - unsigned asInt; - float asFloat; - }; - int n; +/* Below is David Seal's algorithm, found at +http://www.ciphersbyritter.com/NEWS4/BITCT.HTM Table +entries marked "u" are unused. 6 ops including a +multiply, plus an indexed load. */ - k = k & ~(k >> 1); /* Fix problem with rounding. */ - asFloat = (float)k + 0.5f; - n = 158 - (asInt >> 23); - return n; +#define u 99 +int ntz8(unsigned x) +{ + static char table[64] = + {32, 0, 1,12, 2, 6, u,13, 3, u, 7, u, u, u, u,14, + 10, 4, u, u, 8, u, u,25, u, u, u, u, u,21,27,15, + 31,11, 5, u, u, u, u, u, 9, u, u,24, u, u,20,26, + 30, u, u, u, u,23, u,19, 29, u,22,18,28,17,16, u}; + + x = (x & -x)*0x0450FBAF; + return table[x >> 26]; } -/* The example below shows how to make a macro for nlz. It uses an -extension to the C and C++ languages that is provided by the GNU C/C++ -compiler, namely, that of allowing statements and declarations in -expressions (see "Using and Porting GNU CC", by Richard M. Stallman -(1998). The underscores are necessary to protect against the -possibility that the macro argument will conflict with one of its local -variables, e.g., NLZ(k). */ - -int nlz9(unsigned k) { - union { - unsigned asInt; - float asFloat; - }; - int n; +/* Seal's algorithm with multiply expanded. +9 elementary ops plus an indexed load. */ - k = k & ~(k >> 1); /* Fix problem with rounding. */ - asFloat = (float)k; - n = 158 - (asInt >> 23); - n = (n & 31) + (n >> 6); /* Fix problem with k = 0. */ - return n; +int ntz8a(unsigned x) +{ + static char table[64] = + {32, 0, 1,12, 2, 6, u,13, 3, u, 7, u, u, u, u,14, + 10, 4, u, u, 8, u, u,25, u, u, u, u, u,21,27,15, + 31,11, 5, u, u, u, u, u, 9, u, u,24, u, u,20,26, + 30, u, u, u, u,23, u,19, 29, u,22,18,28,17,16, u}; + + x = (x & -x); + x = (x << 4) + x; // x = x*17. + x = (x << 6) + x; // x = x*65. + x = (x << 16) - x; // x = x*65535. + return table[x >> 26]; } -/* Below are three nearly equivalent programs for computing the number -of leading zeros in a word. This material is not in HD, but may be in a -future edition. - Immediately below is Robert Harley's algorithm, found at the -comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert -Juffa. - Table entries marked "u" are unused. 14 ops including a multiply, -plus an indexed load. - The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all -of form 2**k + 1). There are no multipliers of three terms of the form -2**k +- 1 that work, with a table size of 64 or 128. There are some, -with a table size of 64, if you precede the multiplication with x = x - -(x >> 1), but that seems less elegant. There are also some if you use a -table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would -save two instructions in the form of this algorithm with the -multiplication expanded into shifts and adds, but the table size is -getting a bit large). */ +/* Reiser's algorithm. Three ops including a "remainder," +plus an indexed load. */ -#define u 99 -int nlz10(unsigned x) { +int ntz9(unsigned x) { - static char table[64] = - {32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, - u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, - 17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, - 5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u}; + static char table[37] = {32, 0, 1, 26, 2, 23, 27, + u, 3, 16, 24, 30, 28, 11, u, 13, 4, + 7, 17, u, 25, 22, 31, 15, 29, 10, 12, + 6, u, 21, 14, 9, 5, 20, 8, 19, 18}; - x = x | (x >> 1); // Propagate leftmost - x = x | (x >> 2); // 1-bit to the right. - x = x | (x >> 4); - x = x | (x >> 8); - x = x | (x >>16); - x = x*0x06EB14F9; // Multiplier is 7*255**3. - return table[x >> 26]; + x = (x & -x)%37; + return table[x]; } -/* Harley's algorithm with multiply expanded. -19 elementary ops plus an indexed load. */ +/* Using a de Bruijn sequence. This is a table lookup with a 32-entry +table. The de Bruijn sequence used here is + 0000 0100 1101 0111 0110 0101 0001 1111, +obtained from Danny Dube's October 3, 1997, posting in +comp.compression.research. Thanks to Norbert Juffa for this reference. */ -int nlz10a(unsigned x) { +int ntz10(unsigned x) { - static char table[64] = - {32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, - u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, - 17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, - 5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u}; + static char table[32] = + { 0, 1, 2,24, 3,19, 6,25, 22, 4,20,10,16, 7,12,26, + 31,23,18, 5,21, 9,15,11, 30,17, 8,14,29,13,28,27}; - x = x | (x >> 1); // Propagate leftmost - x = x | (x >> 2); // 1-bit to the right. - x = x | (x >> 4); - x = x | (x >> 8); - x = x | (x >> 16); - x = (x << 3) - x; // Multiply by 7. - x = (x << 8) - x; // Multiply by 255. - x = (x << 8) - x; // Again. - x = (x << 8) - x; // Again. - return table[x >> 26]; + if (x == 0) return 32; + x = (x & -x)*0x04D7651F; + return table[x >> 27]; } -/* Julius Goryavsky's version of Harley's algorithm. -17 elementary ops plus an indexed load, if the machine -has "and not." */ - -int nlz10b(unsigned x) { +/* Norbert Juffa's code, answer to exercise 1 of Chapter 5 (2nd ed). */ - static char table[64] = - {32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u, - u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u, - u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u, - 22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31}; +#define SLOW_MUL +int ntz11 (unsigned int n) { - x = x | (x >> 1); // Propagate leftmost - x = x | (x >> 2); // 1-bit to the right. - x = x | (x >> 4); - x = x | (x >> 8); - x = x & ~(x >> 16); - x = x*0xFD7049FF; // Activate this line or the following 3. -// x = (x << 9) - x; // Multiply by 511. -// x = (x << 11) - x; // Multiply by 2047. -// x = (x << 14) - x; // Multiply by 16383. - return table[x >> 26]; + static unsigned char tab[32] = + { 0, 1, 2, 24, 3, 19, 6, 25, + 22, 4, 20, 10, 16, 7, 12, 26, + 31, 23, 18, 5, 21, 9, 15, 11, + 30, 17, 8, 14, 29, 13, 28, 27 + }; + unsigned int k; + n = n & (-n); /* isolate lsb */ + printf("n = %d\n", n); +#if defined(SLOW_MUL) + k = (n << 11) - n; + k = (k << 2) + k; + k = (k << 8) + n; + k = (k << 5) - k; +#else + k = n * 0x4d7651f; +#endif + return n ? tab[k>>27] : 32; } int errors; @@ -308,19 +266,22 @@ void error(int x, int y) { printf("Error for x = %08x, got %d\n", x, y); } +/* ------------------------------ main ------------------------------ */ + int main() { # ifdef GLM_TEST_ENABLE_PERF - int i, n; - static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29, - 7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23, - 512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16, - 65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11, - 0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6, - 0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3, - 0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0, - 0xFFFFFFFF,0}; + int i, m, n; + static unsigned test[] = {0,32, 1,0, 2,1, 3,0, 4,2, 5,0, 6,1, 7,0, + 8,3, 9,0, 16,4, 32,5, 64,6, 128,7, 255,0, 256,8, 512,9, 1024,10, + 2048,11, 4096,12, 8192,13, 16384,14, 32768,15, 65536,16, + 0x20000,17, 0x40000,18, 0x80000,19, 0x100000,20, 0x200000,21, + 0x400000,22, 0x800000,23, 0x1000000,24, 0x2000000,25, + 0x4000000,26, 0x8000000,27, 0x10000000,28, 0x20000000,29, + 0x40000000,30, 0x80000000,31, 0xFFFFFFF0,4, 0x3000FF00,8, + 0xC0000000,30, 0x60000000,29, 0x00011000, 12}; + std::size_t const Count = 10000000; n = sizeof(test)/4; @@ -331,114 +292,115 @@ int main() TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));} + if (ntz1(test[i]) != test[i+1]) error(test[i], ntz1(test[i]));} TimestampEnd = std::clock(); - printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz1: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));} + if (ntz2(test[i]) != test[i+1]) error(test[i], ntz2(test[i]));} TimestampEnd = std::clock(); - printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz2: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));} + if (ntz3(test[i]) != test[i+1]) error(test[i], ntz3(test[i]));} TimestampEnd = std::clock(); - printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz3: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));} + if (ntz4(test[i]) != test[i+1]) error(test[i], ntz4(test[i]));} TimestampEnd = std::clock(); - printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz4: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));} + if (ntz4a(test[i]) != test[i+1]) error(test[i], ntz4a(test[i]));} TimestampEnd = std::clock(); - printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz4a: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));} + m = test[i+1]; if (m > 8) m = 8; + if (ntz5(test[i]) != m) error(test[i], ntz5(test[i]));} TimestampEnd = std::clock(); - printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz5: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));} + if (ntz6(test[i]) != test[i+1]) error(test[i], ntz6(test[i]));} TimestampEnd = std::clock(); - printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz6: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));} + if (ntz6a(test[i]) != test[i+1]) error(test[i], ntz6a(test[i]));} TimestampEnd = std::clock(); - printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz6a: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));} + if (ntz7(test[i]) != test[i+1]) error(test[i], ntz7(test[i]));} TimestampEnd = std::clock(); - printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz7: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));} + if (ntz7_christophe(test[i]) != test[i+1]) error(test[i], ntz7(test[i]));} TimestampEnd = std::clock(); - printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz7_christophe: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));} + if (ntz8(test[i]) != test[i+1]) error(test[i], ntz8(test[i]));} TimestampEnd = std::clock(); - printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz8: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));} + if (ntz8a(test[i]) != test[i+1]) error(test[i], ntz8a(test[i]));} TimestampEnd = std::clock(); - printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz8a: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));} + if (ntz9(test[i]) != test[i+1]) error(test[i], ntz9(test[i]));} TimestampEnd = std::clock(); - printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz9: %d clocks\n", TimestampEnd - TimestampBeg); TimestampBeg = std::clock(); for (std::size_t k = 0; k < Count; ++k) for (i = 0; i < n; i += 2) { - if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));} + if (ntz10(test[i]) != test[i+1]) error(test[i], ntz10(test[i]));} TimestampEnd = std::clock(); - printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg); + printf("ntz10: %d clocks\n", TimestampEnd - TimestampBeg); if (errors == 0) printf("Passed all %d cases.\n", sizeof(test)/8); diff --git a/test/core/core_func_integer_find_msb.cpp b/test/core/core_func_integer_find_msb.cpp new file mode 100644 index 00000000..21dd124a --- /dev/null +++ b/test/core/core_func_integer_find_msb.cpp @@ -0,0 +1,447 @@ +/////////////////////////////////////////////////////////////////////////////////////////////////// +// OpenGL Mathematics Copyright (c) 2005 - 2014 G-Truc Creation (www.g-truc.net) +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Created : 2014-10-27 +// Updated : 2014-10-27 +// Licence : This source is under MIT licence +// File : test/core/func_integer_find_lsb.cpp +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// This has the programs for computing the number of leading zeros +// in a word. +// Max line length is 57, to fit in hacker.book. +// Compile with g++, not gcc. +#include +#include // To define "exit", req'd by XLC. +#include + +#define LE 1 // 1 for little-endian, 0 for big-endian. + +int pop(unsigned x) { + x = x - ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0F0F0F0F; + x = x + (x << 8); + x = x + (x << 16); + return x >> 24; +} + +int nlz1(unsigned x) { + int n; + + if (x == 0) return(32); + n = 0; + if (x <= 0x0000FFFF) {n = n +16; x = x <<16;} + if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;} + if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;} + if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;} + if (x <= 0x7FFFFFFF) {n = n + 1;} + return n; +} + +int nlz1a(unsigned x) { + int n; + +/* if (x == 0) return(32); */ + if ((int)x <= 0) return (~x >> 26) & 32; + n = 1; + if ((x >> 16) == 0) {n = n +16; x = x <<16;} + if ((x >> 24) == 0) {n = n + 8; x = x << 8;} + if ((x >> 28) == 0) {n = n + 4; x = x << 4;} + if ((x >> 30) == 0) {n = n + 2; x = x << 2;} + n = n - (x >> 31); + return n; +} +// On basic Risc, 12 to 20 instructions. + +int nlz2(unsigned x) { + unsigned y; + int n; + + n = 32; + y = x >>16; if (y != 0) {n = n -16; x = y;} + y = x >> 8; if (y != 0) {n = n - 8; x = y;} + y = x >> 4; if (y != 0) {n = n - 4; x = y;} + y = x >> 2; if (y != 0) {n = n - 2; x = y;} + y = x >> 1; if (y != 0) return n - 2; + return n - x; +} + +// As above but coded as a loop for compactness: +// 23 to 33 basic Risc instructions. +int nlz2a(unsigned x) { + unsigned y; + int n, c; + + n = 32; + c = 16; + do { + y = x >> c; if (y != 0) {n = n - c; x = y;} + c = c >> 1; + } while (c != 0); + return n - x; +} + +int nlz3(int x) { + int y, n; + + n = 0; + y = x; +L: if (x < 0) return n; + if (y == 0) return 32 - n; + n = n + 1; + x = x << 1; + y = y >> 1; + goto L; +} + +int nlz4(unsigned x) { + int y, m, n; + + y = -(x >> 16); // If left half of x is 0, + m = (y >> 16) & 16; // set n = 16. If left half + n = 16 - m; // is nonzero, set n = 0 and + x = x >> m; // shift x right 16. + // Now x is of the form 0000xxxx. + y = x - 0x100; // If positions 8-15 are 0, + m = (y >> 16) & 8; // add 8 to n and shift x left 8. + n = n + m; + x = x << m; + + y = x - 0x1000; // If positions 12-15 are 0, + m = (y >> 16) & 4; // add 4 to n and shift x left 4. + n = n + m; + x = x << m; + + y = x - 0x4000; // If positions 14-15 are 0, + m = (y >> 16) & 2; // add 2 to n and shift x left 2. + n = n + m; + x = x << m; + + y = x >> 14; // Set y = 0, 1, 2, or 3. + m = y & ~(y >> 1); // Set m = 0, 1, 2, or 2 resp. + return n + 2 - m; +} + +int nlz5(unsigned x) { + int pop(unsigned x); + + x = x | (x >> 1); + x = x | (x >> 2); + x = x | (x >> 4); + x = x | (x >> 8); + x = x | (x >>16); + return pop(~x); +} + +/* The four programs below are not valid ANSI C programs. This is +because they refer to the same storage locations as two different types. +However, they work with xlc/AIX, gcc/AIX, and gcc/NT. If you try to +code them more compactly by declaring a variable xx to be "double," and +then using + + n = 1054 - (*((unsigned *)&xx + LE) >> 20); + +then you are violating not only the rule above, but also the ANSI C +rule that pointer arithmetic can be performed only on pointers to +array elements. + When coded with the above statement, the program fails with xlc, +gcc/AIX, and gcc/NT, at some optimization levels. + BTW, these programs use the "anonymous union" feature of C++, not +available in C. */ + +int nlz6(unsigned k) { + union { + unsigned asInt[2]; + double asDouble; + }; + int n; + + asDouble = (double)k + 0.5; + n = 1054 - (asInt[LE] >> 20); + return n; +} + +int nlz7(unsigned k) { + union { + unsigned asInt[2]; + double asDouble; + }; + int n; + + asDouble = (double)k; + n = 1054 - (asInt[LE] >> 20); + n = (n & 31) + (n >> 9); + return n; +} + + /* In single precision, round-to-nearest mode, the basic method fails for: + k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF, + 07FFFFFC <= k <= 07FFFFFF, + 0FFFFFF8 <= k <= 0FFFFFFF, + ... + 7FFFFFC0 <= k <= 7FFFFFFF. + FFFFFF80 <= k <= FFFFFFFF. + For k = 0 it gives 158, and for the other values it is too low by 1. */ + +int nlz8(unsigned k) { + union { + unsigned asInt; + float asFloat; + }; + int n; + + k = k & ~(k >> 1); /* Fix problem with rounding. */ + asFloat = (float)k + 0.5f; + n = 158 - (asInt >> 23); + return n; +} + +/* The example below shows how to make a macro for nlz. It uses an +extension to the C and C++ languages that is provided by the GNU C/C++ +compiler, namely, that of allowing statements and declarations in +expressions (see "Using and Porting GNU CC", by Richard M. Stallman +(1998). The underscores are necessary to protect against the +possibility that the macro argument will conflict with one of its local +variables, e.g., NLZ(k). */ + +int nlz9(unsigned k) { + union { + unsigned asInt; + float asFloat; + }; + int n; + + k = k & ~(k >> 1); /* Fix problem with rounding. */ + asFloat = (float)k; + n = 158 - (asInt >> 23); + n = (n & 31) + (n >> 6); /* Fix problem with k = 0. */ + return n; +} + +/* Below are three nearly equivalent programs for computing the number +of leading zeros in a word. This material is not in HD, but may be in a +future edition. + Immediately below is Robert Harley's algorithm, found at the +comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert +Juffa. + Table entries marked "u" are unused. 14 ops including a multiply, +plus an indexed load. + The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all +of form 2**k + 1). There are no multipliers of three terms of the form +2**k +- 1 that work, with a table size of 64 or 128. There are some, +with a table size of 64, if you precede the multiplication with x = x - +(x >> 1), but that seems less elegant. There are also some if you use a +table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would +save two instructions in the form of this algorithm with the +multiplication expanded into shifts and adds, but the table size is +getting a bit large). */ + +#define u 99 +int nlz10(unsigned x) { + + static char table[64] = + {32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, + u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, + 17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, + 5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u}; + + x = x | (x >> 1); // Propagate leftmost + x = x | (x >> 2); // 1-bit to the right. + x = x | (x >> 4); + x = x | (x >> 8); + x = x | (x >>16); + x = x*0x06EB14F9; // Multiplier is 7*255**3. + return table[x >> 26]; +} + +/* Harley's algorithm with multiply expanded. +19 elementary ops plus an indexed load. */ + +int nlz10a(unsigned x) { + + static char table[64] = + {32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, + u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, + 17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, + 5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u}; + + x = x | (x >> 1); // Propagate leftmost + x = x | (x >> 2); // 1-bit to the right. + x = x | (x >> 4); + x = x | (x >> 8); + x = x | (x >> 16); + x = (x << 3) - x; // Multiply by 7. + x = (x << 8) - x; // Multiply by 255. + x = (x << 8) - x; // Again. + x = (x << 8) - x; // Again. + return table[x >> 26]; +} + +/* Julius Goryavsky's version of Harley's algorithm. +17 elementary ops plus an indexed load, if the machine +has "and not." */ + +int nlz10b(unsigned x) { + + static char table[64] = + {32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u, + u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u, + u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u, + 22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31}; + + x = x | (x >> 1); // Propagate leftmost + x = x | (x >> 2); // 1-bit to the right. + x = x | (x >> 4); + x = x | (x >> 8); + x = x & ~(x >> 16); + x = x*0xFD7049FF; // Activate this line or the following 3. +// x = (x << 9) - x; // Multiply by 511. +// x = (x << 11) - x; // Multiply by 2047. +// x = (x << 14) - x; // Multiply by 16383. + return table[x >> 26]; +} + +int errors; +void error(int x, int y) { + errors = errors + 1; + printf("Error for x = %08x, got %d\n", x, y); +} + +int main() +{ +# ifdef GLM_TEST_ENABLE_PERF + + int i, n; + static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29, + 7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23, + 512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16, + 65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11, + 0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6, + 0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3, + 0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0, + 0xFFFFFFFF,0}; + std::size_t const Count = 10000000; + + n = sizeof(test)/4; + + std::clock_t TimestampBeg = 0; + std::clock_t TimestampEnd = 0; + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg); + + TimestampBeg = std::clock(); + for (std::size_t k = 0; k < Count; ++k) + for (i = 0; i < n; i += 2) { + if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));} + TimestampEnd = std::clock(); + + printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg); + + if (errors == 0) + printf("Passed all %d cases.\n", sizeof(test)/8); + +# endif//GLM_TEST_ENABLE_PERF +}