Optimized findMSB and findLSB

11 years ago · 0bffce4f4b
parent 20bdab33dd
commit 0bffce4f4b
6 changed files with 1105 additions and 472 deletions
--- a/glm/detail/func_integer.inl
+++ b/glm/detail/func_integer.inl
@ -87,6 +87,117 @@ namespace detail
 			return (v & Mask) + ((v >> Shift) & Mask);
 		}
 	};
 	// Generic findLSB implementation for bit widths without a dedicated
 	// specialization: yields the index of the lowest set bit of Value, or -1
 	// when Value is zero.
 	template <typename genIUType, size_t Bits>
 	struct compute_findLSB
 	{
 		GLM_FUNC_QUALIFIER static int call(genIUType Value)
 		{
 			// ~Value & (Value - 1) keeps exactly the bits located below the
 			// lowest set bit, so counting them gives that bit's index.
 			if(Value != 0)
 				return glm::bitCount(~Value & (Value - static_cast<genIUType>(1)));
 			return -1;
 		}
 	};
 #	if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM))
 	// 32-bit findLSB specialization built on the _BitScanForward intrinsic.
 	// Returns the index of the lowest set bit of Value, or -1 when Value is zero.
 	template <typename genIUType>
 	struct compute_findLSB<genIUType, 32>
 	{
 		GLM_FUNC_QUALIFIER static int call(genIUType Value)
 		{
 			unsigned long Result(0);
 			// Convert by value rather than reading Value through an
 			// unsigned long*: the pointer cast breaks strict aliasing and
 			// reads past the 4-byte object wherever unsigned long is 8 bytes.
 			unsigned char IsNotNull = _BitScanForward(&Result, static_cast<unsigned long>(Value));
 			// The intrinsic reports zero when no bit was found.
 			return IsNotNull ? int(Result) : -1;
 		}
 	};
 	// 64-bit findLSB specialization built on the _BitScanForward64 intrinsic.
 	// Returns the index of the lowest set bit of Value, or -1 when Value is zero.
 	template <typename genIUType>
 	struct compute_findLSB<genIUType, 64>
 	{
 		GLM_FUNC_QUALIFIER static int call(genIUType Value)
 		{
 			unsigned long Result(0);
 			// Convert by value rather than reinterpreting the address of
 			// Value, which is only valid for some 64-bit operand types.
 			unsigned char IsNotNull = _BitScanForward64(&Result, static_cast<unsigned __int64>(Value));
 			// The intrinsic reports zero when no bit was found.
 			return IsNotNull ? int(Result) : -1;
 		}
 	};
 #	endif
 	// One smearing step of the vector findMSB: ORs x with x shifted right by
 	// Shift. This primary template is the one used when EXEC is true (the
 	// default).
 	template <typename T, glm::precision P, template <class, glm::precision> class vecType, bool EXEC = true>
 	struct compute_findMSB_step_vec
 	{
 		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, T Shift)
 		{
 			vecType<T, P> const Shifted(x >> Shift);
 			return x | Shifted;
 		}
 	};
 	// Disabled smearing step, selected when EXEC is false: returns x unchanged
 	// and ignores the shift amount.
 	template <typename T, glm::precision P, template <class, glm::precision> class vecType>
 	struct compute_findMSB_step_vec<T, P, vecType, false>
 	{
 		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, T)
 		{
 			return x;
 		}
 	};
 	// Generic vector findMSB. The unnamed std::size_t parameter is the bit
 	// width used to pick a specialization; this primary template ignores it.
 	template <typename T, glm::precision P, template <class, glm::precision> class vecType, std::size_t>
 	struct compute_findMSB_vec
 	{
 		GLM_FUNC_QUALIFIER static vecType<int, P> call(vecType<T, P> const & vec)
 		{
 			vecType<T, P> x(vec);
 			// Smear set bits downwards with shifts of 1, 2, 4, ... Each step
 			// is compiled in only when T is wide enough for that shift; the
 			// bool template argument selects the no-op variant otherwise.
 			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >=  8>::call(x, static_cast<T>( 1));
 			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >=  8>::call(x, static_cast<T>( 2));
 			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >=  8>::call(x, static_cast<T>( 4));
 			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 16>::call(x, static_cast<T>( 8));
 			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 32>::call(x, static_cast<T>(16));
 			x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 64>::call(x, static_cast<T>(32));
 			// The bits still clear in x are counted through ~x and subtracted
 			// from the top bit index (bit width - 1).
 			// NOTE(review): for signed T the shifts above are presumably
 			// arithmetic, which changes the result for negative inputs - confirm.
 			return vecType<int, P>(sizeof(T) * 8 - 1) - glm::bitCount(~x);
 		}
 	};
 #	if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM))
 	// Index of the highest set bit of Value obtained with the _BitScanReverse
 	// intrinsic, or -1 when no bit is set.
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int compute_findMSB_32(genIUType Value)
 	{
 		unsigned long Result(0);
 		// Convert by value rather than reading Value through an
 		// unsigned long*: the pointer cast breaks strict aliasing and reads
 		// past the 4-byte object wherever unsigned long is 8 bytes.
 		unsigned char IsNotNull = _BitScanReverse(&Result, static_cast<unsigned long>(Value));
 		// The intrinsic reports zero when no bit was found.
 		return IsNotNull ? int(Result) : -1;
 	}
 	// Index of the highest set bit of Value obtained with the
 	// _BitScanReverse64 intrinsic, or -1 when no bit is set.
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int compute_findMSB_64(genIUType Value)
 	{
 		unsigned long Result(0);
 		// Convert by value rather than reinterpreting the address of Value,
 		// which is only valid for some 64-bit operand types.
 		unsigned char IsNotNull = _BitScanReverse64(&Result, static_cast<unsigned __int64>(Value));
 		// The intrinsic reports zero when no bit was found.
 		return IsNotNull ? int(Result) : -1;
 	}
 	// 32-bit specialization of the vector findMSB: hands compute_findMSB_32 to
 	// detail::functor1 (presumably applied once per component - confirm
 	// against functor1).
 	template <typename T, glm::precision P, template <class, glm::precision> class vecType>
 	struct compute_findMSB_vec<T, P, vecType, 32>
 	{
 		// Returns vecType<int, P>, matching the primary template; the former
 		// plain int return type could not carry the vector produced below.
 		GLM_FUNC_QUALIFIER static vecType<int, P> call(vecType<T, P> const & x)
 		{
 			return detail::functor1<int, T, P, vecType>::call(compute_findMSB_32, x);
 		}
 	};
 	// 64-bit specialization of the vector findMSB: hands compute_findMSB_64 to
 	// detail::functor1 (presumably applied once per component - confirm
 	// against functor1).
 	template <typename T, glm::precision P, template <class, glm::precision> class vecType>
 	struct compute_findMSB_vec<T, P, vecType, 64>
 	{
 		// Returns vecType<int, P>, matching the primary template; the former
 		// plain int return type could not carry the vector produced below.
 		GLM_FUNC_QUALIFIER static vecType<int, P> call(vecType<T, P> const & x)
 		{
 			return detail::functor1<int, T, P, vecType>::call(compute_findMSB_64, x);
 		}
 	};
 #	endif
 }//namespace detail
 	// uaddCarry
@ -248,12 +359,8 @@ namespace detail
 	GLM_FUNC_QUALIFIER int findLSB(genIUType Value)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
 		if(Value == 0)
 			return -1;
-		genIUType Bit;
+		return detail::compute_findLSB<genIUType, sizeof(genIUType) * 8>::call(Value);
 		for(Bit = genIUType(0); !(Value & (1 << Bit)); ++Bit){}
 		return Bit;
 	}
 	template <typename T, precision P, template <typename, precision> class vecType>
@ -265,89 +372,19 @@ namespace detail
 	}
 	// findMSB
 #if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & GLM_COMPILER_VC)
 	// Scalar findMSB for the Visual C++ intrinsic path: index of the highest
 	// set bit of Value, or -1 when Value is zero.
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int findMSB(genIUType Value)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
 		unsigned long Index(0);
 		if(Value != 0)
 		{
 			// Only scanned for non-zero input, so Index is always written.
 			_BitScanReverse(&Index, Value);
 			return int(Index);
 		}
 		return -1;
 	}
 /*
 // __builtin_clz seems to be buggy as it crashes for some values, from 0x00200000 to 0x80000000
 #elif((GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & GLM_COMPILER_GCC) && (GLM_COMPILER >= GLM_COMPILER_GCC40))
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int findMSB
 	(
 		genIUType const & Value
 	)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
 		if(Value == 0)
 			return -1;
 		// clz returns the number of leading 0-bits; see
 		// http://gcc.gnu.org/onlinedocs/gcc-4.7.1/gcc/Other-Builtins.html
 		//
 		// Note: Because __builtin_clz only works for unsigned ints, this
 		// implementation will not work for 64-bit integers.
 		//
 		return 31 - __builtin_clzl(Value);
 	}
 */
 #else
 /* SSE implementation idea
 		__m128i const Zero = _mm_set_epi32( 0,  0,  0,  0);
 		__m128i const One = _mm_set_epi32( 1,  1,  1,  1);
 		__m128i Bit = _mm_set_epi32(-1, -1, -1, -1);
 		__m128i Tmp = _mm_set_epi32(Value, Value, Value, Value);
 		__m128i Mmi = Zero;
 		for(int i = 0; i < 32; ++i)
 		{
 			__m128i Shilt = _mm_and_si128(_mm_cmpgt_epi32(Tmp, One), One);
 			Tmp = _mm_srai_epi32(Tmp, One);
 			Bit = _mm_add_epi32(Bit, _mm_and_si128(Shilt, i));
 			Mmi = _mm_and_si128(Mmi, One);
 		}
 		return Bit;
 */
 	template <typename genIUType>
-	GLM_FUNC_QUALIFIER int findMSB(genIUType Value)
+	GLM_FUNC_QUALIFIER int findMSB(genIUType x)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
-		if(Value == genIUType(0) || Value == genIUType(-1))
+		return findMSB(tvec1<genIUType>(x)).x;
 			return -1;
 		else if(Value > 0)
 		{
 			genIUType Bit = genIUType(-1);
 			for(genIUType tmp = Value; tmp > 0; tmp >>= 1, ++Bit){}
 			return Bit;
 		}
 		else //if(Value < 0)
 		{
 			int const BitCount(sizeof(genIUType) * 8);
 			int MostSignificantBit(-1);
 			for(int BitIndex(0); BitIndex < BitCount; ++BitIndex)
 				MostSignificantBit = (Value & (1 << BitIndex)) ? MostSignificantBit : BitIndex;
 			assert(MostSignificantBit >= 0);
 			return MostSignificantBit;
 		}
 	}
 #endif//(GLM_COMPILER)
 	template <typename T, precision P, template <typename, precision> class vecType>
 	GLM_FUNC_QUALIFIER vecType<int, P> findMSB(vecType<T, P> const & x)
 	{
-		return detail::functor1<int, T, P, vecType>::call(findMSB, x);
+		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_integer, "'findMSB' only accept integer values");
 		return detail::compute_findMSB_vec<T, P, vecType, sizeof(T) * 8>::call(x);
 	}
 }//namespace glm
--- a/readme.txt
+++ b/readme.txt
@ -67,16 +67,16 @@ Improvements:
 - Undetected C++ compiler automatically compile with GLM_FORCE_CXX98 and 
  GLM_FORCE_PURE
 - Added not function (from GLSL specification) on VC12
 - Optimized bitfield operations
 - Optimized bitfieldReverse and bitCount functions
 - Optimized findLSB and findMSB functions.
 - Optimized matrix-vector multiply performance with Cuda #257, #258
 - Reduced integer type redefinitions #233
 - Rewrote GTX_fast_trigonometry #264 #265
 - Made types trivially copyable #263
 - Removed <iostream> in GLM tests
 - Used std features within GLM without redeclaring
- Optimized glm::cot #272
+- Optimized cot function #272
- Optimized glm::sign #272
+- Optimized sign function #272
 Fixes:
 - Fixed std::nextafter not supported with C++11 on Android #217
--- a/test/core/CMakeLists.txt
+++ b/test/core/CMakeLists.txt
@ -22,6 +22,7 @@ glmCreateTestGTC(core_func_geometric)
 glmCreateTestGTC(core_func_integer)
 glmCreateTestGTC(core_func_integer_bit_count)
 glmCreateTestGTC(core_func_integer_find_lsb)
 glmCreateTestGTC(core_func_integer_find_msb)
 glmCreateTestGTC(core_func_matrix)
 glmCreateTestGTC(core_func_noise)
 glmCreateTestGTC(core_func_packing)
--- a/test/core/core_func_integer.cpp
+++ b/test/core/core_func_integer.cpp
@ -8,6 +8,7 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 #include <glm/integer.hpp>
 #include <glm/vector_relational.hpp>
 #include <glm/gtc/vec1.hpp>
 #include <vector>
 #include <ctime>
@ -555,6 +556,19 @@ namespace findMSB
 		genType		Return;
 	};
 	// Reference findMSB built directly on the _BitScanReverse intrinsic:
 	// index of the highest set bit of Value, or -1 when Value is zero.
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int findMSB_intrinsic(genIUType Value)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
 		unsigned long Index(0);
 		if(Value != 0)
 		{
 			// Only scanned for non-zero input, so Index is always written.
 			_BitScanReverse(&Index, Value);
 			return int(Index);
 		}
 		return -1;
 	}
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int findMSB_095(genIUType Value)
 	{
@ -583,27 +597,17 @@ namespace findMSB
 	GLM_FUNC_QUALIFIER int findMSB_nlz1(genIUType x)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
 /*
 		int Result = 0;
 		for(std::size_t i = 0, n = sizeof(genIUType) * 8; i < n; ++i)
 			Result = Value & static_cast<genIUType>(1 << i) ? static_cast<int>(i) : Result;
 		return Result;
 */
 /*
 		genIUType Bit = genIUType(-1);
 		for(genIUType tmp = Value; tmp > 0; tmp >>= 1, ++Bit){}
 		return Bit;
 */
 		int n;
-		if (x == 0) return(32);
+		if (x == 0)
-		n = 0;
+			return -1;
 		int n = 0;
 		if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
 		if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
 		if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
 		if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
 		if (x <= 0x7FFFFFFF) {n = n + 1;}
-		return n;
+		return 31 - n;
 	}
 	int findMSB_nlz2(unsigned int x)
@ -617,14 +621,24 @@ namespace findMSB
 		y = x >> 4;  if (y != 0) {n = n - 4;  x = y;}
 		y = x >> 2;  if (y != 0) {n = n - 2;  x = y;}
 		y = x >> 1;  if (y != 0) return n - 2;
-		return n - x;
+		return 32 - (n - x);
 	}
-	int perf_950()
+	int findMSB_pop(unsigned int x)
 	{
-		type<glm::uint> const Data[] =
+		x = x | (x >> 1);
 		x = x | (x >> 2);
 		x = x | (x >> 4);
 		x = x | (x >> 8);
 		x = x | (x >>16);
 		return 31 - glm::bitCount(~x);
 	}
 	int perf_int()
 	{
 		type<int> const Data[] =
 		{
-			//{0x00000000, -1},
+			{0x00000000, -1},
 			{0x00000001,  0},
 			{0x00000002,  1},
 			{0x00000003,  1},
@ -662,141 +676,131 @@ namespace findMSB
 		};
 		int Error(0);
 		std::size_t const Count(1000000);
 		std::clock_t Timestamps0 = std::clock();
 		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
 			int Result = glm::findMSB(Data[i].Value);
 			Error += Data[i].Return == Result ? 0 : 1;
 		}
 		std::clock_t Timestamps1 = std::clock();
-		for(std::size_t k = 0; k < 1000000; ++k)
+		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
-			int Result = findMSB_095(Data[i].Value);
+			int Result = findMSB_nlz1(Data[i].Value);
 			Error += Data[i].Return == Result ? 0 : 1;
 		}
 		std::clock_t Timestamps2 = std::clock();
-		std::printf("findMSB - 0.9.5: %d clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
+		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
 			int Result = findMSB_nlz2(Data[i].Value);
 			Error += Data[i].Return == Result ? 0 : 1;
 		}
-		return Error;
+		std::clock_t Timestamps3 = std::clock();
 	}
-	int perf_ops()
+		for(std::size_t k = 0; k < Count; ++k)
-	{
+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		type<int> const Data[] =
 		{
-			{0x00000000, -1},
+			int Result = findMSB_095(Data[i].Value);
-			{0x00000001,  0},
+			Error += Data[i].Return == Result ? 0 : 1;
-			{0x00000002,  1},
+		}
 			{0x00000003,  1},
 			{0x00000004,  2},
 			{0x00000005,  2},
 			{0x00000007,  2},
 			{0x00000008,  3},
 			{0x00000010,  4},
 			{0x00000020,  5},
 			{0x00000040,  6},
 			{0x00000080,  7},
 			{0x00000100,  8},
 			{0x00000200,  9},
 			{0x00000400, 10},
 			{0x00000800, 11},
 			{0x00001000, 12},
 			{0x00002000, 13},
 			{0x00004000, 14},
 			{0x00008000, 15},
 			{0x00010000, 16},
 			{0x00020000, 17},
 			{0x00040000, 18},
 			{0x00080000, 19},
 			{0x00100000, 20},
 			{0x00200000, 21},
 			{0x00400000, 22},
 			{0x00800000, 23},
 			{0x01000000, 24},
 			{0x02000000, 25},
 			{0x04000000, 26},
 			{0x08000000, 27},
 			{0x10000000, 28},
 			{0x20000000, 29},
 			{0x40000000, 30}
 		};
-		int Error(0);
+		std::clock_t Timestamps4 = std::clock();
-		std::clock_t Timestamps1 = std::clock();
+		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
 			int Result = findMSB_intrinsic(Data[i].Value);
 			Error += Data[i].Return == Result ? 0 : 1;
 		}
 		std::clock_t Timestamps5 = std::clock();
-		for(std::size_t k = 0; k < 1000000; ++k)
+		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
-			int Result = findMSB_nlz1(Data[i].Value);
+			int Result = findMSB_pop(Data[i].Value);
 			Error += Data[i].Return == Result ? 0 : 1;
 		}
-		std::clock_t Timestamps2 = std::clock();
+		std::clock_t Timestamps6 = std::clock();
 		std::printf("glm::findMSB: %d clocks\n", static_cast<unsigned int>(Timestamps1 - Timestamps0));
 		std::printf("findMSB - nlz1: %d clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
 		std::printf("findMSB - nlz2: %d clocks\n", static_cast<unsigned int>(Timestamps3 - Timestamps2));
 		std::printf("findMSB - 0.9.5: %d clocks\n", static_cast<unsigned int>(Timestamps4 - Timestamps3));
 		std::printf("findMSB - intrinsics: %d clocks\n", static_cast<unsigned int>(Timestamps5 - Timestamps4));
 		std::printf("findMSB - pop: %d clocks\n", static_cast<unsigned int>(Timestamps6 - Timestamps5));
 		return Error;
 	}
-
+	int test_ivec4()
 	int test_findMSB()
 	{
-		type<glm::uint> const Data[] =
+		type<glm::ivec4> const Data[] =
 		{
-			//{0x00000000, -1},
+			{glm::ivec4(0x00000000), glm::ivec4(-1)},
-			{0x00000001,  0},
+			{glm::ivec4(0x00000001), glm::ivec4( 0)},
-			{0x00000002,  1},
+			{glm::ivec4(0x00000002), glm::ivec4( 1)},
-			{0x00000003,  1},
+			{glm::ivec4(0x00000003), glm::ivec4( 1)},
-			{0x00000004,  2},
+			{glm::ivec4(0x00000004), glm::ivec4( 2)},
-			{0x00000005,  2},
+			{glm::ivec4(0x00000005), glm::ivec4( 2)},
-			{0x00000007,  2},
+			{glm::ivec4(0x00000007), glm::ivec4( 2)},
-			{0x00000008,  3},
+			{glm::ivec4(0x00000008), glm::ivec4( 3)},
-			{0x00000010,  4},
+			{glm::ivec4(0x00000010), glm::ivec4( 4)},
-			{0x00000020,  5},
+			{glm::ivec4(0x00000020), glm::ivec4( 5)},
-			{0x00000040,  6},
+			{glm::ivec4(0x00000040), glm::ivec4( 6)},
-			{0x00000080,  7},
+			{glm::ivec4(0x00000080), glm::ivec4( 7)},
-			{0x00000100,  8},
+			{glm::ivec4(0x00000100), glm::ivec4( 8)},
-			{0x00000200,  9},
+			{glm::ivec4(0x00000200), glm::ivec4( 9)},
-			{0x00000400, 10},
+			{glm::ivec4(0x00000400), glm::ivec4(10)},
-			{0x00000800, 11},
+			{glm::ivec4(0x00000800), glm::ivec4(11)},
-			{0x00001000, 12},
+			{glm::ivec4(0x00001000), glm::ivec4(12)},
-			{0x00002000, 13},
+			{glm::ivec4(0x00002000), glm::ivec4(13)},
-			{0x00004000, 14},
+			{glm::ivec4(0x00004000), glm::ivec4(14)},
-			{0x00008000, 15},
+			{glm::ivec4(0x00008000), glm::ivec4(15)},
-			{0x00010000, 16},
+			{glm::ivec4(0x00010000), glm::ivec4(16)},
-			{0x00020000, 17},
+			{glm::ivec4(0x00020000), glm::ivec4(17)},
-			{0x00040000, 18},
+			{glm::ivec4(0x00040000), glm::ivec4(18)},
-			{0x00080000, 19},
+			{glm::ivec4(0x00080000), glm::ivec4(19)},
-			{0x00100000, 20},
+			{glm::ivec4(0x00100000), glm::ivec4(20)},
-			{0x00200000, 21},
+			{glm::ivec4(0x00200000), glm::ivec4(21)},
-			{0x00400000, 22},
+			{glm::ivec4(0x00400000), glm::ivec4(22)},
-			{0x00800000, 23},
+			{glm::ivec4(0x00800000), glm::ivec4(23)},
-			{0x01000000, 24},
+			{glm::ivec4(0x01000000), glm::ivec4(24)},
-			{0x02000000, 25},
+			{glm::ivec4(0x02000000), glm::ivec4(25)},
-			{0x04000000, 26},
+			{glm::ivec4(0x04000000), glm::ivec4(26)},
-			{0x08000000, 27},
+			{glm::ivec4(0x08000000), glm::ivec4(27)},
-			{0x10000000, 28},
+			{glm::ivec4(0x10000000), glm::ivec4(28)},
-			{0x20000000, 29},
+			{glm::ivec4(0x20000000), glm::ivec4(29)},
-			{0x40000000, 30}
+			{glm::ivec4(0x40000000), glm::ivec4(30)}
 		};
 		int Error(0);
-		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
+		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<glm::ivec4>); ++i)
 		{
-			int Result = glm::findMSB(Data[i].Value);
+			glm::ivec4 Result0 = glm::findMSB(Data[i].Value);
-			Error += Data[i].Return == Result ? 0 : 1;
+			Error += glm::all(glm::equal(Data[i].Return, Result0)) ? 0 : 1;
 			assert(!Error);
 		}
 		return Error;
 	}
-	int test_nlz1()
+	int test_int()
 	{
 		type<glm::uint> const Data[] =
 		{
-			//{0x00000000, -1},
+			{0x00000000, -1},
 			{0x00000001,  0},
 			{0x00000002,  1},
 			{0x00000003,  1},
@ -837,8 +841,38 @@ namespace findMSB
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
-			int Result = findMSB_nlz2(Data[i].Value);
+			int Result0 = glm::findMSB(Data[i].Value);
-			Error += Data[i].Return == Result ? 0 : 1;
+			Error += Data[i].Return == Result0 ? 0 : 1;
 		}
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
 			int Result0 = findMSB_nlz1(Data[i].Value);
 			Error += Data[i].Return == Result0 ? 0 : 1;
 		}
 /*
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
 			int Result0 = findMSB_nlz2(Data[i].Value);
 			Error += Data[i].Return == Result0 ? 0 : 1;
 		}
 */
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
 			int Result0 = findMSB_095(Data[i].Value);
 			Error += Data[i].Return == Result0 ? 0 : 1;
 		}
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
 			int Result0 = findMSB_intrinsic(Data[i].Value);
 			Error += Data[i].Return == Result0 ? 0 : 1;
 		}
 		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
 		{
 			int Result0 = findMSB_pop(Data[i].Value);
 			Error += Data[i].Return == Result0 ? 0 : 1;
 		}
 		return Error;
@ -848,8 +882,8 @@ namespace findMSB
 	{
 		int Error(0);
-		Error += test_findMSB();
+		Error += test_ivec4();
-		//Error += test_nlz1();
+		Error += test_int();
 		return Error;
 	}
@ -858,8 +892,7 @@ namespace findMSB
 	{
 		int Error(0);
-		Error += perf_950();
+		Error += perf_int();
 		//Error += perf_ops();
 		return Error;
 	}
@ -878,20 +911,172 @@ namespace findLSB
 	{
 		{0x00000001,  0},
 		{0x00000003,  0},
-		{0x00000002,  1}
+		{0x00000002,  1},
 		{0x80000000, 31},
 		{0x00010000, 16},
 		{0xFFFF0000, 16},
 		{0xFF000000, 24},
 		{0xFF00FF00,  8},
 		{0x00000000, -1}
 	};
 	// Reference findLSB built directly on the _BitScanForward intrinsic:
 	// index of the lowest set bit of Value, or -1 when Value is zero.
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int findLSB_intrinsic(genIUType Value)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
 		unsigned long Index(0);
 		if(Value != 0)
 		{
 			// Only scanned for non-zero input, so Index is always written.
 			_BitScanForward(&Index, Value);
 			return int(Index);
 		}
 		return -1;
 	}
 	// Loop-based findLSB kept as the comparison baseline labelled "0.9.5" in
 	// the perf output: tests bit 0, 1, 2, ... until a set bit is found.
 	// Returns that bit's index, or -1 when Value is zero.
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int findLSB_095(genIUType Value)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
 		if(Value == 0)
 			return -1;
 		// The probe mask is a one of the operand's own type: shifting the
 		// plain int literal 1 is undefined for bit indices of 32 or more, so
 		// 64-bit operands could never be scanned past bit 31.
 		int Bit(0);
 		// Value is non-zero here, so a set bit is reached within its width.
 		while(!(Value & (static_cast<genIUType>(1) << Bit)))
 			++Bit;
 		return Bit;
 	}
 	// Population-count based findLSB: index of the lowest set bit of x, or -1
 	// when x is zero.
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int findLSB_ntz2(genIUType x)
 	{
 		// ~x & (x - 1) keeps exactly the bits below the lowest set bit.
 		if(x != 0)
 			return glm::bitCount(~x & (x - static_cast<genIUType>(1)));
 		return -1;
 	}
 	// findLSB written without an if statement: the zero test is turned into
 	// 0/1 integer factors that select between the bit count and -1.
 	template <typename genIUType>
 	GLM_FUNC_QUALIFIER int findLSB_branchfree(genIUType x)
 	{
 		bool const Zero(x == 0);
 		int const WhenZero(Zero);     // 1 for a zero input, 0 otherwise
 		int const WhenSet(!Zero);     // 1 for a non-zero input, 0 otherwise
 		int const Below(static_cast<int>(glm::bitCount(~x & (x - static_cast<genIUType>(1)))));
 		return Below * WhenSet + WhenZero * -1;
 	}
 	// Runs glm::findLSB and each local reference implementation over the
 	// DataI32 table and returns the number of results that differ from the
 	// expected value stored with each entry.
 	int test_int()
 	{
 		std::size_t const Size = sizeof(DataI32) / sizeof(type<int>);
 		int Error(0);
 		for(std::size_t Index = 0; Index < Size; ++Index)
 		{
 			int const Found = glm::findLSB(DataI32[Index].Value);
 			if(!(DataI32[Index].Return == Found))
 				++Error;
 		}
 		for(std::size_t Index = 0; Index < Size; ++Index)
 		{
 			int const Found = findLSB_095(DataI32[Index].Value);
 			if(!(DataI32[Index].Return == Found))
 				++Error;
 		}
 		for(std::size_t Index = 0; Index < Size; ++Index)
 		{
 			int const Found = findLSB_intrinsic(DataI32[Index].Value);
 			if(!(DataI32[Index].Return == Found))
 				++Error;
 		}
 		for(std::size_t Index = 0; Index < Size; ++Index)
 		{
 			int const Found = findLSB_ntz2(DataI32[Index].Value);
 			if(!(DataI32[Index].Return == Found))
 				++Error;
 		}
 		for(std::size_t Index = 0; Index < Size; ++Index)
 		{
 			int const Found = findLSB_branchfree(DataI32[Index].Value);
 			if(!(DataI32[Index].Return == Found))
 				++Error;
 		}
 		return Error;
 	}
 	int test()
 	{
 		int Error(0);
 		Error += test_int();
 		return Error;
 	}
 	// Times glm::findLSB and each local reference implementation over the
 	// DataI32 table, repeated Count times, and prints the clock ticks spent in
 	// each variant. Returns the number of results that differ from the
 	// expected value stored with each entry.
 	int perf_int()
 	{
 		int Error(0);
 		std::size_t const Count(10000000);
 		std::clock_t Timestamps0 = std::clock();
 		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
 		{
 			int Result = glm::findLSB(DataI32[i].Value);
 			Error += DataI32[i].Return == Result ? 0 : 1;
 			assert(!Error);
 		}
 		std::clock_t Timestamps1 = std::clock();
 		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
 		{
 			int Result = findLSB_095(DataI32[i].Value);
 			Error += DataI32[i].Return == Result ? 0 : 1;
 		}
 		std::clock_t Timestamps2 = std::clock();
 		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
 		{
 			int Result = findLSB_intrinsic(DataI32[i].Value);
 			Error += DataI32[i].Return == Result ? 0 : 1;
 		}
 		std::clock_t Timestamps3 = std::clock();
 		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
 		{
 			int Result = findLSB_ntz2(DataI32[i].Value);
 			Error += DataI32[i].Return == Result ? 0 : 1;
 		}
 		std::clock_t Timestamps4 = std::clock();
 		for(std::size_t k = 0; k < Count; ++k)
 		for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
 		{
 			int Result = findLSB_branchfree(DataI32[i].Value);
 			Error += DataI32[i].Return == Result ? 0 : 1;
 		}
 		std::clock_t Timestamps5 = std::clock();
 		// The tick counts are cast to unsigned int, so they are printed with
 		// %u; the previous %d did not match the argument type.
 		std::printf("glm::findLSB: %u clocks\n", static_cast<unsigned int>(Timestamps1 - Timestamps0));
 		std::printf("findLSB - 0.9.5: %u clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
 		std::printf("findLSB - intrinsics: %u clocks\n", static_cast<unsigned int>(Timestamps3 - Timestamps2));
 		std::printf("findLSB - ntz2: %u clocks\n", static_cast<unsigned int>(Timestamps4 - Timestamps3));
 		std::printf("findLSB - branchfree: %u clocks\n", static_cast<unsigned int>(Timestamps5 - Timestamps4));
 		return Error;
 	}
 	int perf()
 	{
 		int Error(0);
 		Error += perf_int();
 		return Error;
 	}
 }//findLSB
@ -1324,6 +1509,7 @@ int main()
 		Error += ::bitCount::perf();
 		Error += ::bitfieldReverse::perf();
 		Error += ::findMSB::perf();
 		Error += ::findLSB::perf();
 #	endif
 	return Error;
--- a/test/core/core_func_integer_find_lsb.cpp
+++ b/test/core/core_func_integer_find_lsb.cpp
@ -7,15 +7,23 @@
 // File    : test/core/func_integer_find_lsb.cpp
 ///////////////////////////////////////////////////////////////////////////////////////////////////
-// This has the programs for computing the number of leading zeros
+// This has the programs for computing the number of trailing zeros
 // in a word.
 // Max line length is 57, to fit in hacker.book.
 // Compile with g++, not gcc.
 #include <cstdio>
-#include <cstdlib>     // To define "exit", req'd by XLC.
+#include <cstdlib>     //To define "exit", req'd by XLC.
 #include <ctime>
-#define LE 1            // 1 for little-endian, 0 for big-endian.
+int nlz(unsigned x) {
   int pop(unsigned x);
   x = x | (x >> 1);
   x = x | (x >> 2);
   x = x | (x >> 4);
   x = x | (x >> 8);
   x = x | (x >>16);
   return pop(~x);
 }
 int pop(unsigned x) {
   x = x - ((x >> 1) & 0x55555555);
@ -26,280 +34,230 @@ int pop(unsigned x) {
   return x >> 24;
 }
-int nlz1(unsigned x) {
+int ntz1(unsigned x) {
-   int n;
+   return 32 - nlz(~x & (x-1));
 }
-   if (x == 0) return(32);
+int ntz2(unsigned x) {
-   n = 0;
+   return pop(~x & (x - 1));
   if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
   if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
   if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
   if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
   if (x <= 0x7FFFFFFF) {n = n + 1;}
   return n;
 }
-int nlz1a(unsigned x) {
+int ntz3(unsigned x) {
   int n;
-/* if (x == 0) return(32); */
+   if (x == 0) return(32);
   if ((int)x <= 0) return (~x >> 26) & 32;
   n = 1;
-   if ((x >> 16) == 0) {n = n +16; x = x <<16;}
+   if ((x & 0x0000FFFF) == 0) {n = n +16; x = x >>16;}
-   if ((x >> 24) == 0) {n = n + 8; x = x << 8;}
+   if ((x & 0x000000FF) == 0) {n = n + 8; x = x >> 8;}
-   if ((x >> 28) == 0) {n = n + 4; x = x << 4;}
+   if ((x & 0x0000000F) == 0) {n = n + 4; x = x >> 4;}
-   if ((x >> 30) == 0) {n = n + 2; x = x << 2;}
+   if ((x & 0x00000003) == 0) {n = n + 2; x = x >> 2;}
-   n = n - (x >> 31);
+   return n - (x & 1);
   return n;
 }
 // On basic Risc, 12 to 20 instructions.
-int nlz2(unsigned x) {
+int ntz4(unsigned x) {
   unsigned y;
   int n;
-   n = 32;
+   if (x == 0) return 32;
-   y = x >>16;  if (y != 0) {n = n -16;  x = y;}
+   n = 31;
-   y = x >> 8;  if (y != 0) {n = n - 8;  x = y;}
+   y = x <<16;  if (y != 0) {n = n -16;  x = y;}
-   y = x >> 4;  if (y != 0) {n = n - 4;  x = y;}
+   y = x << 8;  if (y != 0) {n = n - 8;  x = y;}
-   y = x >> 2;  if (y != 0) {n = n - 2;  x = y;}
+   y = x << 4;  if (y != 0) {n = n - 4;  x = y;}
-   y = x >> 1;  if (y != 0) return n - 2;
+   y = x << 2;  if (y != 0) {n = n - 2;  x = y;}
-   return n - x;
+   y = x << 1;  if (y != 0) {n = n - 1;}
   return n;
 }
-// As above but coded as a loop for compactness:
+int ntz4a(unsigned x) {
 // 23 to 33 basic Risc instructions.
 int nlz2a(unsigned x) {
   unsigned y;
-   int n, c;
+   int n;
   n = 32;
   c = 16;
   do {
      y = x >> c;  if (y != 0) {n = n - c;  x = y;}
      c = c >> 1;
   } while (c != 0);
   return n - x;
 }
-int nlz3(int x) {
+   if (x == 0) return 32;
-   int y, n;
+   n = 31;
-
+   y = x <<16;  if (y != 0) {n = n -16;  x = y;}
-   n = 0;
+   y = x << 8;  if (y != 0) {n = n - 8;  x = y;}
-   y = x;
+   y = x << 4;  if (y != 0) {n = n - 4;  x = y;}
-L: if (x < 0) return n;
+   y = x << 2;  if (y != 0) {n = n - 2;  x = y;}
-   if (y == 0) return 32 - n;
+   n = n - ((x << 1) >> 31);
-   n = n + 1;
+   return n;
   x = x << 1;
   y = y >> 1;
   goto L;
 }
-int nlz4(unsigned x) {
+int ntz5(char x)
-   int y, m, n;
+{
-
+	if (x & 15) {
-   y = -(x >> 16);      // If left half of x is 0,
+		if (x & 3) {
-   m = (y >> 16) & 16;  // set n = 16.  If left half
+			if (x & 1) return 0;
-   n = 16 - m;          // is nonzero, set n = 0 and
+			else return 1;
-   x = x >> m;          // shift x right 16.
+		}
-                        // Now x is of the form 0000xxxx.
+		else if (x & 4) return 2;
-   y = x - 0x100;       // If positions 8-15 are 0,
+		else return 3;
-   m = (y >> 16) & 8;   // add 8 to n and shift x left 8.
+	}
-   n = n + m;
+	else if (x & 0x30) {
-   x = x << m;
+		if (x & 0x10) return 4;
-
+		else return 5;
-   y = x - 0x1000;      // If positions 12-15 are 0,
+	}
-   m = (y >> 16) & 4;   // add 4 to n and shift x left 4.
+	else if (x & 0x40) return 6;
-   n = n + m;
+	else if (x) return 7;
-   x = x << m;
+	else return 8;
   y = x - 0x4000;      // If positions 14-15 are 0,
   m = (y >> 16) & 2;   // add 2 to n and shift x left 2.
   n = n + m;
   x = x << m;
   y = x >> 14;         // Set y = 0, 1, 2, or 3.
   m = y & ~(y >> 1);   // Set m = 0, 1, 2, or 2 resp.
   return n + 2 - m;
 }
-int nlz5(unsigned x) {
+int ntz6(unsigned x) {
-   int pop(unsigned x);
+   int n;
-   x = x | (x >> 1);
+   x = ~x & (x - 1);
-   x = x | (x >> 2);
+   n = 0;                       // n = 32;
-   x = x | (x >> 4);
+   while(x != 0) {              // while (x != 0) {
-   x = x | (x >> 8);
+      n = n + 1;                //    n = n - 1;
-   x = x | (x >>16);
+      x = x >> 1;               //    x = x + x;
-   return pop(~x);
+   }                            // }
   return n;                    // return n;
 }
-/* The four programs below are not valid ANSI C programs.  This is
+int ntz6a(unsigned x)
-because they refer to the same storage locations as two different types.
+{
-However, they work with xlc/AIX, gcc/AIX, and gcc/NT.  If you try to
+	int n = 32;
 code them more compactly by declaring a variable xx to be "double," and
 then using
   n = 1054 - (*((unsigned *)&xx + LE) >> 20);
 then you are violating not only the rule above, but also the ANSI C
 rule that pointer arithmetic can be performed only on pointers to
 array elements.
   When coded with the above statement, the program fails with xlc,
 gcc/AIX, and gcc/NT, at some optimization levels.
   BTW, these programs use the "anonymous union" feature of C++, not
 available in C. */
 int nlz6(unsigned k) {
   union {
      unsigned asInt[2];
      double asDouble;
   };
   int n;
-   asDouble = (double)k + 0.5;
+	while (x != 0) {
-   n = 1054 - (asInt[LE] >> 20);
+		n = n - 1;
-   return n;
+		x = x + x;
 	}
 	return n;
 }
-int nlz7(unsigned k) {
+/* Dean Gaudet's algorithm. To be most useful there must be a good way
-   union {
+to evaluate the C "conditional expression" (a?b:c construction) without
-      unsigned asInt[2];
+branching. The result of a?b:c is b if a is true (nonzero), and c if a
-      double asDouble;
+is false (0).
-   };
+   For example, a compare to zero op that sets a target GPR to 1 if the
-   int n;
+operand is 0, and to 0 if the operand is nonzero, will do it. With this
 instruction, the algorithm is entirely branch-free. But the most
 interesting thing about it is the high degree of parallelism. All six
 lines with conditional expressions can be executed in parallel (on a
 machine with sufficient computational units).
   Although the instruction count is 30 measured statically, it could
 execute in only 10 cycles on a machine with sufficient parallelism.
   The first two uses of y can instead be x, which would increase the
 useful parallelism on most machines (the assignments to y, bz, and b4
 could then all run in parallel). */
 // Counts the trailing zero bits of x using conditional expressions only.
 // Each mask test contributes one binary digit of the answer; a zero input
 // fails every test and therefore yields 1 + 16 + 8 + 4 + 2 + 1 = 32.
 int ntz7(unsigned x)
 {
 	unsigned const lowest = x & -x;   // Keeps only the rightmost 1-bit.
 	unsigned total = lowest ? 0 : 1;  // 1 only when x has no set bit.
 	total += (lowest & 0x0000FFFF) ? 0 : 16;
 	total += (lowest & 0x00FF00FF) ? 0 : 8;
 	total += (lowest & 0x0F0F0F0F) ? 0 : 4;
 	total += (lowest & 0x33333333) ? 0 : 2;
 	total += (lowest & 0x55555555) ? 0 : 1;
 	return total;
 }
-   asDouble = (double)k;
+int ntz7_christophe(unsigned x)
-   n = 1054 - (asInt[LE] >> 20);
+{
-   n = (n & 31) + (n >> 9);
+	unsigned y, bz, b4, b3, b2, b1, b0;
-   return n;
+
 	y = x & -x;               // Isolate rightmost 1-bit.
 	bz = unsigned(!bool(y));           // 1 if y = 0.
 	b4 = unsigned(!bool(y & 0x0000FFFF)) * 16;
 	b3 = unsigned(!bool(y & 0x00FF00FF)) * 8;
 	b2 = unsigned(!bool(y & 0x0F0F0F0F)) * 4;
 	b1 = unsigned(!bool(y & 0x33333333)) * 2;
 	b0 = unsigned(!bool(y & 0x55555555)) * 1;
 	return bz + b4 + b3 + b2 + b1 + b0;
 }
-   /* In single precision, round-to-nearest mode, the basic method fails for:
+/* Below is David Seal's algorithm, found at
-   k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
+http://www.ciphersbyritter.com/NEWS4/BITCT.HTM Table
-                        07FFFFFC <= k <= 07FFFFFF,
+entries marked "u" are unused. 6 ops including a
-                        0FFFFFF8 <= k <= 0FFFFFFF,
+multiply, plus an indexed load. */
                                   ...
                        7FFFFFC0 <= k <= 7FFFFFFF.
                        FFFFFF80 <= k <= FFFFFFFF.
   For k = 0 it gives 158, and for the other values it is too low by 1. */
 int nlz8(unsigned k) {
   union {
      unsigned asInt;
      float asFloat;
   };
   int n;
-   k = k & ~(k >> 1);           /* Fix problem with rounding. */
+#define u 99
-   asFloat = (float)k + 0.5f;
+int ntz8(unsigned x)
-   n = 158 - (asInt >> 23);
+{
-   return n;
+	static char table[64] =
 		{32, 0, 1,12, 2, 6, u,13,   3, u, 7, u, u, u, u,14,
 		10, 4, u, u, 8, u, u,25,   u, u, u, u, u,21,27,15,
 		31,11, 5, u, u, u, u, u,   9, u, u,24, u, u,20,26,
 		30, u, u, u, u,23, u,19,  29, u,22,18,28,17,16, u};
 	x = (x & -x)*0x0450FBAF;
 	return table[x >> 26];
 }
-/* The example below shows how to make a macro for nlz.  It uses an
+/* Seal's algorithm with multiply expanded.
-extension to the C and C++ languages that is provided by the GNU C/C++
+9 elementary ops plus an indexed load. */
 compiler, namely, that of allowing statements and declarations in
 expressions (see "Using and Porting GNU CC", by Richard M. Stallman
 (1998).  The underscores are necessary to protect against the
 possibility that the macro argument will conflict with one of its local
 variables, e.g., NLZ(k). */
 int nlz9(unsigned k) {
   union {
      unsigned asInt;
      float asFloat;
   };
   int n;
-   k = k & ~(k >> 1);           /* Fix problem with rounding. */
+int ntz8a(unsigned x)
-   asFloat = (float)k;
+{
-   n = 158 - (asInt >> 23);
+	static char table[64] =
-   n = (n & 31) + (n >> 6);     /* Fix problem with k = 0. */
+		{32, 0, 1,12, 2, 6, u,13,   3, u, 7, u, u, u, u,14,
-   return n;
+		10, 4, u, u, 8, u, u,25,   u, u, u, u, u,21,27,15,
 		31,11, 5, u, u, u, u, u,   9, u, u,24, u, u,20,26,
 		30, u, u, u, u,23, u,19,  29, u,22,18,28,17,16, u};
 	x = (x & -x);
 	x = (x << 4) + x;    // x = x*17.
 	x = (x << 6) + x;    // x = x*65.
 	x = (x << 16) - x;   // x = x*65535.
 	return table[x >> 26];
 }
-/* Below are three nearly equivalent programs for computing the number
+/* Reiser's algorithm. Three ops including a "remainder,"
-of leading zeros in a word. This material is not in HD, but may be in a
+plus an indexed load. */
 future edition.
   Immediately below is Robert Harley's algorithm, found at the
 comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
 Juffa.
   Table entries marked "u" are unused. 14 ops including a multiply,
 plus an indexed load.
   The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all
 of form 2**k + 1). There are no multipliers of three terms of the form
 2**k +- 1 that work, with a table size of 64 or 128. There are some,
 with a table size of 64, if you precede the multiplication with x = x -
 (x >> 1), but that seems less elegant. There are also some if you use a
 table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would
 save two instructions in the form of this algorithm with the
 multiplication expanded into shifts and adds, but the table size is
 getting a bit large). */
-#define u 99
+int ntz9(unsigned x) {
 int nlz10(unsigned x) {
-   static char table[64] =
+   static char table[37] = {32,  0,  1, 26,  2, 23, 27,
-     {32,31, u,16, u,30, 3, u,  15, u, u, u,29,10, 2, u,
+                 u,  3, 16, 24, 30, 28, 11,  u, 13,  4,
-       u, u,12,14,21, u,19, u,   u,28, u,25, u, 9, 1, u,
+                 7, 17,  u, 25, 22, 31, 15, 29, 10, 12,
-      17, u, 4, u, u, u,11, u,  13,22,20, u,26, u, u,18,
+                 6,  u, 21, 14,  9,  5, 20,  8, 19, 18};
       5, u, u,23, u,27, u, 6,   u,24, 7, u, 8, u, 0, u};
-   x = x | (x >> 1);    // Propagate leftmost
+   x = (x & -x)%37;
-   x = x | (x >> 2);    // 1-bit to the right.
+   return table[x];
   x = x | (x >> 4);
   x = x | (x >> 8);
   x = x | (x >>16);
   x = x*0x06EB14F9;    // Multiplier is 7*255**3.
   return table[x >> 26];
 }
-/* Harley's algorithm with multiply expanded.
+/* Using a de Bruijn sequence. This is a table lookup with a 32-entry
-19 elementary ops plus an indexed load. */
+table. The de Bruijn sequence used here is
                0000 0100 1101 0111 0110 0101 0001 1111,
 obtained from Danny Dube's October 3, 1997, posting in
 comp.compression.research. Thanks to Norbert Juffa for this reference. */
-int nlz10a(unsigned x) {
+int ntz10(unsigned x) {
-   static char table[64] =
+   static char table[32] =
-     {32,31, u,16, u,30, 3, u,  15, u, u, u,29,10, 2, u,
+     { 0, 1, 2,24, 3,19, 6,25,  22, 4,20,10,16, 7,12,26,
-       u, u,12,14,21, u,19, u,   u,28, u,25, u, 9, 1, u,
+      31,23,18, 5,21, 9,15,11,  30,17, 8,14,29,13,28,27};
      17, u, 4, u, u, u,11, u,  13,22,20, u,26, u, u,18,
       5, u, u,23, u,27, u, 6,   u,24, 7, u, 8, u, 0, u};
-   x = x | (x >> 1);    // Propagate leftmost
+   if (x == 0) return 32;
-   x = x | (x >> 2);    // 1-bit to the right.
+   x = (x & -x)*0x04D7651F;
-   x = x | (x >> 4);
+   return table[x >> 27];
   x = x | (x >> 8);
   x = x | (x >> 16);
   x = (x << 3) - x;    // Multiply by 7.
   x = (x << 8) - x;    // Multiply by 255.
   x = (x << 8) - x;    // Again.
   x = (x << 8) - x;    // Again.
   return table[x >> 26];
 }
-/* Julius Goryavsky's version of Harley's algorithm.
+/* Norbert Juffa's code, answer to exercise 1 of Chapter 5 (2nd ed). */
 17 elementary ops plus an indexed load, if the machine
 has "and not." */
 int nlz10b(unsigned x) {
-   static char table[64] =
+#define SLOW_MUL
-     {32,20,19, u, u,18, u, 7,  10,17, u, u,14, u, 6, u,
+int ntz11 (unsigned int n) {
       u, 9, u,16, u, u, 1,26,   u,13, u, u,24, 5, u, u,
       u,21, u, 8,11, u,15, u,   u, u, u, 2,27, 0,25, u,
      22, u,12, u, u, 3,28, u,  23, u, 4,29, u, u,30,31};
-   x = x | (x >> 1);    // Propagate leftmost
+   static unsigned char tab[32] =
-   x = x | (x >> 2);    // 1-bit to the right.
+   {   0,  1,  2, 24,  3, 19, 6,  25,
-   x = x | (x >> 4);
+      22,  4, 20, 10, 16,  7, 12, 26,
-   x = x | (x >> 8);
+      31, 23, 18,  5, 21,  9, 15, 11,
-   x = x & ~(x >> 16);
+      30, 17,  8, 14, 29, 13, 28, 27
-   x = x*0xFD7049FF;    // Activate this line or the following 3.
+   };
-// x = (x << 9) - x;    // Multiply by 511.
+   unsigned int k;
-// x = (x << 11) - x;   // Multiply by 2047.
+   n = n & (-n);        /* isolate lsb */
-// x = (x << 14) - x;   // Multiply by 16383.
+   printf("n = %d\n", n);
-   return table[x >> 26];
+#if defined(SLOW_MUL)
   k = (n << 11) - n;
   k = (k <<  2) + k;
   k = (k <<  8) + n;
   k = (k <<  5) - k;
 #else
   k = n * 0x4d7651f;
 #endif
   return n ? tab[k>>27] : 32;
 }
 int errors;
@ -308,19 +266,22 @@ void error(int x, int y) {
   printf("Error for x = %08x, got %d\n", x, y);
 }
 /* ------------------------------ main ------------------------------ */
 int main()
 {
 #	ifdef GLM_TEST_ENABLE_PERF
-	int i, n;
+	int i, m, n;
-	static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
+	static unsigned test[] = {0,32, 1,0, 2,1, 3,0, 4,2, 5,0, 6,1,  7,0,
-		7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
+		8,3, 9,0, 16,4, 32,5, 64,6, 128,7, 255,0, 256,8, 512,9, 1024,10,
-		512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
+		2048,11, 4096,12, 8192,13, 16384,14, 32768,15, 65536,16,
-		65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
+		0x20000,17, 0x40000,18, 0x80000,19, 0x100000,20, 0x200000,21,
-		0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
+		0x400000,22, 0x800000,23, 0x1000000,24, 0x2000000,25,
-		0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
+		0x4000000,26, 0x8000000,27, 0x10000000,28, 0x20000000,29,
-		0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
+		0x40000000,30, 0x80000000,31, 0xFFFFFFF0,4, 0x3000FF00,8,
-		0xFFFFFFFF,0};
+		0xC0000000,30, 0x60000000,29, 0x00011000, 12};
 	std::size_t const Count = 10000000;
 	n = sizeof(test)/4;
@ -331,114 +292,115 @@ int main()
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
+		if (ntz1(test[i]) != test[i+1]) error(test[i], ntz1(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz1: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
+		if (ntz2(test[i]) != test[i+1]) error(test[i], ntz2(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz2: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
+		if (ntz3(test[i]) != test[i+1]) error(test[i], ntz3(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz3: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
+		if (ntz4(test[i]) != test[i+1]) error(test[i], ntz4(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz4: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
+		if (ntz4a(test[i]) != test[i+1]) error(test[i], ntz4a(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz4a: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
+		m = test[i+1]; if (m > 8) m = 8;
 		if (ntz5(test[i]) != m) error(test[i], ntz5(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz5: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
+		if (ntz6(test[i]) != test[i+1]) error(test[i], ntz6(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz6: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
+		if (ntz6a(test[i]) != test[i+1]) error(test[i], ntz6a(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz6a: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
+		if (ntz7(test[i]) != test[i+1]) error(test[i], ntz7(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz7: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
+		if (ntz7_christophe(test[i]) != test[i+1]) error(test[i], ntz7(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz7_christophe: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
+		if (ntz8(test[i]) != test[i+1]) error(test[i], ntz8(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz8: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
+		if (ntz8a(test[i]) != test[i+1]) error(test[i], ntz8a(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz8a: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
+		if (ntz9(test[i]) != test[i+1]) error(test[i], ntz9(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz9: %d clocks\n", TimestampEnd - TimestampBeg);
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
-		if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
+		if (ntz10(test[i]) != test[i+1]) error(test[i], ntz10(test[i]));}
 	TimestampEnd = std::clock();
-	printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg);
+	printf("ntz10: %d clocks\n", TimestampEnd - TimestampBeg);
 	if (errors == 0)
 		printf("Passed all %d cases.\n", sizeof(test)/8);
--- a/test/core/core_func_integer_find_msb.cpp
+++ b/test/core/core_func_integer_find_msb.cpp
@ -0,0 +1,447 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // OpenGL Mathematics Copyright (c) 2005 - 2014 G-Truc Creation (www.g-truc.net)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Created : 2014-10-27
 // Updated : 2014-10-27
 // Licence : This source is under MIT licence
 // File    : test/core/core_func_integer_find_msb.cpp
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // This has the programs for computing the number of leading zeros
 // in a word.
 // Max line length is 57, to fit in hacker.book.
 // Compile with g++, not gcc.
 #include <cstdio>
 #include <cstdlib>     // To define "exit", req'd by XLC.
 #include <ctime>
 #define LE 1            // 1 for little-endian, 0 for big-endian.
 // Population count: returns the number of 1-bits in x.
 // The shift amounts and masks are written for a 32-bit unsigned.
 int pop(unsigned x) {
   // Sum adjacent bits into 2-bit fields, then 2-bit fields into nibbles.
   unsigned const pairs   = x - ((x >> 1) & 0x55555555);
   unsigned const nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
   // One partial count per byte.
   unsigned bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F;
   // Accumulate all four byte counts into the top byte.
   bytes += bytes << 8;
   bytes += bytes << 16;
   return bytes >> 24;
 }
 // Number of leading zeros of a 32-bit word, by binary search.
 // Returns 32 for x == 0.
 int nlz1(unsigned x) {
   if (x == 0) return 32;
   int zeros = 0;
   // Each test asks whether the top 16/8/4/2/1 bits are all clear; if so,
   // they are counted and x is shifted so the next test sees the rest.
   if (x < 0x00010000u) { zeros += 16; x <<= 16; }
   if (x < 0x01000000u) { zeros +=  8; x <<=  8; }
   if (x < 0x10000000u) { zeros +=  4; x <<=  4; }
   if (x < 0x40000000u) { zeros +=  2; x <<=  2; }
   if (x < 0x80000000u) { zeros +=  1; }
   return zeros;
 }
 // Number of leading zeros of a 32-bit word; variant of the binary search
 // that handles x == 0 and "top bit set" with a single signed comparison.
 int nlz1a(unsigned x) {
   // Viewed as a signed int, x <= 0 means x is 0 or has bit 31 set.
   // Bit 31 of ~x lands on the value 32 after the shift: 32 for x == 0,
   // 0 when the top bit of x is set.
   if ((int)x <= 0) return (~x >> 26) & 32;
   // Start at 1 and take it back at the end if bit 31 turns out to be set.
   int zeros = 1;
   if ((x >> 16) == 0) { zeros += 16; x <<= 16; }
   if ((x >> 24) == 0) { zeros +=  8; x <<=  8; }
   if ((x >> 28) == 0) { zeros +=  4; x <<=  4; }
   if ((x >> 30) == 0) { zeros +=  2; x <<=  2; }
   zeros = zeros - (x >> 31);
   return zeros;
 }
 // On basic Risc, 12 to 20 instructions.
 // Number of leading zeros of a 32-bit word, counting down from 32 while
 // narrowing x towards its highest set bit. Returns 32 for x == 0.
 int nlz2(unsigned x) {
   int zeros = 32;
   unsigned upper;
   // Whenever the upper part of the current window is nonzero, the highest
   // set bit lives there: discount the lower part and continue with it.
   upper = x >> 16;
   if (upper != 0) { zeros -= 16; x = upper; }
   upper = x >> 8;
   if (upper != 0) { zeros -= 8; x = upper; }
   upper = x >> 4;
   if (upper != 0) { zeros -= 4; x = upper; }
   upper = x >> 2;
   if (upper != 0) { zeros -= 2; x = upper; }
   // x is now 0..3: values 2 and 3 occupy two bits, 0 and 1 occupy x bits.
   upper = x >> 1;
   if (upper != 0) return zeros - 2;
   return zeros - x;
 }
 // As above but coded as a loop for compactness:
 // 23 to 33 basic Risc instructions.
 // Loop form of the count-down binary search for the number of leading
 // zeros of a 32-bit word. Returns 32 for x == 0.
 int nlz2a(unsigned x) {
   int zeros = 32;
   // width takes the values 16, 8, 4, 2, 1.
   for (int width = 16; width != 0; width = width >> 1) {
     unsigned const upper = x >> width;
     if (upper != 0) {
       // The highest set bit is in the upper part: keep only that part.
       zeros = zeros - width;
       x = upper;
     }
   }
   // x has been reduced to 0 or 1 here.
   return zeros - x;
 }
 // Number of leading zeros of a 32-bit word, scanning from both ends at
 // once: one copy is shifted left until its sign bit is set, the other is
 // shifted right until it becomes zero; whichever happens first ends the
 // search.
 int nlz3(int x) {
   int down = x;    // copy that is shifted right
   int steps = 0;   // number of shifts performed so far
   for (;;) {
     // The left-shifted copy reached the sign bit after `steps` shifts.
     if (x < 0) return steps;
     // The right-shifted copy ran out of bits after `steps` shifts.
     if (down == 0) return 32 - steps;
     steps = steps + 1;
     x = x << 1;
     down = down >> 1;
   }
 }
 // Branch-free number of leading zeros of a 32-bit word.
 // Each step derives a shift amount from the sign of a signed difference;
 // this assumes >> on a negative int propagates the sign bit.
 int nlz4(unsigned x) {
   int probe, step, zeros;
   // probe is negative exactly when the upper 16 bits of x are nonzero,
   // which makes step 16; otherwise step is 0.
   probe = -(x >> 16);
   step = (probe >> 16) & 16;
   zeros = 16 - step;            // 16 if the upper half is empty, else 0
   x >>= step;                   // x is now of the form 0000xxxx
   // From here on a negative difference means the tested high bits are
   // clear: count them and shift x left by the same amount.
   probe = x - 0x100;            // positions 8-15
   step = (probe >> 16) & 8;
   zeros += step;
   x <<= step;
   probe = x - 0x1000;           // positions 12-15
   step = (probe >> 16) & 4;
   zeros += step;
   x <<= step;
   probe = x - 0x4000;           // positions 14-15
   step = (probe >> 16) & 2;
   zeros += step;
   x <<= step;
   probe = x >> 14;              // 0, 1, 2 or 3
   step = probe & ~(probe >> 1); // becomes 0, 1, 2 or 2 respectively
   return zeros + 2 - step;
 }
 // Number of leading zeros of a 32-bit word via a population count.
 int nlz5(unsigned x) {
   int pop(unsigned x);   // bit-counting helper defined earlier in this file
   // Smear the highest set bit into every lower position ...
   x |= x >> 1;
   x |= x >> 2;
   x |= x >> 4;
   x |= x >> 8;
   x |= x >> 16;
   // ... so the 1-bits of the complement are exactly the leading zeros.
   return pop(~x);
 }
 /* The four programs below are not valid ANSI C programs.  This is
 because they refer to the same storage locations as two different types.
 However, they work with xlc/AIX, gcc/AIX, and gcc/NT.  If you try to
 code them more compactly by declaring a variable xx to be "double," and
 then using
   n = 1054 - (*((unsigned *)&xx + LE) >> 20);
 then you are violating not only the rule above, but also the ANSI C
 rule that pointer arithmetic can be performed only on pointers to
 array elements.
   When coded with the above statement, the program fails with xlc,
 gcc/AIX, and gcc/NT, at some optimization levels.
   BTW, these programs use the "anonymous union" feature of C++, not
 available in C. */
 // Number of leading zeros read from the exponent field of a double.
 // LE (defined near the top of this file) picks the element of the word
 // pair that is shifted to expose the exponent.
 // Assumes 32-bit unsigned and a 64-bit IEEE double.
 int nlz6(unsigned k) {
   union {
      double value;
      unsigned words[2];
   };
   // Adding 0.5 keeps k == 0 from producing an all-zero bit pattern.
   value = (double)k + 0.5;
   // Shifting by 20 leaves the sign and biased exponent in the low bits.
   int const zeros = 1054 - (words[LE] >> 20);
   return zeros;
 }
 // Variant of the double-exponent method without the +0.5 bias; the k == 0
 // case is repaired arithmetically instead.
 // LE (defined near the top of this file) picks the element of the word
 // pair that is shifted to expose the exponent.
 // Assumes 32-bit unsigned and a 64-bit IEEE double.
 int nlz7(unsigned k) {
   union {
      double value;
      unsigned words[2];
   };
   value = (double)k;
   int zeros = 1054 - (words[LE] >> 20);
   // For k == 0 the raw result is 1054; (1054 & 31) + (1054 >> 9) is 32.
   // Results in 0..31 pass through unchanged.
   zeros = (zeros & 31) + (zeros >> 9);
   return zeros;
 }
   /* In single precision, round-to-nearest mode, the basic method fails for:
   k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
                        07FFFFFC <= k <= 07FFFFFF,
                        0FFFFFF8 <= k <= 0FFFFFFF,
                                   ...
                        7FFFFFC0 <= k <= 7FFFFFFF.
                        FFFFFF80 <= k <= FFFFFFFF.
   For k = 0 it gives 158, and for the other values it is too low by 1. */
 // Number of leading zeros read from the exponent field of a float.
 // Assumes 32-bit unsigned and a 32-bit IEEE float.
 int nlz8(unsigned k) {
   union {
      float value;
      unsigned bits;
   };
   // Clear every bit that has a 1 directly above it, so the conversion to
   // float cannot round up into the next power of two.
   k &= ~(k >> 1);
   // Adding 0.5 keeps k == 0 from producing an all-zero bit pattern.
   value = (float)k + 0.5f;
   // Shifting by 23 leaves the sign and biased exponent in the low bits.
   int const zeros = 158 - (bits >> 23);
   return zeros;
 }
 /* The example below shows how to make a macro for nlz.  It uses an
 extension to the C and C++ languages that is provided by the GNU C/C++
 compiler, namely, that of allowing statements and declarations in
 expressions (see "Using and Porting GNU CC", by Richard M. Stallman
 (1998)).  The underscores are necessary to protect against the
 possibility that the macro argument will conflict with one of its local
 variables, e.g., NLZ(k).
   Note: no such macro is defined in this file; the single-precision
 method appears below only as the ordinary function nlz9. */
 // Float-exponent method without the +0.5 bias; the k == 0 case is
 // repaired arithmetically instead.
 // Assumes 32-bit unsigned and a 32-bit IEEE float.
 int nlz9(unsigned k) {
   union {
      float value;
      unsigned bits;
   };
   // Clear every bit that has a 1 directly above it, so the conversion to
   // float cannot round up into the next power of two.
   k &= ~(k >> 1);
   value = (float)k;
   int zeros = 158 - (bits >> 23);
   // For k == 0 the raw result is 158; (158 & 31) + (158 >> 6) is 32.
   // Results in 0..31 pass through unchanged.
   zeros = (zeros & 31) + (zeros >> 6);
   return zeros;
 }
 /* Below are three nearly equivalent programs for computing the number
 of leading zeros in a word. This material is not in HD, but may be in a
 future edition.
   Immediately below is Robert Harley's algorithm, found at the
 comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
 Juffa.
   Table entries marked "u" are unused. 14 ops including a multiply,
 plus an indexed load.
   The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all
 of form 2**k + 1). There are no multipliers of three terms of the form
 2**k +- 1 that work, with a table size of 64 or 128. There are some,
 with a table size of 64, if you precede the multiplication with x = x -
 (x >> 1), but that seems less elegant. There are also some if you use a
 table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would
 save two instructions in the form of this algorithm with the
 multiplication expanded into shifts and adds, but the table size is
 getting a bit large). */
 // Marker for lookup-table slots that are never indexed.
 #define u 99
 // Number of leading zeros by multiply-and-lookup (64-entry table).
 // The final shift by 26 assumes a 32-bit unsigned.
 int nlz10(unsigned x) {
   static const char lookup[64] = {
     32,31, u,16, u,30, 3, u,
     15, u, u, u,29,10, 2, u,
      u, u,12,14,21, u,19, u,
      u,28, u,25, u, 9, 1, u,
     17, u, 4, u, u, u,11, u,
     13,22,20, u,26, u, u,18,
      5, u, u,23, u,27, u, 6,
      u,24, 7, u, 8, u, 0, u};
   // Copy the leftmost 1-bit into every position to its right.
   x |= x >> 1;
   x |= x >> 2;
   x |= x >> 4;
   x |= x >> 8;
   x |= x >> 16;
   // Multiply by 7*255**3; the top six bits of the product select the slot.
   unsigned const slot = (x * 0x06EB14F9) >> 26;
   return lookup[slot];
 }
 /* Harley's algorithm with multiply expanded.
 19 elementary ops plus an indexed load. */
 // Multiply-and-lookup count of leading zeros with the multiplication
 // spelled out as shifts and subtractions.
 // Table slots holding `u` are never indexed.
 // The final shift by 26 assumes a 32-bit unsigned.
 int nlz10a(unsigned x) {
   static const char lookup[64] = {
     32,31, u,16, u,30, 3, u,
     15, u, u, u,29,10, 2, u,
      u, u,12,14,21, u,19, u,
      u,28, u,25, u, 9, 1, u,
     17, u, 4, u, u, u,11, u,
     13,22,20, u,26, u, u,18,
      5, u, u,23, u,27, u, 6,
      u,24, 7, u, 8, u, 0, u};
   // Copy the leftmost 1-bit into every position to its right.
   x |= x >> 1;
   x |= x >> 2;
   x |= x >> 4;
   x |= x >> 8;
   x |= x >> 16;
   x = (x << 3) - x;    // times 7
   x = (x << 8) - x;    // times 255
   x = (x << 8) - x;    // times 255 again
   x = (x << 8) - x;    // and once more
   // The top six bits of the product select the slot.
   return lookup[x >> 26];
 }
 /* Julius Goryavsky's version of Harley's algorithm.
 17 elementary ops plus an indexed load, if the machine
 has "and not." */
 // Multiply-and-lookup count of leading zeros with a shortened propagation
 // step and its own table.
 // Table slots holding `u` are never indexed.
 // The final shift by 26 assumes a 32-bit unsigned.
 int nlz10b(unsigned x) {
   static const char lookup[64] = {
     32,20,19, u, u,18, u, 7,
     10,17, u, u,14, u, 6, u,
      u, 9, u,16, u, u, 1,26,
      u,13, u, u,24, 5, u, u,
      u,21, u, 8,11, u,15, u,
      u, u, u, 2,27, 0,25, u,
     22, u,12, u, u, 3,28, u,
     23, u, 4,29, u, u,30,31};
   // Copy the leftmost 1-bit into the positions to its right ...
   x |= x >> 1;
   x |= x >> 2;
   x |= x >> 4;
   x |= x >> 8;
   // ... and finish with an "and not" instead of a fifth OR step.
   x &= ~(x >> 16);
   // Single multiply. The equivalent expanded form is three steps:
   //   x = (x << 9) - x;    (times 511)
   //   x = (x << 11) - x;   (times 2047)
   //   x = (x << 14) - x;   (times 16383)
   x *= 0xFD7049FF;
   // The top six bits of the product select the slot.
   return lookup[x >> 26];
 }
 // Number of mismatches reported through error() so far.
 int errors;
 // Records one failed check and prints the offending input and result.
 //   x: the input word, shown as eight hexadecimal digits
 //   y: the value the function under test returned for x
 void error(int x, int y) {
   ++errors;
   // %x expects an unsigned argument; convert explicitly so inputs with
   // the top bit set are not passed as a negative int.
   printf("Error for x = %08x, got %d\n", static_cast<unsigned>(x), y);
 }
 // Checks every nlz variant against a table of known answers and prints
 // how many clock ticks each variant needs for Count passes over the table.
 // The body is compiled only when GLM_TEST_ENABLE_PERF is defined.
 int main()
 {
 #	ifdef GLM_TEST_ENABLE_PERF
 	int i, n;
 	// Flat list of (input, expected number of leading zeros) pairs.
 	static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
 		7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
 		512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
 		65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
 		0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
 		0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
 		0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
 		0xFFFFFFFF,0};
 	std::size_t const Count = 10000000;
 	// Element count taken from the array itself rather than assuming that
 	// an unsigned occupies four bytes.
 	n = static_cast<int>(sizeof(test) / sizeof(test[0]));
 	std::clock_t TimestampBeg = 0;
 	std::clock_t TimestampEnd = 0;
 	// Each section below times one variant. The tick difference is
 	// converted to long before printing because std::clock_t need not have
 	// the same size as int.
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz1: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz1a: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz2: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz2a: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz3: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz4: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz5: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz6: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz7: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz8: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz9: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz10: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz10a: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
 	TimestampEnd = std::clock();
 	printf("nlz10b: %ld clocks\n", static_cast<long>(TimestampEnd - TimestampBeg));
 	// n counts array elements; every test case occupies two of them.
 	if (errors == 0)
 		printf("Passed all %d cases.\n", n / 2);
 #	endif//GLM_TEST_ENABLE_PERF
 }