diff --git a/glm/detail/func_exponential.inl b/glm/detail/func_exponential.inl
index 20d1141f..5cec61c7 100644
--- a/glm/detail/func_exponential.inl
+++ b/glm/detail/func_exponential.inl
@@ -130,3 +130,8 @@ namespace detail
 		return detail::compute_inversesqrt::call(x);
 	}
 }//namespace glm
+
+#if GLM_ARCH != GLM_ARCH_PURE && GLM_HAS_UNRESTRICTED_UNIONS
+#	include "func_exponential_simd.inl"
+#endif
+
diff --git a/glm/detail/func_exponential_simd.inl b/glm/detail/func_exponential_simd.inl
new file mode 100644
index 00000000..e69de29b
diff --git a/glm/detail/func_integer.inl b/glm/detail/func_integer.inl
index 20445034..dfdae80d 100644
--- a/glm/detail/func_integer.inl
+++ b/glm/detail/func_integer.inl
@@ -6,12 +6,10 @@
 #include "type_vec4.hpp"
 #include "type_int.hpp"
 #include "_vectorize.hpp"
-#if(GLM_ARCH != GLM_ARCH_PURE)
-#if(GLM_COMPILER & GLM_COMPILER_VC)
+#if(GLM_ARCH & GLM_ARCH_X86 && GLM_COMPILER & GLM_COMPILER_VC)
 #	include <intrin.h>
 #	pragma intrinsic(_BitScanReverse)
-#endif//(GLM_COMPILER & GLM_COMPILER_VC)
-#endif//(GLM_ARCH != GLM_ARCH_PURE)
+#endif//(GLM_ARCH & GLM_ARCH_X86 && GLM_COMPILER & GLM_COMPILER_VC)
 #include <limits>
 
 namespace glm{
@@ -359,7 +357,7 @@
 	}
 }//namespace glm
 
-#if GLM_ARCH != GLM_ARCH_PURE
+#if GLM_ARCH != GLM_ARCH_PURE && GLM_HAS_UNRESTRICTED_UNIONS
 #	include "func_integer_simd.inl"
 #endif
 
diff --git a/glm/detail/func_integer_simd.inl b/glm/detail/func_integer_simd.inl
index 6fab1146..bf393056 100644
--- a/glm/detail/func_integer_simd.inl
+++ b/glm/detail/func_integer_simd.inl
@@ -4,111 +4,6 @@
 namespace glm{
 namespace detail
 {
-	inline __m128i _mm_bit_interleave_si128(__m128i x)
-	{
-		__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF);
-		__m128i const Mask3 = _mm_set1_epi32(0x00FF00FF);
-		__m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F);
-		__m128i const Mask1 = _mm_set1_epi32(0x33333333);
-		__m128i const Mask0 = _mm_set1_epi32(0x55555555);
-		__m128i Reg1;
-		__m128i Reg2;
-
-		// REG1 = x;
-		// REG2 = y;
-		//Reg1 = _mm_unpacklo_epi64(x, y);
-		Reg1 = x;
-
-		//REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
-		//REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
-		Reg2 = _mm_slli_si128(Reg1, 2);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask4);
-
-		//REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
-		//REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
-		Reg2 = _mm_slli_si128(Reg1, 1);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask3);
-
-		//REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
-		//REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
-		Reg2 = _mm_slli_epi32(Reg1, 4);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask2);
-
-		//REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
-		//REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
-		Reg2 = _mm_slli_epi32(Reg1, 2);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask1);
-
-		//REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
-		//REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
-		Reg2 = _mm_slli_epi32(Reg1, 1);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask0);
-
-		//return REG1 | (REG2 << 1);
-		Reg2 = _mm_slli_epi32(Reg1, 1);
-		Reg2 = _mm_srli_si128(Reg2, 8);
-		Reg1 = _mm_or_si128(Reg1, Reg2);
-
-		return Reg1;
-	}
-
-	inline __m128i _mm_bit_interleave_si128(__m128i x, __m128i y)
-	{
-		__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF);
-		__m128i const Mask3 = _mm_set1_epi32(0x00FF00FF);
-		__m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F);
-		__m128i const Mask1 = _mm_set1_epi32(0x33333333);
-		__m128i const Mask0 = _mm_set1_epi32(0x55555555);
-
-		__m128i Reg1;
-		__m128i Reg2;
-
-		// REG1 = x;
-		// REG2 = y;
-		Reg1 = _mm_unpacklo_epi64(x, y);
-
-		//REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
-		//REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
-		Reg2 = _mm_slli_si128(Reg1, 2);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask4);
-
-		//REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
-		//REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
-		Reg2 = _mm_slli_si128(Reg1, 1);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask3);
-
-		//REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
-		//REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
-		Reg2 = _mm_slli_epi32(Reg1, 4);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask2);
-
-		//REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
-		//REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
-		Reg2 = _mm_slli_epi32(Reg1, 2);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask1);
-
-		//REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
-		//REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
-		Reg2 = _mm_slli_epi32(Reg1, 1);
-		Reg1 = _mm_or_si128(Reg2, Reg1);
-		Reg1 = _mm_and_si128(Reg1, Mask0);
-
-		//return REG1 | (REG2 << 1);
-		Reg2 = _mm_slli_epi32(Reg1, 1);
-		Reg2 = _mm_srli_si128(Reg2, 8);
-		Reg1 = _mm_or_si128(Reg1, Reg2);
-
-		return Reg1;
-	}
 }//namespace detail
 }//namespace glm
 
diff --git a/glm/detail/func_matrix_simd.inl b/glm/detail/func_matrix_simd.inl
index 820fadc7..9f984f93 100644
--- a/glm/detail/func_matrix_simd.inl
+++ b/glm/detail/func_matrix_simd.inl
@@ -15,7 +15,7 @@ namespace detail
 		GLM_FUNC_QUALIFIER static tmat4x4 call(tmat4x4 const& m)
 		{
 			tmat4x4 Result(uninitialize);
-			glm_f32m4_inverse(*reinterpret_cast<__m128 const(*)[4]>(&m[0].data), *reinterpret_cast<__m128(*)[4]>(&Result[0].data));
+			glm_f32m4_inv(*reinterpret_cast<__m128 const(*)[4]>(&m[0].data), *reinterpret_cast<__m128(*)[4]>(&Result[0].data));
 			return Result;
 		}
 	};
diff --git a/glm/detail/func_packing.inl b/glm/detail/func_packing.inl
index 0eddd691..505c80a1 100644
--- a/glm/detail/func_packing.inl
+++ b/glm/detail/func_packing.inl
@@ -184,3 +184,7 @@ namespace glm
 	}
 }//namespace glm
 
+#if GLM_ARCH != GLM_ARCH_PURE && GLM_HAS_UNRESTRICTED_UNIONS
+#	include "func_packing_simd.inl"
+#endif
+
diff --git a/glm/detail/func_packing_simd.inl b/glm/detail/func_packing_simd.inl
new file mode 100644
index 00000000..e69de29b
diff --git a/glm/detail/func_vector_relational.inl b/glm/detail/func_vector_relational.inl
index ccba724f..3d8d2b7e 100644
--- a/glm/detail/func_vector_relational.inl
+++ b/glm/detail/func_vector_relational.inl
@@ -100,3 +100,6 @@ namespace glm
 	}
 }//namespace glm
 
+#if GLM_ARCH != GLM_ARCH_PURE && GLM_HAS_UNRESTRICTED_UNIONS
+#	include "func_vector_relational_simd.inl"
+#endif
diff --git a/glm/detail/func_vector_relational_simd.inl b/glm/detail/func_vector_relational_simd.inl
new file mode 100644
index 00000000..e69de29b
diff --git a/glm/gtc/bitfield.inl b/glm/gtc/bitfield.inl
index dccc8107..af522bdf 100644
--- a/glm/gtc/bitfield.inl
+++ b/glm/gtc/bitfield.inl
@@ -1,6 +1,8 @@
 /// @ref gtc_bitfield
 /// @file glm/gtc/bitfield.inl
 
+#include "../simd/integer.h"
+
 namespace glm{
 namespace detail
 {
diff --git a/glm/simd/common.h b/glm/simd/common.h
index fbbdb6aa..be712ca1 100644
--- a/glm/simd/common.h
+++ b/glm/simd/common.h
@@ -3,14 +3,27 @@
 
 #pragma once
 
-static const __m128 GLM_VAR_USED glm_zero = _mm_setzero_ps();
-static const __m128 GLM_VAR_USED glm_one = _mm_set_ps1(1.0f);
-static const __m128 GLM_VAR_USED glm_half = _mm_set_ps1(0.5f);
-static const __m128 GLM_VAR_USED glm_minus_one = _mm_set_ps1(-1.0f);
-static const __m128 GLM_VAR_USED glm_two = _mm_set_ps1(2.0f);
-static const __m128 GLM_VAR_USED glm_three = _mm_set_ps1(3.0f);
+#if GLM_ARCH & GLM_ARCH_SSE2
 
-static const __m128 GLM_VAR_USED glm_ps_2pow23 = _mm_set_ps1(8388608.0f);
+//mad
+GLM_FUNC_QUALIFIER __m128 glm_f32v1_mad(__m128 a, __m128 b, __m128 c)
+{
+#	if GLM_ARCH & GLM_ARCH_AVX2
+	return _mm_fmadd_ss(a, b, c);
+#	else
+	return _mm_add_ss(_mm_mul_ss(a, b), c);
+#	endif
+}
+
+//mad
+GLM_FUNC_QUALIFIER __m128 glm_f32v4_mad(__m128 a, __m128 b, __m128 c)
+{
+#	if GLM_ARCH & GLM_ARCH_AVX2
+	return _mm_fmadd_ps(a, b, c);
+#	else
+	return _mm_add_ps(_mm_mul_ps(a, b), c);
+#	endif
+}
 
 //abs
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_abs(__m128 x)
@@ -33,19 +46,21 @@ GLM_FUNC_QUALIFIER __m128i glm_i32v4_abs(__m128i x)
 //sign
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_sgn(__m128 x)
 {
-	__m128 const Cmp0 = _mm_cmplt_ps(x, glm_zero);
-	__m128 const Cmp1 = _mm_cmpgt_ps(x, glm_zero);
-	__m128 const And0 = _mm_and_ps(Cmp0, glm_minus_one);
-	__m128 const And1 = _mm_and_ps(Cmp1, glm_one);
-	return _mm_or_ps(And0, And1);
+	__m128 const zro0 = _mm_setzero_ps();
+	__m128 const cmp0 = _mm_cmplt_ps(x, zro0);
+	__m128 const cmp1 = _mm_cmpgt_ps(x, zro0);
+	__m128 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(-1.0f));
+	__m128 const and1 = _mm_and_ps(cmp1, _mm_set1_ps(1.0f));
+	__m128 const or0 = _mm_or_ps(and0, and1);
+	return or0;
 }
 
 //round
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_rnd(__m128 x)
 {
-	__m128 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
+	__m128 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 	__m128 const and0 = _mm_and_ps(sgn0, x);
-	__m128 const or0 = _mm_or_ps(and0, glm_ps_2pow23);
+	__m128 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f));
 	__m128 const add0 = _mm_add_ps(x, or0);
 	__m128 const sub0 = _mm_sub_ps(add0, or0);
 	return sub0;
@@ -70,9 +85,9 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_flr(__m128 x)
 //roundEven
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_rde(__m128 x)
 {
-	__m128 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
+	__m128 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 	__m128 const and0 = _mm_and_ps(sgn0, x);
-	__m128 const or0 = _mm_or_ps(and0, glm_ps_2pow23);
+	__m128 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f));
 	__m128 const add0 = _mm_add_ps(x, or0);
 	__m128 const sub0 = _mm_sub_ps(add0, or0);
 	return sub0;
@@ -114,16 +129,15 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_mix(__m128 v1, __m128 v2, __m128 a)
 {
 	__m128 const sub0 = _mm_sub_ps(_mm_set1_ps(1.0f), a);
 	__m128 const mul0 = _mm_mul_ps(v1, sub0);
-	__m128 const mul1 = _mm_mul_ps(v2, a);
-	__m128 const add0 = _mm_add_ps(mul0, mul1);
-	return add0;
+	__m128 const mad0 = glm_f32v4_mad(v2, a, mul0);
+	return mad0;
 }
 
 //step
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_stp(__m128 edge, __m128 x)
 {
 	__m128 const cmp = _mm_cmple_ps(x, edge);
-	return _mm_movemask_ps(cmp) == 0 ? _mm_set1_ps(1.0f) : _mm_set1_ps(0.0f);
+	return _mm_movemask_ps(cmp) == 0 ? _mm_set1_ps(1.0f) : _mm_setzero_ps();
 }
 
 // smoothstep
@@ -132,9 +146,9 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_ssp(__m128 edge0, __m128 edge1, __m128 x)
 	__m128 const sub0 = _mm_sub_ps(x, edge0);
 	__m128 const sub1 = _mm_sub_ps(edge1, edge0);
 	__m128 const div0 = _mm_sub_ps(sub0, sub1);
-	__m128 const clp0 = glm_f32v4_clp(div0, _mm_set1_ps(0.0f), _mm_set1_ps(1.0f));
-	__m128 const mul0 = _mm_mul_ps(glm_two, clp0);
-	__m128 const sub2 = _mm_sub_ps(glm_three, mul0);
+	__m128 const clp0 = glm_f32v4_clp(div0, _mm_setzero_ps(), _mm_set1_ps(1.0f));
+	__m128 const mul0 = _mm_mul_ps(_mm_set1_ps(2.0f), clp0);
+	__m128 const sub2 = _mm_sub_ps(_mm_set1_ps(3.0f), mul0);
 	__m128 const mul1 = _mm_mul_ps(clp0, clp0);
 	__m128 const mul2 = _mm_mul_ps(mul1, sub2);
 	return mul2;
@@ -149,7 +163,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_nan(__m128 x)
 	__m128i const t4 = _mm_and_si128(t2, t3);		// exponent
 	__m128i const t5 = _mm_andnot_si128(t3, t2);	// fraction
 	__m128i const Equal = _mm_cmpeq_epi32(t3, t4);
-	__m128i const Nequal = _mm_cmpeq_epi32(t5, _mm_set1_epi32(0));
+	__m128i const Nequal = _mm_cmpeq_epi32(t5, _mm_setzero_si128());
 	__m128i const And = _mm_and_si128(Equal, Nequal);
 	return _mm_castsi128_ps(And);	// exponent = all 1s and fraction != 0
 }
@@ -162,24 +176,6 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_inf(__m128 x)
 	return _mm_castsi128_ps(_mm_cmpeq_epi32(t2, _mm_set1_epi32(0xFF000000)));	// exponent is all 1s, fraction is 0
 }
 
-GLM_FUNC_QUALIFIER __m128 glm_f32v1_fma(__m128 a, __m128 b, __m128 c)
-{
-#	if GLM_ARCH & GLM_ARCH_AVX2
-	return _mm_fmadd_ss(a, b, c);
-#	else
-	return _mm_add_ss(_mm_mul_ss(a, b), c);
-#	endif
-}
-
-GLM_FUNC_QUALIFIER __m128 glm_f32v4_fma(__m128 a, __m128 b, __m128 c)
-{
-#	if GLM_ARCH & GLM_ARCH_AVX2
-	return _mm_fmadd_ps(a, b, c);
-#	else
-	return _mm_add_ps(_mm_mul_ps(a, b), c);
-#	endif
-}
-
 // SSE scalar reciprocal sqrt using rsqrt op, plus one Newton-Rhaphson iteration
 // By Elan Ruskin, http://assemblyrequired.crashworks.org/
 GLM_FUNC_QUALIFIER __m128 glm_f32v1_sqrt_wip(__m128 x)
@@ -206,4 +202,4 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_sqrt_wip(__m128 x)
 	return Mul3;
 }
 
-
+#endif//GLM_ARCH & GLM_ARCH_SSE2
diff --git a/glm/simd/exponential.h b/glm/simd/exponential.h
index 2d689915..49be6cb4 100644
--- a/glm/simd/exponential.h
+++ b/glm/simd/exponential.h
@@ -1,2 +1,5 @@
 /// @ref simd
 /// @file glm/simd/experimental.h
+
+#pragma once
+
diff --git a/glm/simd/geometric.h b/glm/simd/geometric.h
index 86d3ac73..ac984d2d 100644
--- a/glm/simd/geometric.h
+++ b/glm/simd/geometric.h
@@ -5,6 +5,8 @@
 
 #include "common.h"
 
+#if GLM_ARCH & GLM_ARCH_SSE2
+
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_dot(__m128 v1, __m128 v2)
 {
 #	if GLM_ARCH & GLM_ARCH_AVX
@@ -29,9 +31,9 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v1_dot(__m128 v1, __m128 v2)
 #	if GLM_ARCH & GLM_ARCH_AVX
 	return _mm_dp_ps(v1, v2, 0xff);
 #	elif GLM_ARCH & GLM_ARCH_SSE3
-	__m128 const Mul0 = _mm_mul_ps(v1, v2);
-	__m128 const Hadd0 = _mm_hadd_ps(Mul0, Mul0);
-	__m128 const Hadd1 = _mm_hadd_ps(Hadd0, Hadd0);
-	return Hadd1;
+	__m128 const mul0 = _mm_mul_ps(v1, v2);
+	__m128 const had0 = _mm_hadd_ps(mul0, mul0);
+	__m128 const had1 = _mm_hadd_ps(had0, had0);
+	return had1;
 #	else
 	__m128 const mul0 = _mm_mul_ps(v1, v2);
@@ -81,7 +83,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_ffd(__m128 N, __m128 I, __m128 Nref)
 {
 	__m128 dot0 = glm_f32v4_dot(Nref, I);
 	__m128 sgn0 = glm_f32v4_sgn(dot0);
-	__m128 mul0 = _mm_mul_ps(sgn0, glm_minus_one);
+	__m128 mul0 = _mm_mul_ps(sgn0, _mm_set1_ps(-1.0f));
 	__m128 mul1 = _mm_mul_ps(N, mul0);
 	return mul1;
 }
@@ -90,7 +92,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_rfe(__m128 I, __m128 N)
 {
 	__m128 dot0 = glm_f32v4_dot(N, I);
 	__m128 mul0 = _mm_mul_ps(N, dot0);
-	__m128 mul1 = _mm_mul_ps(mul0, glm_two);
+	__m128 mul1 = _mm_mul_ps(mul0, _mm_set1_ps(2.0f));
 	__m128 sub0 = _mm_sub_ps(I, mul1);
 	return sub0;
 }
@@ -100,8 +102,8 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_rfa(__m128 I, __m128 N, __m128 eta)
 	__m128 dot0 = glm_f32v4_dot(N, I);
 	__m128 mul0 = _mm_mul_ps(eta, eta);
 	__m128 mul1 = _mm_mul_ps(dot0, dot0);
-	__m128 sub0 = _mm_sub_ps(glm_one, mul0);
-	__m128 sub1 = _mm_sub_ps(glm_one, mul1);
+	__m128 sub0 = _mm_sub_ps(_mm_set1_ps(1.0f), mul0);
+	__m128 sub1 = _mm_sub_ps(_mm_set1_ps(1.0f), mul1);
 	__m128 mul2 = _mm_mul_ps(sub0, sub1);
 
 	if(_mm_movemask_ps(_mm_cmplt_ss(mul2, _mm_set1_ps(0.0f))) == 0)
@@ -116,3 +118,5 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_rfa(__m128 I, __m128 N, __m128 eta)
 
 	return sub2;
 }
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2
diff --git a/glm/simd/integer.h b/glm/simd/integer.h
index aa5c53c3..f645f3f5 100644
--- a/glm/simd/integer.h
+++ b/glm/simd/integer.h
@@ -1,3 +1,115 @@
 /// @ref simd
 /// @file glm/simd/integer.h
 
+#pragma once
+
+#if GLM_ARCH & GLM_ARCH_SSE2
+
+GLM_FUNC_QUALIFIER __m128i glm_i128_interleave(__m128i x)
+{
+	__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF);
+	__m128i const Mask3 = _mm_set1_epi32(0x00FF00FF);
+	__m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F);
+	__m128i const Mask1 = _mm_set1_epi32(0x33333333);
+	__m128i const Mask0 = _mm_set1_epi32(0x55555555);
+
+	__m128i Reg1;
+	__m128i Reg2;
+
+	// REG1 = x;
+	// REG2 = y;
+	//Reg1 = _mm_unpacklo_epi64(x, y);
+	Reg1 = x;
+
+	//REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
+	//REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
+	Reg2 = _mm_slli_si128(Reg1, 2);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask4);
+
+	//REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
+	//REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
+	Reg2 = _mm_slli_si128(Reg1, 1);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask3);
+
+	//REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
+	//REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
+	Reg2 = _mm_slli_epi32(Reg1, 4);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask2);
+
+	//REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
+	//REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
+	Reg2 = _mm_slli_epi32(Reg1, 2);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask1);
+
+	//REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
+	//REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
+	Reg2 = _mm_slli_epi32(Reg1, 1);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask0);
+
+	//return REG1 | (REG2 << 1);
+	Reg2 = _mm_slli_epi32(Reg1, 1);
+	Reg2 = _mm_srli_si128(Reg2, 8);
+	Reg1 = _mm_or_si128(Reg1, Reg2);
+
+	return Reg1;
+}
+
+GLM_FUNC_QUALIFIER __m128i glm_i128_interleave2(__m128i x, __m128i y)
+{
+	__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF);
+	__m128i const Mask3 = _mm_set1_epi32(0x00FF00FF);
+	__m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F);
+	__m128i const Mask1 = _mm_set1_epi32(0x33333333);
+	__m128i const Mask0 = _mm_set1_epi32(0x55555555);
+
+	__m128i Reg1;
+	__m128i Reg2;
+
+	// REG1 = x;
+	// REG2 = y;
+	Reg1 = _mm_unpacklo_epi64(x, y);
+
+	//REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF);
+	//REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF);
+	Reg2 = _mm_slli_si128(Reg1, 2);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask4);
+
+	//REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF);
+	//REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF);
+	Reg2 = _mm_slli_si128(Reg1, 1);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask3);
+
+	//REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F);
+	//REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F);
+	Reg2 = _mm_slli_epi32(Reg1, 4);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask2);
+
+	//REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333);
+	//REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333);
+	Reg2 = _mm_slli_epi32(Reg1, 2);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask1);
+
+	//REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555);
+	//REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555);
+	Reg2 = _mm_slli_epi32(Reg1, 1);
+	Reg1 = _mm_or_si128(Reg2, Reg1);
+	Reg1 = _mm_and_si128(Reg1, Mask0);
+
+	//return REG1 | (REG2 << 1);
+	Reg2 = _mm_slli_epi32(Reg1, 1);
+	Reg2 = _mm_srli_si128(Reg2, 8);
+	Reg1 = _mm_or_si128(Reg1, Reg2);
+
+	return Reg1;
+}
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2
diff --git a/glm/simd/matrix.h b/glm/simd/matrix.h
index d0e8d95a..fbcda869 100644
--- a/glm/simd/matrix.h
+++ b/glm/simd/matrix.h
@@ -5,16 +5,13 @@
 
 #include "geometric.h"
 
+#if GLM_ARCH & GLM_ARCH_SSE2
+
 static const __m128 GLM_VAR_USED _m128_rad_ps = _mm_set_ps1(3.141592653589793238462643383279f / 180.f);
 static const __m128 GLM_VAR_USED _m128_deg_ps = _mm_set_ps1(180.f / 3.141592653589793238462643383279f);
 
 template <typename matType>
-GLM_FUNC_QUALIFIER matType glm_comp_mul_f32m4
-(
-	__m128 const in1[4],
-	__m128 const in2[4],
-	__m128 out[4]
-)
+GLM_FUNC_QUALIFIER matType glm_f32m4_cml(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
 {
 	out[0] = _mm_mul_ps(in1[0], in2[0]);
 	out[1] = _mm_mul_ps(in1[1], in2[1]);
@@ -22,27 +19,23 @@ GLM_FUNC_QUALIFIER matType glm_comp_mul_f32m4
 	out[3] = _mm_mul_ps(in1[3], in2[3]);
 }
 
-GLM_FUNC_QUALIFIER void glm_add_f32m4(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
+GLM_FUNC_QUALIFIER void glm_f32m4_add(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
 {
-	{
-		out[0] = _mm_add_ps(in1[0], in2[0]);
-		out[1] = _mm_add_ps(in1[1], in2[1]);
-		out[2] = _mm_add_ps(in1[2], in2[2]);
-		out[3] = _mm_add_ps(in1[3], in2[3]);
-	}
+	out[0] = _mm_add_ps(in1[0], in2[0]);
+	out[1] = _mm_add_ps(in1[1], in2[1]);
+	out[2] = _mm_add_ps(in1[2], in2[2]);
+	out[3] = _mm_add_ps(in1[3], in2[3]);
 }
 
-GLM_FUNC_QUALIFIER void glm_sub_f32v4(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
+GLM_FUNC_QUALIFIER void glm_f32m4_sub(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
 {
-	{
-		out[0] = _mm_sub_ps(in1[0], in2[0]);
-		out[1] = _mm_sub_ps(in1[1], in2[1]);
-		out[2] = _mm_sub_ps(in1[2], in2[2]);
-		out[3] = _mm_sub_ps(in1[3], in2[3]);
-	}
+	out[0] = _mm_sub_ps(in1[0], in2[0]);
+	out[1] = _mm_sub_ps(in1[1], in2[1]);
+	out[2] = _mm_sub_ps(in1[2], in2[2]);
+	out[3] = _mm_sub_ps(in1[3], in2[3]);
 }
 
-GLM_FUNC_QUALIFIER __m128 glm_mul_f32v4(__m128 const m[4], __m128 v)
+GLM_FUNC_QUALIFIER __m128 glm_f32m4_mul(__m128 const m[4], __m128 v)
 {
 	__m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
 	__m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
@@ -61,7 +54,7 @@ GLM_FUNC_QUALIFIER __m128 glm_mul_f32v4(__m128 const m[4], __m128 v)
 	return a2;
 }
 
-GLM_FUNC_QUALIFIER __m128 glm_mul_f32v4(__m128 v, __m128 const m[4])
+GLM_FUNC_QUALIFIER __m128 glm_f32m4_mul(__m128 v, __m128 const m[4])
 {
 	__m128 i0 = m[0];
 	__m128 i1 = m[1];
@@ -88,7 +81,7 @@ GLM_FUNC_QUALIFIER __m128 glm_mul_f32v4(__m128 v, __m128 const m[4])
 	return f2;
 }
 
-GLM_FUNC_QUALIFIER void glm_mul_f32v4(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
+GLM_FUNC_QUALIFIER void glm_f32m4_mul(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
 {
 	{
 		__m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0));
@@ -164,7 +157,7 @@ GLM_FUNC_QUALIFIER void glm_mul_f32v4(__m128 const in1[4], __m128 const in2[4],
 	}
 }
 
-GLM_FUNC_QUALIFIER void glm_transpose_f32m4(__m128 const in[4], __m128 out[4])
+GLM_FUNC_QUALIFIER void glm_f32m4_transpose(__m128 const in[4], __m128 out[4])
 {
 	__m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44);
 	__m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE);
@@ -177,7 +170,7 @@ GLM_FUNC_QUALIFIER void glm_transpose_f32m4(__m128 const in[4], __m128 out[4])
 	out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
 }
 
-GLM_FUNC_QUALIFIER __m128 glm_det_highp_f32m4(__m128 const in[4])
+GLM_FUNC_QUALIFIER __m128 glm_f32m4_det_highp(__m128 const in[4])
 {
 	__m128 Fac0;
 	{
@@ -391,7 +384,7 @@ GLM_FUNC_QUALIFIER __m128 glm_det_highp_f32m4(__m128 const in[4])
 	return Det0;
 }
 
-GLM_FUNC_QUALIFIER __m128 glm_detd_f32m4(__m128 const m[4])
+GLM_FUNC_QUALIFIER __m128 glm_f32m4_detd(__m128 const m[4])
 {
 	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(
 
@@ -454,7 +447,7 @@ GLM_FUNC_QUALIFIER __m128 glm_detd_f32m4(__m128 const m[4])
 	return glm_f32v4_dot(m[0], DetCof);
 }
 
-GLM_FUNC_QUALIFIER __m128 glm_det_f32m4(__m128 const m[4])
+GLM_FUNC_QUALIFIER __m128 glm_f32m4_det(__m128 const m[4])
 {
 	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(add)
 
@@ -517,7 +510,7 @@ GLM_FUNC_QUALIFIER __m128 glm_det_f32m4(__m128 const m[4])
 	return glm_f32v4_dot(m[0], DetCof);
 }
 
-GLM_FUNC_QUALIFIER void glm_f32m4_inverse(__m128 const in[4], __m128 out[4])
+GLM_FUNC_QUALIFIER void glm_f32m4_inv(__m128 const in[4], __m128 out[4])
 {
 	__m128 Fac0;
 	{
@@ -728,7 +721,7 @@ GLM_FUNC_QUALIFIER void glm_f32m4_inverse(__m128 const in[4], __m128 out[4])
 	//	+ m[0][2] * Inverse[2][0]
 	//	+ m[0][3] * Inverse[3][0];
 	__m128 Det0 = glm_f32v4_dot(in[0], Row2);
-	__m128 Rcp0 = _mm_div_ps(glm_one, Det0);
+	__m128 Rcp0 = _mm_div_ps(_mm_set1_ps(1.0f), Det0);
 	//__m128 Rcp0 = _mm_rcp_ps(Det0);
 	// Inverse /= Determinant;
 	out[0] = _mm_mul_ps(Inv0, Rcp0);
@@ -738,7 +731,7 @@ GLM_FUNC_QUALIFIER void glm_f32m4_inverse(__m128 const in[4], __m128 out[4])
 	out[3] = _mm_mul_ps(Inv3, Rcp0);
 }
 
-GLM_FUNC_QUALIFIER void glm_lowp_f32v4_inverse(__m128 const in[4], __m128 out[4])
+GLM_FUNC_QUALIFIER void glm_f32m4_inv_lowp(__m128 const in[4], __m128 out[4])
 {
 	__m128 Fac0;
 	{
@@ -1035,3 +1028,5 @@ GLM_FUNC_QUALIFIER void glm_f32m4_outer(__m128 const & c, __m128 const & r, __m128 out[4])
 	out[2] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2)));
 	out[3] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3)));
 }
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2
diff --git a/glm/simd/packing.h b/glm/simd/packing.h
index 7333bb2c..cd84a261 100644
--- a/glm/simd/packing.h
+++ b/glm/simd/packing.h
@@ -1,2 +1,8 @@
 /// @ref simd
 /// @file glm/simd/packing.h
+
+#pragma once
+
+#if GLM_ARCH & GLM_ARCH_SSE2
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2
diff --git a/glm/simd/trigonometric.h b/glm/simd/trigonometric.h
index f7fa2008..37815145 100644
--- a/glm/simd/trigonometric.h
+++ b/glm/simd/trigonometric.h
@@ -1,2 +1,9 @@
 /// @ref simd
 /// @file glm/simd/trigonometric.h
+
+#pragma once
+
+#if GLM_ARCH & GLM_ARCH_SSE2
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2
+
diff --git a/glm/simd/vector_relational.h b/glm/simd/vector_relational.h
index e5870266..c49d5af8 100644
--- a/glm/simd/vector_relational.h
+++ b/glm/simd/vector_relational.h
@@ -1,2 +1,8 @@
 /// @ref simd
 /// @file glm/simd/vector_relational.h
+
+#pragma once
+
+#if GLM_ARCH & GLM_ARCH_SSE2
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2
diff --git a/test/core/core_func_common.cpp b/test/core/core_func_common.cpp
index b413168e..27a5c9d2 100644
--- a/test/core/core_func_common.cpp
+++ b/test/core/core_func_common.cpp
@@ -1237,8 +1237,6 @@ int main()
 {
 	int Error = 0;
 
-	__m128 const flr0 = glm_f32v4_flr(_mm_set_ps(1.1f, 1.9f, -1.1f, -1.9f));
-
 	glm::ivec4 const a(1);
 	glm::ivec4 const b = ~a;
 
diff --git a/test/gtc/gtc_bitfield.cpp b/test/gtc/gtc_bitfield.cpp
index 32f276f3..7858ccef 100644
--- a/test/gtc/gtc_bitfield.cpp
+++ b/test/gtc/gtc_bitfield.cpp
@@ -505,17 +505,17 @@ namespace bitfieldInterleave
 			assert(A == C);
 			assert(A == D);
 
-#			if(GLM_ARCH != GLM_ARCH_PURE)
+#			if GLM_ARCH & GLM_ARCH_SSE2
 				glm::uint64 E = sseBitfieldInterleave(x, y);
 				glm::uint64 F = sseUnalignedBitfieldInterleave(x, y);
 				assert(A == E);
 				assert(A == F);
 
-				__m128i G = glm::detail::_mm_bit_interleave_si128(_mm_set_epi32(0, y, 0, x));
+				__m128i G = glm_i128_interleave(_mm_set_epi32(0, y, 0, x));
 				glm::uint64 Result[2];
 				_mm_storeu_si128((__m128i*)Result, G);
 				assert(A == Result[0]);
-#			endif//(GLM_ARCH != GLM_ARCH_PURE)
+#			endif//GLM_ARCH & GLM_ARCH_SSE2
 		}
 	}
 
@@ -629,7 +629,7 @@ namespace bitfieldInterleave
 			std::printf("glm::detail::bitfieldInterleave Time %d clocks\n", static_cast<int>(Time));
 		}
 
-#		if(GLM_ARCH != GLM_ARCH_PURE && !(GLM_COMPILER & GLM_COMPILER_GCC))
+#		if(GLM_ARCH & GLM_ARCH_SSE2 && !(GLM_COMPILER & GLM_COMPILER_GCC))
 		{
 			// SIMD
 			std::vector<__m128i> SimdData;
@@ -642,13 +642,13 @@ namespace bitfieldInterleave
 			std::clock_t LastTime = std::clock();
 
 			for(std::size_t i = 0; i < SimdData.size(); ++i)
-				SimdData[i] = glm::detail::_mm_bit_interleave_si128(SimdParam[i]);
+				SimdData[i] = glm_i128_interleave(SimdParam[i]);
 
 			std::clock_t Time = std::clock() - LastTime;
 			std::printf("_mm_bit_interleave_si128 Time %d clocks\n", static_cast<int>(Time));
 		}
-#		endif//(GLM_ARCH != GLM_ARCH_PURE)
+#		endif//GLM_ARCH & GLM_ARCH_SSE2
 
 		return 0;
 	}
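Not part of the patch: a minimal, hypothetical usage sketch of the relocated glm_i128_interleave helper, mirroring the updated call site in test/gtc/gtc_bitfield.cpp. It assumes an SSE2 build and that including <glm/gtc/bitfield.hpp> (whose .inl now pulls in glm/simd/integer.h) is enough to make the helper visible; the check against the scalar glm::bitfieldInterleave reference follows the existing test.

// Hypothetical usage sketch (not part of the patch): exercises glm_i128_interleave
// the way the updated test/gtc/gtc_bitfield.cpp does, and verifies the result
// against the scalar glm::bitfieldInterleave reference.
// Assumes an SSE2 build; on a pure (non-SSE2) build the helper is compiled out.
#include <cassert>
#include <cstdio>
#include <emmintrin.h>              // SSE2 intrinsics
#include <glm/gtc/bitfield.hpp>     // pulls in glm/simd/integer.h after this patch

int main()
{
	glm::uint32 const x = 0x000000FF;
	glm::uint32 const y = 0x0000000F;

	// Scalar reference result.
	glm::uint64 const Ref = glm::bitfieldInterleave(x, y);

	// SIMD path: pack x and y into one register and interleave their bits.
	__m128i const Packed = _mm_set_epi32(0, static_cast<int>(y), 0, static_cast<int>(x));
	__m128i const Interleaved = glm_i128_interleave(Packed);

	glm::uint64 Result[2];
	_mm_storeu_si128(reinterpret_cast<__m128i*>(Result), Interleaved);

	assert(Ref == Result[0]);
	std::printf("interleave(0x%08X, 0x%08X) = 0x%016llX\n",
		x, y, static_cast<unsigned long long>(Result[0]));
	return 0;
}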