|
|
|
@ -270,12 +270,12 @@ inline __m128 _mm_inf_ps(__m128 x) |
|
|
|
|
|
|
|
|
|
// SSE scalar reciprocal sqrt using rsqrt op, plus one Newton-Raphson iteration |
|
|
|
|
// By Elan Ruskin, |
|
|
|
|
// Approximates 1/sqrt(x) for the lowest float lane of x. Despite the "sqrt"
// in the name, the value produced is the reciprocal square root.
//
// Steps: take the hardware estimate r = rsqrt(x), then refine it with one
// Newton-Raphson step, r' = (0.5 * r) * (3 - x * r * r).
//
// Only scalar (_ss) operations are used, so just lane 0 is computed; the upper
// three lanes of the result are carried over from `half` (0.5 each), not from x.
//
// The constants `three` and `half` are function-local statics (per the original
// note, aligned constants intended for a fast load).
//
// NOTE(review): for x == 0 the estimate is +inf and x * r * r becomes 0 * inf,
// so the refined result is NaN rather than +inf - confirm callers never pass 0.
//
// NOTE(review): this listing shows the function in two renderings (by-value
// signature with a spaced body, then by-reference signature with a compact
// body); both perform the same arithmetic - confirm which one is current.
inline __m128 _mm_sqrt_wip_ss(__m128 const x) |
|
|
|
|
inline __m128 _mm_sqrt_wip_ss(__m128 const & x) |
|
|
|
|
{ |
|
|
|
|
__m128 recip = _mm_rsqrt_ss( x ); // "estimate" opcode |
|
|
|
|
const static __m128 three = { 3, 3, 3, 3 }; // aligned consts for fast load |
|
|
|
|
const static __m128 half = { 0.5,0.5,0.5,0.5 }; |
|
|
|
|
__m128 halfrecip = _mm_mul_ss( half, recip ); |
|
|
|
|
__m128 threeminus_xrr = _mm_sub_ss( three, _mm_mul_ss( x, _mm_mul_ss ( recip, recip ) ) ); |
|
|
|
|
return _mm_mul_ss( halfrecip, threeminus_xrr ); |
|
|
|
|
__m128 recip = _mm_rsqrt_ss(x); // "estimate" opcode |
|
|
|
|
const static __m128 three = {3, 3, 3, 3}; // aligned consts for fast load |
|
|
|
|
const static __m128 half = {0.5,0.5,0.5,0.5}; |
|
|
|
|
__m128 halfrecip = _mm_mul_ss(half, recip); |
|
|
|
|
__m128 threeminus_xrr = _mm_sub_ss(three, _mm_mul_ss(x, _mm_mul_ss (recip, recip))); |
|
|
|
|
return _mm_mul_ss( halfrecip, threeminus_xrr); |
|
|
|
|
} |
|
|
|
|