@ -423,9 +423,6 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD mix
// Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation
// will most likely be faster, but that'll need to be left to people much smarter than I.
//
// The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of
// glm::sin(). Maybe a fastMix() function would be better for that?
float s0 = glm::sin((1.0f - a) * angle);
float s1 = glm::sin(a * angle);
@ -495,6 +492,73 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
}
}
// Interpolates from x to y by factor a, using the approximate fastAcos()/fastSin()
// helpers for the trigonometry.
//
// When dot(x, y) exceeds 1 - epsilon the result is the plain linear blend
// x + a * (y - x). Otherwise x and y are weighted by fastSin((1 - a) * angle)
// and fastSin(a * angle) and the sum is divided by fastSin(angle), where
// angle = fastAcos(dot(x, y)).
//
// The sign of dot(x, y) is not inspected here: y is always used as given.
GLM_FUNC_QUALIFIER detail::fquatSIMD fastMix
(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
)
{
	float const cosine = dot(x, y);

	// Nearly identical inputs: sin(angle) would be close to zero, so blend
	// linearly instead of dividing by it.
	if (cosine > 1.0f - glm::epsilon<float>())
	{
		__m128 const delta = _mm_sub_ps(y.Data, x.Data);
		__m128 const scaledDelta = _mm_mul_ps(_mm_set1_ps(a), delta);
		return _mm_add_ps(x.Data, scaledDelta);
	}

	float const theta = glm::fastAcos(cosine);

	// One fastSin() call evaluates all three sines. _mm_set_ps takes its
	// arguments from the highest lane down, so:
	//   lane 3 = sin((1 - a) * theta), lane 2 = sin(a * theta),
	//   lane 1 = sin(theta),           lane 0 = unused.
	__m128 const sines = glm::fastSin(_mm_set_ps((1.0f - a) * theta, a * theta, theta, 0.0f));

	__m128 const weightX = _mm_shuffle_ps(sines, sines, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 const weightY = _mm_shuffle_ps(sines, sines, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 const sinTheta = _mm_shuffle_ps(sines, sines, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 const invSinTheta = _mm_div_ps(_mm_set1_ps(1.0f), sinTheta);

	__m128 const weightedSum = _mm_add_ps(_mm_mul_ps(weightX, x.Data), _mm_mul_ps(weightY, y.Data));
	return _mm_mul_ps(weightedSum, invSinTheta);
}
// Interpolates from x to y by factor a, using the approximate fastAcos()/fastSin()
// helpers for the trigonometry.
//
// If dot(x, y) is negative, y is negated (and the cosine with it) before
// interpolating, so the blend is always performed against the operand whose
// dot product with x is non-negative.
//
// When the (possibly flipped) cosine exceeds 1 - epsilon the result is the
// linear blend x + a * (z - x); otherwise x and z are weighted by
// fastSin((1 - a) * angle) and fastSin(a * angle) and the sum is divided by
// fastSin(angle), where angle = fastAcos(cosine) and z is y or -y.
GLM_FUNC_QUALIFIER detail::fquatSIMD fastSlerp
(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
)
{
	detail::fquatSIMD z = y;

	float cosTheta = dot(x, y);
	if (cosTheta < 0.0f)
	{
		z = -y;
		cosTheta = -cosTheta;
	}

	if(cosTheta > 1.0f - epsilon<float>())
	{
		// Must blend towards z, not y: when the flip above was taken, y is
		// (nearly) -x and blending towards it would pass through zero.
		return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(z.Data, x.Data)));
	}
	else
	{
		float angle = glm::fastAcos(cosTheta);

		// _mm_set_ps takes its arguments from the highest lane down, so after
		// fastSin(): lane 3 = sin((1 - a) * angle), lane 2 = sin(a * angle),
		// lane 1 = sin(angle), lane 0 = unused.
		__m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));

		__m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
		__m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
		__m128 d = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));

		// The weights were derived from the flipped cosine, so they have to be
		// applied to the flipped operand z.
		return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, z.Data)), d);
	}
}
GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate
(
detail::fquatSIMD const & q
@ -544,4 +608,22 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
}
// Approximates sin(x) independently in each of the four lanes of x with the
// polynomial
//     x - x^3 / 6 + x^5 / 120 - x^7 / 5040
// No range reduction is applied to the input before evaluating the polynomial.
GLM_FUNC_QUALIFIER __m128 fastSin(__m128 x)
{
	// Plain locals instead of function-local statics: a static initialised by
	// a call needs an initialisation guard that is checked on every
	// invocation, whereas these broadcasts of literals need no guard.
	__m128 const c0 = _mm_set1_ps(0.16666666666666666666666666666667f); // 1 / 6
	__m128 const c1 = _mm_set1_ps(0.00833333333333333333333333333333f); // 1 / 120
	__m128 const c2 = _mm_set1_ps(0.00019841269841269841269841269841f); // 1 / 5040

	// x^2 is computed once and reused to build the odd powers.
	__m128 const x2 = _mm_mul_ps(x, x);
	__m128 const x3 = _mm_mul_ps(x, x2);
	__m128 const x5 = _mm_mul_ps(x3, x2);
	__m128 const x7 = _mm_mul_ps(x5, x2);

	__m128 const y0 = _mm_mul_ps(x3, c0);
	__m128 const y1 = _mm_mul_ps(x5, c1);
	__m128 const y2 = _mm_mul_ps(x7, c2);

	// ((x - y0) + y1) - y2
	return _mm_sub_ps(_mm_add_ps(_mm_sub_ps(x, y0), y1), y2);
}
}//namespace glm