From c1006718b354e4c354bb520f454c75a53e90a090 Mon Sep 17 00:00:00 2001 From: Dave Reid Date: Wed, 24 Apr 2013 13:55:38 +1000 Subject: [PATCH] Add fastMix() and fastSlerp() implementations. These have stricter pre-conditions than standard mix() and slerp() - 1) Input quaternions must be unit length. - 2) The interpolation factor (a) must be in the range [0, 1] None of these restrictions should be too bad. The reason for these is that it uses fastAcos() and fastSin(), both of which have a limited allowable range. In my contrived tests, I observed about a 10x improvement over the standard versions. This is mostly because of the faster acos/sin operations. The fastSin(__m128) implementation also helps here because it can do four fastSin() operations simultaneously using SSE (mix() and slerp() each need three). --- glm/gtx/simd_quat.hpp | 37 +++++++++++++++++- glm/gtx/simd_quat.inl | 88 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 121 insertions(+), 4 deletions(-) diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp index b60b5650..a3921cb3 100644 --- a/glm/gtx/simd_quat.hpp +++ b/glm/gtx/simd_quat.hpp @@ -41,6 +41,7 @@ // Dependency: #include "../glm.hpp" #include "../gtc/quaternion.hpp" +#include "../gtx/fast_trigonometry.hpp" #if(GLM_ARCH != GLM_ARCH_PURE) @@ -223,7 +224,7 @@ namespace detail /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1]. /// @tparam T Value type used to build the quaternion. Supported: half, float or double. /// @see gtc_quaternion - /// @see - slerp(detail::tquat const & x, detail::tquat const & y, T const & a) + /// @see - slerp(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a) detail::fquatSIMD mix( detail::fquatSIMD const & x, detail::fquatSIMD const & y, @@ -255,6 +256,35 @@ namespace detail detail::fquatSIMD const & y, float const & a); + + /// Faster spherical linear interpolation of two unit length quaternions. + /// + /// This is the same as mix(), except for two rules: + /// 1) The two quaternions must be unit length. + /// 2) The interpolation factor (a) must be in the range [0, 1]. + /// + /// This will use the equivalent to fastAcos() and fastSin(). + /// + /// @see gtc_quaternion + /// @see - mix(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a) + detail::fquatSIMD fastMix( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + /// Identical to fastMix() except takes the shortest path. + /// + /// The same rules apply here as those in fastMix(). Both quaternions must be unit length and 'a' must be + /// in the range [0, 1]. + /// + /// @see - fastMix(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a) + /// @see - slerp(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a) + detail::fquatSIMD fastSlerp( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + /// Returns the q conjugate. /// /// @see gtc_quaternion @@ -292,6 +322,11 @@ namespace detail float const & z); + // TODO: Move this to somewhere more appropriate. Used with fastMix() and fastSlerp(). + /// Performs the equivalent of glm::fastSin() on each component of the given __m128. + __m128 fastSin(__m128 x); + + /// @} }//namespace glm diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl index d4fc4f3a..3ff493fb 100644 --- a/glm/gtx/simd_quat.inl +++ b/glm/gtx/simd_quat.inl @@ -423,9 +423,6 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD mix // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation // will most likely be faster, but that'll need to be left to people much smarter than I. - // - // The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of - // glm::sin(). Maybe a fastMix() function would be better for that? float s0 = glm::sin((1.0f - a) * angle); float s1 = glm::sin(a * angle); @@ -495,6 +492,73 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD slerp } } + +GLM_FUNC_QUALIFIER detail::fquatSIMD fastMix +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + float cosTheta = dot(x, y); + + if (cosTheta > 1.0f - glm::epsilon()) + { + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); + } + else + { + float angle = glm::fastAcos(cosTheta); + + + __m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f)); + + __m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 d = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1))); + + return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d); + } +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD fastSlerp +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + detail::fquatSIMD z = y; + + float cosTheta = dot(x, y); + if (cosTheta < 0.0f) + { + z = -y; + cosTheta = -cosTheta; + } + + + if(cosTheta > 1.0f - epsilon()) + { + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); + } + else + { + float angle = glm::fastAcos(cosTheta); + + + __m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f)); + + __m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 d = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1))); + + return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d); + } +} + + + GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate ( detail::fquatSIMD const & q @@ -544,4 +608,22 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD } +GLM_FUNC_QUALIFIER __m128 fastSin(__m128 x) +{ + static const __m128 c0 = _mm_set1_ps(0.16666666666666666666666666666667f); + static const __m128 c1 = _mm_set1_ps(0.00833333333333333333333333333333f); + static const __m128 c2 = _mm_set1_ps(0.00019841269841269841269841269841f); + + __m128 x3 = _mm_mul_ps(x, _mm_mul_ps(x, x)); + __m128 x5 = _mm_mul_ps(x3, _mm_mul_ps(x, x)); + __m128 x7 = _mm_mul_ps(x5, _mm_mul_ps(x, x)); + + __m128 y0 = _mm_mul_ps(x3, c0); + __m128 y1 = _mm_mul_ps(x5, c1); + __m128 y2 = _mm_mul_ps(x7, c2); + + return _mm_sub_ps(_mm_add_ps(_mm_sub_ps(x, y0), y1), y2); +} + + }//namespace glm