From 7563a8bc4db0ffabceef4210a7b4deb10c17e2d0 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Tue, 23 Apr 2013 15:36:12 +1000
Subject: [PATCH 1/9] Add initial implementation of SIMD optimized quaternions.

A few things here can probably be improved by people a lot smarter than me,
but for the most part things are generally faster. A few notes:

- A fquatSIMD can be converted to a fmat4x4SIMD using mat4SIMD_cast().
- A tquat can be converted to a fquatSIMD using quatSIMD_cast().
- Some functions are virtually the same as their scalar counterparts because
  I've just not been able to get them faster.
- Only the basic functions are implemented. Future plans include fast,
  approximate normalize, length and mix/slerp functions.
---
 glm/gtx/simd_quat.hpp | 291 +++++++++++++++++++++++
 glm/gtx/simd_quat.inl | 532 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 823 insertions(+)
 create mode 100644 glm/gtx/simd_quat.hpp
 create mode 100644 glm/gtx/simd_quat.inl

diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp
new file mode 100644
index 00000000..962b9413
--- /dev/null
+++ b/glm/gtx/simd_quat.hpp
@@ -0,0 +1,291 @@
+///////////////////////////////////////////////////////////////////////////////////
+/// OpenGL Mathematics (glm.g-truc.net)
+///
+/// Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net)
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in
+/// all copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+/// THE SOFTWARE.
+///
+/// @ref gtx_simd_quat
+/// @file glm/gtx/simd_quat.hpp
+/// @date 2009-05-07 / 2011-06-07
+/// @author Christophe Riccio
+///
+/// @see core (dependence)
+///
+/// @defgroup gtx_simd_quat GLM_GTX_simd_quat
+/// @ingroup gtx
+///
+/// @brief SIMD implementation of quat type.
+///
+/// <glm/gtx/simd_quat.hpp> need to be included to use these functionalities.
+///////////////////////////////////////////////////////////////////////////////////
+
+#ifndef GLM_GTX_simd_quat
+#define GLM_GTX_simd_quat GLM_VERSION
+
+// Dependency:
+#include "../glm.hpp"
+#include "../gtc/quaternion.hpp"
+
+#if(GLM_ARCH != GLM_ARCH_PURE)
+
+#if(GLM_ARCH & GLM_ARCH_SSE2)
+#   include "../core/intrinsic_common.hpp"
+#   include "../core/intrinsic_geometric.hpp"
+#   include "../gtx/simd_mat4.hpp"
+#else
+#   error "GLM: GLM_GTX_simd_quat requires compiler support of SSE2 through intrinsics"
+#endif
+
+#if(defined(GLM_MESSAGES) && !defined(glm_ext))
+#   pragma message("GLM: GLM_GTX_simd_quat extension included")
+#endif
+
+
+// Warning silencer for nameless struct/union.
+#if (GLM_COMPILER & GLM_COMPILER_VC)
+#   pragma warning(push)
+#   pragma warning(disable:4201)   // warning C4201: nonstandard extension used : nameless struct/union
+#endif
+
+
+namespace glm{
+namespace detail
+{
+    /// Quaternion implemented using SIMD SSE intrinsics.
+    /// \ingroup gtx_simd_quat
+    GLM_ALIGNED_STRUCT(16) fquatSIMD
+    {
+        enum ctor{null};
+        typedef __m128 value_type;
+        typedef std::size_t size_type;
+        static size_type value_size();
+
+        typedef fquatSIMD type;
+        typedef tquat<bool> bool_type;
+
+#ifdef GLM_SIMD_ENABLE_XYZW_UNION
+        union
+        {
+            __m128 Data;
+            struct {float x, y, z, w;};
+        };
+#else
+        __m128 Data;
+#endif
+
+        //////////////////////////////////////
+        // Implicit basic constructors
+
+        fquatSIMD();
+        fquatSIMD(__m128 const & Data);
+        fquatSIMD(fquatSIMD const & q);
+
+        //////////////////////////////////////
+        // Explicit basic constructors
+
+        explicit fquatSIMD(
+            ctor);
+        explicit fquatSIMD(
+            float const & w,
+            float const & x,
+            float const & y,
+            float const & z);
+        explicit fquatSIMD(
+            quat const & v);
+        explicit fquatSIMD(
+            vec3 const & eulerAngles);
+
+
+        //////////////////////////////////////
+        // Unary arithmetic operators
+
+        fquatSIMD& operator =(fquatSIMD const & q);
+        fquatSIMD& operator*=(float const & s);
+        fquatSIMD& operator/=(float const & s);
+    };
+
+
+    //////////////////////////////////////
+    // Arithmetic operators
+
+    detail::fquatSIMD operator- (
+        detail::fquatSIMD const & q);
+
+    detail::fquatSIMD operator+ (
+        detail::fquatSIMD const & q,
+        detail::fquatSIMD const & p);
+
+    detail::fquatSIMD operator* (
+        detail::fquatSIMD const & q,
+        detail::fquatSIMD const & p);
+
+    detail::fvec4SIMD operator* (
+        detail::fquatSIMD const & q,
+        detail::fvec4SIMD const & v);
+
+    detail::fvec4SIMD operator* (
+        detail::fvec4SIMD const & v,
+        detail::fquatSIMD const & q);
+
+    detail::fquatSIMD operator* (
+        detail::fquatSIMD const & q,
+        float s);
+
+    detail::fquatSIMD operator* (
+        float s,
+        detail::fquatSIMD const & q);
+
+    detail::fquatSIMD operator/ (
+        detail::fquatSIMD const & q,
+        float s);
+
+}//namespace detail
+
+    typedef glm::detail::fquatSIMD simdQuat;
+
+    /// @addtogroup gtx_simd_quat
+    /// @{
+
+    //! Convert a simdQuat to a quat.
+    //! (From GLM_GTX_simd_quat extension)
+    quat quat_cast(
+        detail::fquatSIMD const & x);
+
+    //! Convert a simdMat4 to a simdQuat.
+    //! (From GLM_GTX_simd_quat extension)
+    detail::fquatSIMD quatSIMD_cast(
+        detail::fmat4x4SIMD const & m);
+
+    //! Convert a simdQuat to a simdMat4.
+    //! (From GLM_GTX_simd_quat extension)
+    detail::fmat4x4SIMD mat4SIMD_cast(
+        detail::fquatSIMD const & q);
+
+
+    /// Returns the length of the quaternion.
+    ///
+    /// @see gtc_quaternion
+    float length(
+        detail::fquatSIMD const & x);
+
+    /// Returns the normalized quaternion.
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD normalize(
+        detail::fquatSIMD const & x);
+
+    /// Returns dot product of q1 and q2, i.e., q1[0] * q2[0] + q1[1] * q2[1] + ...
+    ///
+    /// @see gtc_quaternion
+    float dot(
+        detail::fquatSIMD const & q1,
+        detail::fquatSIMD const & q2);
+
+    /// Spherical linear interpolation of two quaternions.
+    /// The interpolation is oriented and the rotation is performed at constant speed.
+    /// For short path spherical linear interpolation, use the slerp function.
+    ///
+    /// @param x A quaternion
+    /// @param y A quaternion
+    /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1].
+    /// @tparam T Value type used to build the quaternion. Supported: half, float or double.
+    /// @see gtc_quaternion
+    /// @see - slerp(detail::tquat<T> const & x, detail::tquat<T> const & y, T const & a)
+    detail::fquatSIMD mix(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+    /// Linear interpolation of two quaternions.
+    /// The interpolation is oriented.
+    ///
+    /// @param x A quaternion
+    /// @param y A quaternion
+    /// @param a Interpolation factor. The interpolation is defined in the range [0, 1].
+    /// @tparam T Value type used to build the quaternion. Supported: half, float or double.
+    /// @see gtc_quaternion
+    detail::fquatSIMD lerp(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+    /// Spherical linear interpolation of two quaternions.
+    /// The interpolation always takes the short path and the rotation is performed at constant speed.
+    ///
+    /// @param x A quaternion
+    /// @param y A quaternion
+    /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1].
+    /// @tparam T Value type used to build the quaternion. Supported: half, float or double.
+    /// @see gtc_quaternion
+    detail::fquatSIMD slerp(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+    /// Returns the q conjugate.
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD conjugate(
+        detail::fquatSIMD const & q);
+
+    /// Returns the q inverse.
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD inverse(
+        detail::fquatSIMD const & q);
+
+    /// Build a quaternion from an angle and a normalized axis.
+    ///
+    /// @param angle Angle expressed in radians if GLM_FORCE_RADIANS is defined or degrees otherwise.
+    /// @param axis Axis of the quaternion, must be normalized.
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD angleAxisSIMD(
+        float const & angle,
+        vec3 const & axis);
+
+    /// Build a quaternion from an angle and a normalized axis.
+    ///
+    /// @param angle Angle expressed in radians if GLM_FORCE_RADIANS is defined or degrees otherwise.
+    /// @param x x component of the axis, x, y, z must be a normalized axis
+    /// @param y y component of the axis, x, y, z must be a normalized axis
+    /// @param z z component of the axis, x, y, z must be a normalized axis
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD angleAxisSIMD(
+        float const & angle,
+        float const & x,
+        float const & y,
+        float const & z);
+
+
+    /// @}
+}//namespace glm
+
+#include "simd_quat.inl"
+
+
+#if (GLM_COMPILER & GLM_COMPILER_VC)
+#   pragma warning(pop)
+#endif
+
+
+#endif//(GLM_ARCH != GLM_ARCH_PURE)
+
+#endif//GLM_GTX_simd_quat
diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
new file mode 100644
index 00000000..f9df7a94
--- /dev/null
+++ b/glm/gtx/simd_quat.inl
@@ -0,0 +1,532 @@
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// OpenGL Mathematics Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net)
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Created : 2013-04-22
+// Updated : 2013-04-22
+// Licence : This source is under MIT License
+// File    : glm/gtx/simd_quat.inl
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+namespace glm{
+namespace detail{
+
+
+//////////////////////////////////////
+// Debugging
+#if 0
+void print(__m128 v)
+{
+    GLM_ALIGN(16) float result[4];
+    _mm_store_ps(result, v);
+
+    printf("__m128:    %f %f %f %f\n", result[0], result[1], result[2], result[3]);
+}
+
+void print(const fvec4SIMD &v)
+{
+    printf("fvec4SIMD: %f %f %f %f\n", v.x, v.y, v.z, v.w);
+}
+#endif
+
+
+//////////////////////////////////////
+// Implicit basic constructors
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD()
+#ifdef GLM_SIMD_ENABLE_DEFAULT_INIT
+    : Data(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f))
+#endif
+{}
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(__m128 const & Data) :
+    Data(Data)
+{}
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(fquatSIMD const & q) :
+    Data(q.Data)
+{}
+
+
+//////////////////////////////////////
+// Explicit basic constructors
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(float const & w, float const & x, float const & y, float const & z) :
+    Data(_mm_set_ps(w, z, y, x))
+{}
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(quat const & q) :
+    Data(_mm_set_ps(q.w, q.z, q.y, q.x))
+{}
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(vec3 const & eulerAngles)
+{
+    vec3 c = glm::cos(eulerAngles * 0.5f);
+    vec3 s = glm::sin(eulerAngles * 0.5f);
+
+    Data = _mm_set_ps(
+        (c.x * c.y * c.z) + (s.x * s.y * s.z),
+        (c.x * c.y * s.z) - (s.x * s.y * c.z),
+        (c.x * s.y * c.z) + (s.x * c.y * s.z),
+        (s.x * c.y * c.z) - (c.x * s.y * s.z));
+}
+
+
+//////////////////////////////////////
+// Unary arithmetic operators
+
+GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator=(fquatSIMD const & q)
+{
+    this->Data = q.Data;
+    return *this;
+}
+
+GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator*=(float const & s)
+{
+    this->Data = _mm_mul_ps(this->Data, _mm_set_ps1(s));
+    return *this;
+}
+
+GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator/=(float const & s)
+{
+    this->Data = _mm_div_ps(Data, _mm_set1_ps(s));
+    return *this;
+}
+
+
+
+// negate operator
+GLM_FUNC_QUALIFIER fquatSIMD operator- (fquatSIMD const & q)
+{
+    return fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(-1.0f, -1.0f, -1.0f, -1.0f)));
+}
+
+// operator+
+GLM_FUNC_QUALIFIER fquatSIMD operator+ (fquatSIMD const & q1, fquatSIMD const & q2)
+{
+    return fquatSIMD(_mm_add_ps(q1.Data, q2.Data));
+}
+
+//operator*
+GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & q2)
+{
+    // SSE2 STATS:
+    //    11 shuffle
+    //    8 mul
+    //    8 add
+
+    // SSE3 STATS:
+    //    3 shuffle
+    //    8 mul
+    //    8 add
+
+    // SSE4 STATS:
+    //    3 shuffle
+    //    8 mul
+    //    4 dpps
+
+    __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
+    __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
+    __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
+    __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
+
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+
+
+#   if((GLM_ARCH & GLM_ARCH_SSE4))
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff);
+#   elif((GLM_ARCH & GLM_ARCH_SSE3))
+    __m128 add0 = _mm_hadd_ps(mul0, mul0);
+           add0 = _mm_hadd_ps(add0, add0);
+    __m128 add1 = _mm_hadd_ps(mul1, mul1);
+           add1 = _mm_hadd_ps(add1, add1);
+    __m128 add2 = _mm_hadd_ps(mul2, mul2);
+           add2 = _mm_hadd_ps(add2, add2);
+    __m128 add3 = _mm_hadd_ps(mul3, mul3);
+           add3 = _mm_hadd_ps(add3, add3);
+#   else
+    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
+           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
+           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
+           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
+           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+#endif
+
+
+    // I had tried something clever here using shuffles to produce the final result, but it turns out that using
+    // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case.
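+    // (Editorial note: each addN register now holds one scalar component of the
+    // product in its lowest lane - add0 = w, add1 = x, add2 = y, add3 = z - which
+    // is what the _mm_store_ss() calls below pull out.)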
+
+    float w;
+    float x;
+    float y;
+    float z;
+
+    _mm_store_ss(&w, add0);
+    _mm_store_ss(&x, add1);
+    _mm_store_ss(&y, add2);
+    _mm_store_ss(&z, add3);
+
+    return detail::fquatSIMD(w, x, y, z);
+
+
+    //return _mm_shuffle_ps(_mm_shuffle_ps(add1, add2, 0),
+    //                      _mm_shuffle_ps(add3, add0, 0),
+    //                      _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)
+{
+    __m128 uv  = detail::sse_xpd_ps(q.Data, v.Data);
+    __m128 uuv = detail::sse_xpd_ps(q.Data, uv);
+
+    uv  = _mm_mul_ps(uv,  _mm_mul_ps(_mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)), _mm_set1_ps(2.0f)));
+    uuv = _mm_mul_ps(uuv, _mm_set1_ps(2.0f));
+
+    return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
+}
+
+GLM_FUNC_QUALIFIER fvec4SIMD operator* (fvec4SIMD const & v, fquatSIMD const & q)
+{
+    return inverse(q) * v;
+}
+
+GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q, float s)
+{
+    return fquatSIMD(_mm_mul_ps(q.Data, _mm_set1_ps(s)));
+}
+
+GLM_FUNC_QUALIFIER fquatSIMD operator* (float s, fquatSIMD const & q)
+{
+    return fquatSIMD(_mm_mul_ps(_mm_set1_ps(s), q.Data));
+}
+
+
+//operator/
+GLM_FUNC_QUALIFIER fquatSIMD operator/ (fquatSIMD const & q, float s)
+{
+    return fquatSIMD(_mm_div_ps(q.Data, _mm_set1_ps(s)));
+}
+
+
+}//namespace detail
+
+
+GLM_FUNC_QUALIFIER quat quat_cast
+(
+    detail::fquatSIMD const & x
+)
+{
+    GLM_ALIGN(16) quat Result;
+    _mm_store_ps(&Result[0], x.Data);
+    return Result;
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
+(
+    detail::fmat4x4SIMD const & m
+)
+{
+    // Scalar implementation for now.
+
+    GLM_ALIGN(16) float m0[4];
+    GLM_ALIGN(16) float m1[4];
+    GLM_ALIGN(16) float m2[4];
+    GLM_ALIGN(16) float m3[4];
+
+    _mm_store_ps(m0, m[0].Data);
+    _mm_store_ps(m1, m[1].Data);
+    _mm_store_ps(m2, m[2].Data);
+    _mm_store_ps(m3, m[3].Data);
+
+
+    float trace = m0[0] + m1[1] + m2[2] + 1.0f;
+    if (trace > 0)
+    {
+        float s = 0.5f / sqrt(trace);
+
+        return _mm_set_ps(
+            0.25f / s,
+            (m0[1] - m1[0]) * s,
+            (m2[0] - m0[2]) * s,
+            (m1[2] - m2[1]) * s);
+    }
+    else
+    {
+        if (m0[0] > m1[1])
+        {
+            if (m0[0] > m2[2])
+            {
+                // X is biggest.
+                float s = sqrt(m0[0] - m1[1] - m2[2] + 1.0f) * 0.5f;
+
+                return _mm_set_ps(
+                    (m1[2] - m2[1]) * s,
+                    (m2[0] + m0[2]) * s,
+                    (m0[1] + m1[0]) * s,
+                    0.5f * s);
+            }
+        }
+        else
+        {
+            if (m1[1] > m2[2])
+            {
+                // Y is biggest.
+                float s = sqrt(m1[1] - m0[0] - m2[2] + 1.0f) * 0.5f;
+
+                return _mm_set_ps(
+                    (m2[0] - m0[2]) * s,
+                    (m1[2] + m2[1]) * s,
+                    0.5f * s,
+                    (m0[1] + m1[0]) * s);
+            }
+        }
+
+        // Z is biggest.
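+        // (Editorial note: branching on the largest diagonal element keeps the
+        // argument of sqrt() well away from zero, so the divisions by s below
+        // stay numerically stable.)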
+        float s = sqrt(m2[2] - m0[0] - m1[1] + 1.0f) * 0.5f;
+
+        return _mm_set_ps(
+            (m0[1] - m1[0]) * s,
+            0.5f * s,
+            (m1[2] + m2[1]) * s,
+            (m2[0] + m0[2]) * s);
+    }
+}
+
+
+GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
+(
+    detail::fquatSIMD const & q
+)
+{
+    detail::fmat4x4SIMD result;
+
+    __m128 _wwww = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
+    __m128 _xyzw = q.Data;
+    __m128 _zxyw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
+    __m128 _yzxw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));
+
+    __m128 _xyzw2 = _mm_add_ps(_xyzw, _xyzw);
+    __m128 _zxyw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 1, 0, 2));
+    __m128 _yzxw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 0, 2, 1));
+
+    __m128 _tmp0 = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_yzxw2, _yzxw));
+           _tmp0 = _mm_sub_ps(_tmp0, _mm_mul_ps(_zxyw2, _zxyw));
+
+    __m128 _tmp1 = _mm_mul_ps(_yzxw2, _xyzw);
+           _tmp1 = _mm_add_ps(_tmp1, _mm_mul_ps(_zxyw2, _wwww));
+
+    __m128 _tmp2 = _mm_mul_ps(_zxyw2, _xyzw);
+           _tmp2 = _mm_sub_ps(_tmp2, _mm_mul_ps(_yzxw2, _wwww));
+
+
+    // There's probably a better, more politically correct way of doing this...
+    result[0].Data = _mm_set_ps(
+        0.0f,
+        reinterpret_cast<float*>(&_tmp2)[0],
+        reinterpret_cast<float*>(&_tmp1)[0],
+        reinterpret_cast<float*>(&_tmp0)[0]);
+
+    result[1].Data = _mm_set_ps(
+        0.0f,
+        reinterpret_cast<float*>(&_tmp1)[1],
+        reinterpret_cast<float*>(&_tmp0)[1],
+        reinterpret_cast<float*>(&_tmp2)[1]);
+
+    result[2].Data = _mm_set_ps(
+        0.0f,
+        reinterpret_cast<float*>(&_tmp0)[2],
+        reinterpret_cast<float*>(&_tmp2)[2],
+        reinterpret_cast<float*>(&_tmp1)[2]);
+
+    result[3].Data = _mm_set_ps(
+        1.0f,
+        0.0f,
+        0.0f,
+        0.0f);
+
+
+    return result;
+}
+
+
+
+GLM_FUNC_QUALIFIER float length
+(
+    detail::fquatSIMD const & q
+)
+{
+    return glm::sqrt(dot(q, q));
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD normalize
+(
+    detail::fquatSIMD const & q
+)
+{
+    return _mm_mul_ps(q.Data, _mm_set1_ps(1.0f / length(q)));
+}
+
+GLM_FUNC_QUALIFIER float dot
+(
+    detail::fquatSIMD const & q1,
+    detail::fquatSIMD const & q2
+)
+{
+    float result;
+    _mm_store_ss(&result, detail::sse_dot_ps(q1.Data, q2.Data));
+
+    return result;
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD mix
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    float cosTheta = dot(x, y);
+
+    if (cosTheta > 1.0f - glm::epsilon<float>())
+    {
+        return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
+    }
+    else
+    {
+        float angle = glm::acos(cosTheta);
+
+
+        // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation
+        // will most likely be faster, but that'll need to be left to people much smarter than I.
+        //
+        // The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of
+        // glm::sin(). Maybe a fastMix() function would be better for that?
+
+        float s0 = glm::sin((1.0f - a) * angle);
+        float s1 = glm::sin(a * angle);
+        float d  = 1.0f / glm::sin(angle);
+
+        return (s0 * x + s1 * y) * d;
+
+
+        //__m128 s0 = _mm_set1_ps(glm::sin((1.0f - a) * angle));
+        //__m128 s1 = _mm_set1_ps(glm::sin(a * angle));
+        //__m128 d  = _mm_set1_ps(1.0f / glm::sin(angle));
+        //
+        //return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
+    }
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD lerp
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    // Lerp is only defined in [0, 1]
+    assert(a >= 0.0f);
+    assert(a <= 1.0f);
+
+    return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    detail::fquatSIMD z = y;
+
+    float cosTheta = dot(x, y);
+
+    // If cosTheta < 0, the interpolation will take the long way around the sphere.
+    // To fix this, one quat must be negated.
+    if (cosTheta < 0.0f)
+    {
+        z        = -y;
+        cosTheta = -cosTheta;
+    }
+
+    // Perform a linear interpolation when cosTheta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator
+    if(cosTheta > 1.0f - epsilon<float>())
+    {
+        return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(z.Data, x.Data)));
+    }
+    else
+    {
+        float angle = glm::acos(cosTheta);
+
+
+        // See mix() above for an explanation of the rationale behind this non-SSE implementation.
+
+        float s0 = glm::sin((1.0f - a) * angle);
+        float s1 = glm::sin(a * angle);
+        float d  = 1.0f / glm::sin(angle);
+
+        return (s0 * x + s1 * z) * d;
+    }
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate
+(
+    detail::fquatSIMD const & q
+)
+{
+    return detail::fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)));
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD inverse
+(
+    detail::fquatSIMD const & q
+)
+{
+    return conjugate(q) / dot(q, q);
+}
+
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
+(
+    float const & angle,
+    vec3 const & v
+)
+{
+#ifdef GLM_FORCE_RADIANS
+    float a(angle);
+#else
+    float a(glm::radians(angle));
+#endif
+    float s = glm::sin(a * 0.5f);
+
+    return _mm_set_ps(
+        glm::cos(a * 0.5f),
+        v.z * s,
+        v.y * s,
+        v.x * s);
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
+(
+    float const & angle,
+    float const & x,
+    float const & y,
+    float const & z
+)
+{
+    return angleAxisSIMD(angle, vec3(x, y, z));
+}
+
+
+}//namespace glm

From 197b6c96d8a068fcd1002b0731def6bcbb2af5fc Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 07:58:41 +1000
Subject: [PATCH 2/9] Improve efficiency of operator*(fquatSIMD, fquatSIMD) in
 SSE4 mode.

Now only requires 3 shuffle, 4 mul and 4 dpps.
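The trick is that _mm_dp_ps() can apply the sign flips itself: rather than
multiplying by a +/-1 mask and then dotting against a vector of ones, the mask
becomes the second dpps operand. For example, for the w component:

    // dot(mul0, (w=1, z=-1, y=-1, x=-1)) == w1*w2 - x1*x2 - y1*y2 - z1*z2,
    // i.e. the w component of the quaternion product in a single instruction.
    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);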
---
 glm/gtx/simd_quat.inl | 57 ++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index f9df7a94..25848a57 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -122,7 +122,7 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
 
     // SSE4 STATS:
     //    3 shuffle
-    //    8 mul
+    //    4 mul
     //    4 dpps
 
     __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
@@ -130,35 +130,36 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
     __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
 
-    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
-    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
-    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
-    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
-
-
 #   if((GLM_ARCH & GLM_ARCH_SSE4))
-    __m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff);
-    __m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff);
-    __m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff);
-    __m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff);
-#   elif((GLM_ARCH & GLM_ARCH_SSE3))
-    __m128 add0 = _mm_hadd_ps(mul0, mul0);
-           add0 = _mm_hadd_ps(add0, add0);
-    __m128 add1 = _mm_hadd_ps(mul1, mul1);
-           add1 = _mm_hadd_ps(add1, add1);
-    __m128 add2 = _mm_hadd_ps(mul2, mul2);
-           add2 = _mm_hadd_ps(add2, add2);
-    __m128 add3 = _mm_hadd_ps(mul3, mul3);
-           add3 = _mm_hadd_ps(add3, add3);
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
 #   else
-    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
-           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
-    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
-           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
-    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
-           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
-    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
-           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+
+#   if((GLM_ARCH & GLM_ARCH_SSE3))
+    __m128 add0 = _mm_hadd_ps(mul0, mul0);
+           add0 = _mm_hadd_ps(add0, add0);
+    __m128 add1 = _mm_hadd_ps(mul1, mul1);
+           add1 = _mm_hadd_ps(add1, add1);
+    __m128 add2 = _mm_hadd_ps(mul2, mul2);
+           add2 = _mm_hadd_ps(add2, add2);
+    __m128 add3 = _mm_hadd_ps(mul3, mul3);
+           add3 = _mm_hadd_ps(add3, add3);
+#   else
+    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
+           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
+           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
+           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
+           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+#   endif
 #endif

From 13837e107976e390515ace747362131e303c10f8 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 08:51:17 +1000
Subject: [PATCH 3/9] Remove the SSE3 implementation in operator*(fquatSIMD,
 fquatSIMD).

The SSE2 version is now running faster than the SSE3 version.
---
 glm/gtx/simd_quat.inl | 45 +++++++++++++++----------------------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 25848a57..9502a6ab 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -115,11 +115,6 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     //    8 mul
     //    8 add
 
-    // SSE3 STATS:
-    //    3 shuffle
-    //    8 mul
-    //    8 add
-
     // SSE4 STATS:
     //    3 shuffle
     //    4 mul
@@ -136,33 +131,23 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
     __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
 #   else
-    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
-    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
-    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
-    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
-
-#   if((GLM_ARCH & GLM_ARCH_SSE3))
-    __m128 add0 = _mm_hadd_ps(mul0, mul0);
-           add0 = _mm_hadd_ps(add0, add0);
-    __m128 add1 = _mm_hadd_ps(mul1, mul1);
-           add1 = _mm_hadd_ps(add1, add1);
-    __m128 add2 = _mm_hadd_ps(mul2, mul2);
-           add2 = _mm_hadd_ps(add2, add2);
-    __m128 add3 = _mm_hadd_ps(mul3, mul3);
-           add3 = _mm_hadd_ps(add3, add3);
-#   else
-    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
-           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
-    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
-           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
-    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
-           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
-    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
-           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
-#   endif
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
+    add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
+    add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
+    add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
+    add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
 #endif
 
-
     // I had tried something clever here using shuffles to produce the final result, but it turns out that using
     // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case.

From 942bf08fe30598d415a6d0bb1620e81b9fff6523 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 09:39:22 +1000
Subject: [PATCH 4/9] Add the ability to convert standard mat4s and mat3s to
 SIMD quats.
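
A rough usage sketch (hypothetical variable names; uses the new quatSIMD_cast()
overloads from this patch plus quat_cast() from patch 1):

    glm::mat4 m = glm::mat4(1.0f);                    // any rotation matrix
    glm::detail::fquatSIMD q = glm::quatSIMD_cast(m); // standard mat4 -> SIMD quat
    glm::quat back = glm::quat_cast(q);               // and back to a plain quat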
---
 glm/gtx/simd_quat.hpp | 14 +++++-
 glm/gtx/simd_quat.inl | 99 ++++++++++++++++++++++++++-----------------
 2 files changed, 73 insertions(+), 40 deletions(-)

diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp
index 962b9413..cce5ee61 100644
--- a/glm/gtx/simd_quat.hpp
+++ b/glm/gtx/simd_quat.hpp
@@ -77,7 +77,7 @@ namespace detail
         static size_type value_size();
 
         typedef fquatSIMD type;
-        typedef tquat<bool> bool_type;
+        typedef tquat<bool> bool_type;
 
 #ifdef GLM_SIMD_ENABLE_XYZW_UNION
         union
@@ -172,6 +172,18 @@ namespace detail
     detail::fquatSIMD quatSIMD_cast(
         detail::fmat4x4SIMD const & m);
 
+    //! Converts a mat4 to a simdQuat.
+    //! (From GLM_GTX_simd_quat extension)
+    template <typename T>
+    detail::fquatSIMD quatSIMD_cast(
+        detail::tmat4x4<T> const & m);
+
+    //! Converts a mat3 to a simdQuat.
+    //! (From GLM_GTX_simd_quat extension)
+    template <typename T>
+    detail::fquatSIMD quatSIMD_cast(
+        detail::tmat3x3<T> const & m);
+
     //! Convert a simdQuat to a simdMat4.
     //! (From GLM_GTX_simd_quat extension)
     detail::fmat4x4SIMD mat4SIMD_cast(
diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 9502a6ab..90b9a730 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -214,37 +214,23 @@ GLM_FUNC_QUALIFIER quat quat_cast
 {
     GLM_ALIGN(16) quat Result;
     _mm_store_ps(&Result[0], x.Data);
+
     return Result;
 }
 
-GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
-(
-    detail::fmat4x4SIMD const & m
-)
+template <typename T>
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast_impl(const T m0[], const T m1[], const T m2[])
 {
-    // Scalar implementation for now.
-
-    GLM_ALIGN(16) float m0[4];
-    GLM_ALIGN(16) float m1[4];
-    GLM_ALIGN(16) float m2[4];
-    GLM_ALIGN(16) float m3[4];
-
-    _mm_store_ps(m0, m[0].Data);
-    _mm_store_ps(m1, m[1].Data);
-    _mm_store_ps(m2, m[2].Data);
-    _mm_store_ps(m3, m[3].Data);
-
-
-    float trace = m0[0] + m1[1] + m2[2] + 1.0f;
-    if (trace > 0)
+    T trace = m0[0] + m1[1] + m2[2] + T(1.0);
+    if (trace > T(0))
     {
-        float s = 0.5f / sqrt(trace);
+        T s = T(0.5) / sqrt(trace);
 
         return _mm_set_ps(
-            0.25f / s,
-            (m0[1] - m1[0]) * s,
-            (m2[0] - m0[2]) * s,
-            (m1[2] - m2[1]) * s);
+            static_cast<float>(T(0.25) / s),
+            static_cast<float>((m0[1] - m1[0]) * s),
+            static_cast<float>((m2[0] - m0[2]) * s),
+            static_cast<float>((m1[2] - m2[1]) * s));
     }
     else
     {
@@ -253,13 +239,13 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
             if (m0[0] > m2[2])
             {
                 // X is biggest.
-                float s = sqrt(m0[0] - m1[1] - m2[2] + 1.0f) * 0.5f;
+                T s = sqrt(m0[0] - m1[1] - m2[2] + T(1.0)) * T(0.5);
 
                 return _mm_set_ps(
-                    (m1[2] - m2[1]) * s,
-                    (m2[0] + m0[2]) * s,
-                    (m0[1] + m1[0]) * s,
-                    0.5f * s);
+                    static_cast<float>((m1[2] - m2[1]) * s),
+                    static_cast<float>((m2[0] + m0[2]) * s),
+                    static_cast<float>((m0[1] + m1[0]) * s),
+                    static_cast<float>(T(0.5) * s));
             }
         }
         else
@@ -267,27 +253,62 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
             if (m1[1] > m2[2])
             {
                 // Y is biggest.
-                float s = sqrt(m1[1] - m0[0] - m2[2] + 1.0f) * 0.5f;
+                T s = sqrt(m1[1] - m0[0] - m2[2] + T(1.0)) * T(0.5);
 
                 return _mm_set_ps(
-                    (m2[0] - m0[2]) * s,
-                    (m1[2] + m2[1]) * s,
-                    0.5f * s,
-                    (m0[1] + m1[0]) * s);
+                    static_cast<float>((m2[0] - m0[2]) * s),
+                    static_cast<float>((m1[2] + m2[1]) * s),
+                    static_cast<float>(T(0.5) * s),
+                    static_cast<float>((m0[1] + m1[0]) * s));
             }
         }
 
         // Z is biggest.
-        float s = sqrt(m2[2] - m0[0] - m1[1] + 1.0f) * 0.5f;
+        T s = sqrt(m2[2] - m0[0] - m1[1] + T(1.0)) * T(0.5);
 
         return _mm_set_ps(
-            (m0[1] - m1[0]) * s,
-            0.5f * s,
-            (m1[2] + m2[1]) * s,
-            (m2[0] + m0[2]) * s);
+            static_cast<float>((m0[1] - m1[0]) * s),
+            static_cast<float>(T(0.5) * s),
+            static_cast<float>((m1[2] + m2[1]) * s),
+            static_cast<float>((m2[0] + m0[2]) * s));
     }
 }
 
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
+(
+    detail::fmat4x4SIMD const & m
+)
+{
+    // Scalar implementation for now.
+    GLM_ALIGN(16) float m0[4];
+    GLM_ALIGN(16) float m1[4];
+    GLM_ALIGN(16) float m2[4];
+
+    _mm_store_ps(m0, m[0].Data);
+    _mm_store_ps(m1, m[1].Data);
+    _mm_store_ps(m2, m[2].Data);
+
+    return quatSIMD_cast_impl(m0, m1, m2);
+}
+
+template <typename T>
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
+(
+    detail::tmat4x4<T> const & m
+)
+{
+    return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]);
+}
+
+template <typename T>
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
+(
+    detail::tmat3x3<T> const & m
+)
+{
+    return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]);
+}
+
 
 GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
 (

From d07496460a39d7a94846e6e8eb040bd4d3e8f698 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 09:54:28 +1000
Subject: [PATCH 5/9] Add the ability to convert a SIMD quat to a standard
 mat4.

---
 glm/gtx/simd_quat.hpp | 5 +++++
 glm/gtx/simd_quat.inl | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp
index cce5ee61..b60b5650 100644
--- a/glm/gtx/simd_quat.hpp
+++ b/glm/gtx/simd_quat.hpp
@@ -189,6 +189,11 @@ namespace detail
     detail::fmat4x4SIMD mat4SIMD_cast(
         detail::fquatSIMD const & q);
 
+    //! Converts a simdQuat to a standard mat4.
+    //! (From GLM_GTX_simd_quat extension)
+    mat4 mat4_cast(
+        detail::fquatSIMD const & q);
+
 
     /// Returns the length of the quaternion.
     ///
diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 90b9a730..d4fc4f3a 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -365,6 +365,14 @@ GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
     return result;
 }
 
+GLM_FUNC_QUALIFIER mat4 mat4_cast
+(
+    detail::fquatSIMD const & q
+)
+{
+    return mat4_cast(mat4SIMD_cast(q));
+}
+
 
 
 GLM_FUNC_QUALIFIER float length

From c1006718b354e4c354bb520f454c75a53e90a090 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 13:55:38 +1000
Subject: [PATCH 6/9] Add fastMix() and fastSlerp() implementations.

These have stricter pre-conditions than standard mix() and slerp():

1) Input quaternions must be unit length.
2) The interpolation factor (a) must be in the range [0, 1].

Neither of these restrictions should be too bad. The reason for them is that
the implementations use fastAcos() and fastSin(), both of which have a
limited allowable range.

In my contrived tests, I observed about a 10x improvement over the standard
versions. This is mostly because of the faster acos/sin operations. The
fastSin(__m128) implementation also helps here because it can do four
fastSin() operations simultaneously using SSE (mix() and slerp() each need
three).
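
A usage sketch under those pre-conditions (hypothetical values):

    glm::simdQuat q0 = glm::normalize(glm::simdQuat(1.0f, 2.0f, 3.0f, 4.0f)); // unit length
    glm::simdQuat q1 = glm::normalize(glm::simdQuat(4.0f, 3.0f, 2.0f, 1.0f)); // unit length
    glm::simdQuat q  = glm::fastMix(q0, q1, 0.5f);                            // a in [0, 1]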
---
 glm/gtx/simd_quat.hpp | 37 +++++++++++++++++-
 glm/gtx/simd_quat.inl | 88 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 121 insertions(+), 4 deletions(-)

diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp
index b60b5650..a3921cb3 100644
--- a/glm/gtx/simd_quat.hpp
+++ b/glm/gtx/simd_quat.hpp
@@ -41,6 +41,7 @@
 // Dependency:
 #include "../glm.hpp"
 #include "../gtc/quaternion.hpp"
+#include "../gtx/fast_trigonometry.hpp"
 
 #if(GLM_ARCH != GLM_ARCH_PURE)
 
@@ -223,7 +224,7 @@ namespace detail
     /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1].
     /// @tparam T Value type used to build the quaternion. Supported: half, float or double.
     /// @see gtc_quaternion
-    /// @see - slerp(detail::tquat<T> const & x, detail::tquat<T> const & y, T const & a)
+    /// @see - slerp(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a)
     detail::fquatSIMD mix(
         detail::fquatSIMD const & x,
         detail::fquatSIMD const & y,
@@ -255,6 +256,35 @@ namespace detail
         detail::fquatSIMD const & y,
         float const & a);
 
+
+    /// Faster spherical linear interpolation of two unit length quaternions.
+    ///
+    /// This is the same as mix(), except for two rules:
+    ///    1) The two quaternions must be unit length.
+    ///    2) The interpolation factor (a) must be in the range [0, 1].
+    ///
+    /// This will use the equivalent of fastAcos() and fastSin().
+    ///
+    /// @see gtc_quaternion
+    /// @see - mix(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a)
+    detail::fquatSIMD fastMix(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+    /// Identical to fastMix() except takes the shortest path.
+    ///
+    /// The same rules apply here as those in fastMix(). Both quaternions must be unit length and 'a' must be
+    /// in the range [0, 1].
+    ///
+    /// @see - fastMix(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a)
+    /// @see - slerp(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a)
+    detail::fquatSIMD fastSlerp(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+
     /// Returns the q conjugate.
     ///
     /// @see gtc_quaternion
@@ -292,6 +322,11 @@ namespace detail
         float const & z);
 
 
+    // TODO: Move this to somewhere more appropriate. Used with fastMix() and fastSlerp().
+    /// Performs the equivalent of glm::fastSin() on each component of the given __m128.
+    __m128 fastSin(__m128 x);
+
+
     /// @}
 }//namespace glm
 
diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index d4fc4f3a..3ff493fb 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -423,9 +423,6 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD mix
 
         // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation
         // will most likely be faster, but that'll need to be left to people much smarter than I.
-        //
-        // The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of
-        // glm::sin(). Maybe a fastMix() function would be better for that?
 
         float s0 = glm::sin((1.0f - a) * angle);
         float s1 = glm::sin(a * angle);
@@ -495,6 +492,73 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
     }
 }
 
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD fastMix
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    float cosTheta = dot(x, y);
+
+    if (cosTheta > 1.0f - glm::epsilon<float>())
+    {
+        return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
+    }
+    else
+    {
+        float angle = glm::fastAcos(cosTheta);
+
+
+        __m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));
+
+        __m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
+        __m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
+        __m128 d  = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));
+
+        return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
+    }
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD fastSlerp
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    detail::fquatSIMD z = y;
+
+    float cosTheta = dot(x, y);
+    if (cosTheta < 0.0f)
+    {
+        z        = -y;
+        cosTheta = -cosTheta;
+    }
+
+
+    if(cosTheta > 1.0f - epsilon<float>())
+    {
+        return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(z.Data, x.Data)));
+    }
+    else
+    {
+        float angle = glm::fastAcos(cosTheta);
+
+
+        __m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));
+
+        __m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
+        __m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
+        __m128 d  = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));
+
+        return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, z.Data)), d);
+    }
+}
+
+
+
 GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate
 (
     detail::fquatSIMD const & q
@@ -544,4 +608,22 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
 }
 
 
+GLM_FUNC_QUALIFIER __m128 fastSin(__m128 x)
+{
+    static const __m128 c0 = _mm_set1_ps(0.16666666666666666666666666666667f);
+    static const __m128 c1 = _mm_set1_ps(0.00833333333333333333333333333333f);
+    static const __m128 c2 = _mm_set1_ps(0.00019841269841269841269841269841f);
+
+    __m128 x3 = _mm_mul_ps(x,  _mm_mul_ps(x, x));
+    __m128 x5 = _mm_mul_ps(x3, _mm_mul_ps(x, x));
+    __m128 x7 = _mm_mul_ps(x5, _mm_mul_ps(x, x));
+
+    __m128 y0 = _mm_mul_ps(x3, c0);
+    __m128 y1 = _mm_mul_ps(x5, c1);
+    __m128 y2 = _mm_mul_ps(x7, c2);
+
+    return _mm_sub_ps(_mm_add_ps(_mm_sub_ps(x, y0), y1), y2);
+}
+
+
 }//namespace glm

From 1eb88e4bc139c7855ddde33af08957e1753615d4 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 14:06:28 +1000
Subject: [PATCH 7/9] Cleanup.

---
 glm/gtx/simd_quat.inl | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 3ff493fb..6fce84c2 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -420,22 +420,12 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD mix
     {
         float angle = glm::acos(cosTheta);
 
-
-        // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation
-        // will most likely be faster, but that'll need to be left to people much smarter than I.
         float s0 = glm::sin((1.0f - a) * angle);
         float s1 = glm::sin(a * angle);
         float d  = 1.0f / glm::sin(angle);
 
         return (s0 * x + s1 * y) * d;
-
-
-        //__m128 s0 = _mm_set1_ps(glm::sin((1.0f - a) * angle));
-        //__m128 s1 = _mm_set1_ps(glm::sin(a * angle));
-        //__m128 d  = _mm_set1_ps(1.0f / glm::sin(angle));
-        //
-        //return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
     }
 }
 
@@ -482,8 +472,6 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
         float angle = glm::acos(cosTheta);
 
-
-        // See mix() above for an explanation of the rationale behind this non-SSE implementation.
-
         float s0 = glm::sin((1.0f - a) * angle);
         float s1 = glm::sin(a * angle);
         float d  = 1.0f / glm::sin(angle);
 
         return (s0 * x + s1 * z) * d;

From c08ea7656b5ed4117e6f651432fd6a07fa2e17e8 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 14:22:27 +1000
Subject: [PATCH 8/9] Reorganize operator* to make things a bit clearer.

---
 glm/gtx/simd_quat.inl | 50 +++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 6fce84c2..36aae93d 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -120,54 +120,54 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     //    4 mul
     //    4 dpps
 
-    __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
-    __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
-    __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
-    __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
+    __m128 mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
+    __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
+    __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
+    __m128 mul3 = _mm_mul_ps(q1.Data, q2.Data);
 
 #   if((GLM_ARCH & GLM_ARCH_SSE4))
-    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
-    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
-    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
-    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
 #   else
-    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
     __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
     add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
 
-    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
     add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
 
-    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
     __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
     add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
 
-    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
     __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
     add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
 #endif
 
-    // I had tried something clever here using shuffles to produce the final result, but it turns out that using
-    // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case.
-
-    float w;
+    // This SIMD code is a politically correct way of doing this, but in every test I've tried it has been slower than
+    // the final code below. I'll keep this here for reference - maybe somebody else can do something better...
+    //
+    //__m128 xxyy = _mm_shuffle_ps(add0, add1, _MM_SHUFFLE(0, 0, 0, 0));
+    //__m128 zzww = _mm_shuffle_ps(add2, add3, _MM_SHUFFLE(0, 0, 0, 0));
+    //
+    //return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));
+
     float x;
     float y;
     float z;
+    float w;
 
-    _mm_store_ss(&w, add0);
-    _mm_store_ss(&x, add1);
-    _mm_store_ss(&y, add2);
-    _mm_store_ss(&z, add3);
+    _mm_store_ss(&x, add0);
+    _mm_store_ss(&y, add1);
+    _mm_store_ss(&z, add2);
+    _mm_store_ss(&w, add3);
 
     return detail::fquatSIMD(w, x, y, z);
-
-
-    //return _mm_shuffle_ps(_mm_shuffle_ps(add1, add2, 0),
-    //                      _mm_shuffle_ps(add3, add0, 0),
-    //                      _MM_SHUFFLE(2, 0, 2, 0));
 }
 
 GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)

From e0cfd7d672f4b90e733f3d52a9e6dc764320e13c Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 15:00:03 +1000
Subject: [PATCH 9/9] Attempt a small optimization in operator*(fquatSIMD,
 fvec4SIMD).

No big improvement.
---
 glm/gtx/simd_quat.inl | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 36aae93d..d96f48d4 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -172,11 +172,22 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
 
 GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)
 {
-    __m128 uv  = detail::sse_xpd_ps(q.Data, v.Data);
-    __m128 uuv = detail::sse_xpd_ps(q.Data, uv);
+    static const __m128 two = _mm_set1_ps(2.0f);
+
+    __m128 q_wwww  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
+    __m128 q_swp0  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));
+    __m128 q_swp1  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
+    __m128 v_swp0  = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 0, 2, 1));
+    __m128 v_swp1  = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 1, 0, 2));
+
+    __m128 uv      = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
+    __m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
+    __m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
+    __m128 uuv     = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));
 
-    uv  = _mm_mul_ps(uv,  _mm_mul_ps(_mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)), _mm_set1_ps(2.0f)));
-    uuv = _mm_mul_ps(uuv, _mm_set1_ps(2.0f));
+
+    uv  = _mm_mul_ps(uv,  _mm_mul_ps(q_wwww, two));
+    uuv = _mm_mul_ps(uuv, two);
 
     return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
 }
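
Editorial note: both the old and the new operator*(fquatSIMD, fvec4SIMD) compute
the usual quaternion rotation identity v' = v + 2w(u x v) + 2(u x (u x v)), where
u is the quaternion's xyz part; the rewrite just inlines the two cross products so
the shuffles of q can be shared. A scalar sketch of the same identity (illustrative
only, not part of the patches):

    // v' = v + 2w * (u x v) + 2 * (u x (u x v)), with u = (q.x, q.y, q.z)
    glm::vec3 rotate(glm::quat const & q, glm::vec3 const & v)
    {
        glm::vec3 u(q.x, q.y, q.z);
        glm::vec3 uv  = glm::cross(u, v);
        glm::vec3 uuv = glm::cross(u, uv);
        return v + 2.0f * (q.w * uv + uuv);
    }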