From 7563a8bc4db0ffabceef4210a7b4deb10c17e2d0 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Tue, 23 Apr 2013 15:36:12 +1000
Subject: [PATCH 1/9] Add initial implementation of SIMD optimized quaternions.

A few things here can probably be improved by people a lot smarter than me,
but for the most part things are generally faster. A few notes:

- A fquatSIMD can be converted to a fmat4x4SIMD using mat4SIMD_cast().
- A tquat can be converted to a fquatSIMD using quatSIMD_cast().
- Some functions are virtually the same as their scalar counterparts because
  I've just not been able to get them faster.
- Only the basic functions are implemented. Future plans include fast,
  approximate normalize, length and mix/slerp functions.
---
 glm/gtx/simd_quat.hpp | 291 +++++++++++++++++++++++
 glm/gtx/simd_quat.inl | 532 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 823 insertions(+)
 create mode 100644 glm/gtx/simd_quat.hpp
 create mode 100644 glm/gtx/simd_quat.inl

diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp
new file mode 100644
index 00000000..962b9413
--- /dev/null
+++ b/glm/gtx/simd_quat.hpp
@@ -0,0 +1,291 @@
+///////////////////////////////////////////////////////////////////////////////////
+/// OpenGL Mathematics (glm.g-truc.net)
+///
+/// Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net)
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in
+/// all copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+/// THE SOFTWARE.
+///
+/// @ref gtx_simd_quat
+/// @file glm/gtx/simd_quat.hpp
+/// @date 2009-05-07 / 2011-06-07
+/// @author Christophe Riccio
+///
+/// @see core (dependence)
+///
+/// @defgroup gtx_simd_quat GLM_GTX_simd_quat
+/// @ingroup gtx
+///
+/// @brief SIMD implementation of quat type.
+///
+/// <glm/gtx/simd_quat.hpp> need to be included to use these functionalities.
+///////////////////////////////////////////////////////////////////////////////////
+
+#ifndef GLM_GTX_simd_quat
+#define GLM_GTX_simd_quat GLM_VERSION
+
+// Dependency:
+#include "../glm.hpp"
+#include "../gtc/quaternion.hpp"
+
+#if(GLM_ARCH != GLM_ARCH_PURE)
+
+#if(GLM_ARCH & GLM_ARCH_SSE2)
+#   include "../core/intrinsic_common.hpp"
+#   include "../core/intrinsic_geometric.hpp"
+#   include "../gtx/simd_mat4.hpp"
+#else
+#   error "GLM: GLM_GTX_simd_quat requires compiler support of SSE2 through intrinsics"
+#endif
+
+#if(defined(GLM_MESSAGES) && !defined(glm_ext))
+#   pragma message("GLM: GLM_GTX_simd_quat extension included")
+#endif
+
+
+// Warning silencer for nameless struct/union.
+#if (GLM_COMPILER & GLM_COMPILER_VC)
+#   pragma warning(push)
+#   pragma warning(disable:4201)   // warning C4201: nonstandard extension used : nameless struct/union
+#endif
+
+
+namespace glm{
+namespace detail
+{
+    /// Quaternion implemented using SIMD SSE intrinsics.
+    /// \ingroup gtx_simd_quat
+    GLM_ALIGNED_STRUCT(16) fquatSIMD
+    {
+        enum ctor{null};
+        typedef __m128 value_type;
+        typedef std::size_t size_type;
+        static size_type value_size();
+
+        typedef fquatSIMD type;
+        typedef tquat<bool> bool_type;
+
+#ifdef GLM_SIMD_ENABLE_XYZW_UNION
+        union
+        {
+            __m128 Data;
+            struct {float x, y, z, w;};
+        };
+#else
+        __m128 Data;
+#endif
+
+        //////////////////////////////////////
+        // Implicit basic constructors
+
+        fquatSIMD();
+        fquatSIMD(__m128 const & Data);
+        fquatSIMD(fquatSIMD const & q);
+
+        //////////////////////////////////////
+        // Explicit basic constructors
+
+        explicit fquatSIMD(
+            ctor);
+        explicit fquatSIMD(
+            float const & w,
+            float const & x,
+            float const & y,
+            float const & z);
+        explicit fquatSIMD(
+            quat const & v);
+        explicit fquatSIMD(
+            vec3 const & eulerAngles);
+
+
+        //////////////////////////////////////
+        // Unary arithmetic operators
+
+        fquatSIMD& operator =(fquatSIMD const & q);
+        fquatSIMD& operator*=(float const & s);
+        fquatSIMD& operator/=(float const & s);
+    };
+
+
+    //////////////////////////////////////
+    // Arithmetic operators
+
+    detail::fquatSIMD operator- (
+        detail::fquatSIMD const & q);
+
+    detail::fquatSIMD operator+ (
+        detail::fquatSIMD const & q,
+        detail::fquatSIMD const & p);
+
+    detail::fquatSIMD operator* (
+        detail::fquatSIMD const & q,
+        detail::fquatSIMD const & p);
+
+    detail::fvec4SIMD operator* (
+        detail::fquatSIMD const & q,
+        detail::fvec4SIMD const & v);
+
+    detail::fvec4SIMD operator* (
+        detail::fvec4SIMD const & v,
+        detail::fquatSIMD const & q);
+
+    detail::fquatSIMD operator* (
+        detail::fquatSIMD const & q,
+        float s);
+
+    detail::fquatSIMD operator* (
+        float s,
+        detail::fquatSIMD const & q);
+
+    detail::fquatSIMD operator/ (
+        detail::fquatSIMD const & q,
+        float s);
+
+}//namespace detail
+
+    typedef glm::detail::fquatSIMD simdQuat;
+
+    /// @addtogroup gtx_simd_quat
+    /// @{
+
+    //! Convert a simdQuat to a quat.
+    //! (From GLM_GTX_simd_quat extension)
+    quat quat_cast(
+        detail::fquatSIMD const & x);
+
+    //! Convert a simdMat4 to a simdQuat.
+    //! (From GLM_GTX_simd_quat extension)
+    detail::fquatSIMD quatSIMD_cast(
+        detail::fmat4x4SIMD const & m);
+
+    //! Convert a simdQuat to a simdMat4.
+    //! (From GLM_GTX_simd_quat extension)
+    detail::fmat4x4SIMD mat4SIMD_cast(
+        detail::fquatSIMD const & q);
+
+
+    /// Returns the length of the quaternion.
+    ///
+    /// @see gtc_quaternion
+    float length(
+        detail::fquatSIMD const & x);
+
+    /// Returns the normalized quaternion.
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD normalize(
+        detail::fquatSIMD const & x);
+
+    /// Returns dot product of q1 and q2, i.e., q1[0] * q2[0] + q1[1] * q2[1] + ...
+    ///
+    /// @see gtc_quaternion
+    float dot(
+        detail::fquatSIMD const & q1,
+        detail::fquatSIMD const & q2);
+
+    /// Spherical linear interpolation of two quaternions.
+    /// The interpolation is oriented and the rotation is performed at constant speed.
+    /// For short path spherical linear interpolation, use the slerp function.
+    ///
+    /// @param x A quaternion
+    /// @param y A quaternion
+    /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1].
+    /// @tparam T Value type used to build the quaternion. Supported: half, float or double.
+    /// @see gtc_quaternion
+    /// @see - slerp(detail::tquat<T> const & x, detail::tquat<T> const & y, T const & a)
+    detail::fquatSIMD mix(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+    /// Linear interpolation of two quaternions.
+    /// The interpolation is oriented.
+    ///
+    /// @param x A quaternion
+    /// @param y A quaternion
+    /// @param a Interpolation factor. The interpolation is defined in the range [0, 1].
+    /// @tparam T Value type used to build the quaternion. Supported: half, float or double.
+    /// @see gtc_quaternion
+    detail::fquatSIMD lerp(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+    /// Spherical linear interpolation of two quaternions.
+    /// The interpolation always takes the short path and the rotation is performed at constant speed.
+    ///
+    /// @param x A quaternion
+    /// @param y A quaternion
+    /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1].
+    /// @tparam T Value type used to build the quaternion. Supported: half, float or double.
+    /// @see gtc_quaternion
+    detail::fquatSIMD slerp(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+    /// Returns the q conjugate.
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD conjugate(
+        detail::fquatSIMD const & q);
+
+    /// Returns the q inverse.
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD inverse(
+        detail::fquatSIMD const & q);
+
+    /// Build a quaternion from an angle and a normalized axis.
+    ///
+    /// @param angle Angle expressed in radians if GLM_FORCE_RADIANS is defined or degrees otherwise.
+    /// @param axis Axis of the quaternion, must be normalized.
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD angleAxisSIMD(
+        float const & angle,
+        vec3 const & axis);
+
+    /// Build a quaternion from an angle and a normalized axis.
+    ///
+    /// @param angle Angle expressed in radians if GLM_FORCE_RADIANS is defined or degrees otherwise.
+    /// @param x x component of the axis, x, y, z must be a normalized axis
+    /// @param y y component of the axis, x, y, z must be a normalized axis
+    /// @param z z component of the axis, x, y, z must be a normalized axis
+    ///
+    /// @see gtc_quaternion
+    detail::fquatSIMD angleAxisSIMD(
+        float const & angle,
+        float const & x,
+        float const & y,
+        float const & z);
+
+
+    /// @}
+}//namespace glm
+
+#include "simd_quat.inl"
+
+
+#if (GLM_COMPILER & GLM_COMPILER_VC)
+#   pragma warning(pop)
+#endif
+
+
+#endif//(GLM_ARCH != GLM_ARCH_PURE)
+
+#endif//GLM_GTX_simd_quat
diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
new file mode 100644
index 00000000..f9df7a94
--- /dev/null
+++ b/glm/gtx/simd_quat.inl
@@ -0,0 +1,532 @@
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// OpenGL Mathematics Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net)
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Created : 2013-04-22
+// Updated : 2013-04-22
+// Licence : This source is under MIT License
+// File    : glm/gtx/simd_quat.inl
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+namespace glm{
+namespace detail{
+
+
+//////////////////////////////////////
+// Debugging
+#if 0
+void print(__m128 v)
+{
+    GLM_ALIGN(16) float result[4];
+    _mm_store_ps(result, v);
+
+    printf("__m128:    %f %f %f %f\n", result[0], result[1], result[2], result[3]);
+}
+
+void print(const fvec4SIMD &v)
+{
+    printf("fvec4SIMD: %f %f %f %f\n", v.x, v.y, v.z, v.w);
+}
+#endif
+
+
+//////////////////////////////////////
+// Implicit basic constructors
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD()
+#ifdef GLM_SIMD_ENABLE_DEFAULT_INIT
+    : Data(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f))
+#endif
+{}
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(__m128 const & Data) :
+    Data(Data)
+{}
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(fquatSIMD const & q) :
+    Data(q.Data)
+{}
+
+
+//////////////////////////////////////
+// Explicit basic constructors
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(float const & w, float const & x, float const & y, float const & z) :
+    Data(_mm_set_ps(w, z, y, x))
+{}
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(quat const & q) :
+    Data(_mm_set_ps(q.w, q.z, q.y, q.x))
+{}
+
+GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(vec3 const & eulerAngles)
+{
+    vec3 c = glm::cos(eulerAngles * 0.5f);
+    vec3 s = glm::sin(eulerAngles * 0.5f);
+
+    Data = _mm_set_ps(
+        (c.x * c.y * c.z) + (s.x * s.y * s.z),
+        (c.x * c.y * s.z) - (s.x * s.y * c.z),
+        (c.x * s.y * c.z) + (s.x * c.y * s.z),
+        (s.x * c.y * c.z) - (c.x * s.y * s.z));
+}
+
+
+//////////////////////////////////////
+// Unary arithmetic operators
+
+GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator=(fquatSIMD const & q)
+{
+    this->Data = q.Data;
+    return *this;
+}
+
+GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator*=(float const & s)
+{
+    this->Data = _mm_mul_ps(this->Data, _mm_set_ps1(s));
+    return *this;
+}
+
+GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator/=(float const & s)
+{
+    this->Data = _mm_div_ps(Data, _mm_set1_ps(s));
+    return *this;
+}
+
+
+
+// negate operator
+GLM_FUNC_QUALIFIER fquatSIMD operator- (fquatSIMD const & q)
+{
+    return fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(-1.0f, -1.0f, -1.0f, -1.0f)));
+}
+
+// operator+
+GLM_FUNC_QUALIFIER fquatSIMD operator+ (fquatSIMD const & q1, fquatSIMD const & q2)
+{
+    return fquatSIMD(_mm_add_ps(q1.Data, q2.Data));
+}
+
+//operator*
+GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & q2)
+{
+    // SSE2 STATS:
+    //    11 shuffle
+    //    8 mul
+    //    8 add
+
+    // SSE3 STATS:
+    //    3 shuffle
+    //    8 mul
+    //    8 add
+
+    // SSE4 STATS:
+    //    3 shuffle
+    //    8 mul
+    //    4 dpps
+
+    __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
+    __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
+    __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
+    __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
+
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+
+
+#   if((GLM_ARCH & GLM_ARCH_SSE4))
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff);
+#   elif((GLM_ARCH & GLM_ARCH_SSE3))
+    __m128 add0 = _mm_hadd_ps(mul0, mul0);
+           add0 = _mm_hadd_ps(add0, add0);
+    __m128 add1 = _mm_hadd_ps(mul1, mul1);
+           add1 = _mm_hadd_ps(add1, add1);
+    __m128 add2 = _mm_hadd_ps(mul2, mul2);
+           add2 = _mm_hadd_ps(add2, add2);
+    __m128 add3 = _mm_hadd_ps(mul3, mul3);
+           add3 = _mm_hadd_ps(add3, add3);
+#   else
+    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
+           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
+           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
+           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
+           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+#endif
+
+
+    // I had tried something clever here using shuffles to produce the final result, but it turns out that using
+    // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case.
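+    // (Editorial note: each addN register now holds one scalar component of the
+    // product in its lowest lane - add0 = w, add1 = x, add2 = y, add3 = z - which
+    // is what the _mm_store_ss() calls below pull out.)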
+
+    float w;
+    float x;
+    float y;
+    float z;
+
+    _mm_store_ss(&w, add0);
+    _mm_store_ss(&x, add1);
+    _mm_store_ss(&y, add2);
+    _mm_store_ss(&z, add3);
+
+    return detail::fquatSIMD(w, x, y, z);
+
+
+    //return _mm_shuffle_ps(_mm_shuffle_ps(add1, add2, 0),
+    //                      _mm_shuffle_ps(add3, add0, 0),
+    //                      _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)
+{
+    __m128 uv  = detail::sse_xpd_ps(q.Data, v.Data);
+    __m128 uuv = detail::sse_xpd_ps(q.Data, uv);
+
+    uv  = _mm_mul_ps(uv,  _mm_mul_ps(_mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)), _mm_set1_ps(2.0f)));
+    uuv = _mm_mul_ps(uuv, _mm_set1_ps(2.0f));
+
+    return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
+}
+
+GLM_FUNC_QUALIFIER fvec4SIMD operator* (fvec4SIMD const & v, fquatSIMD const & q)
+{
+    return inverse(q) * v;
+}
+
+GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q, float s)
+{
+    return fquatSIMD(_mm_mul_ps(q.Data, _mm_set1_ps(s)));
+}
+
+GLM_FUNC_QUALIFIER fquatSIMD operator* (float s, fquatSIMD const & q)
+{
+    return fquatSIMD(_mm_mul_ps(_mm_set1_ps(s), q.Data));
+}
+
+
+//operator/
+GLM_FUNC_QUALIFIER fquatSIMD operator/ (fquatSIMD const & q, float s)
+{
+    return fquatSIMD(_mm_div_ps(q.Data, _mm_set1_ps(s)));
+}
+
+
+}//namespace detail
+
+
+GLM_FUNC_QUALIFIER quat quat_cast
+(
+    detail::fquatSIMD const & x
+)
+{
+    GLM_ALIGN(16) quat Result;
+    _mm_store_ps(&Result[0], x.Data);
+    return Result;
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
+(
+    detail::fmat4x4SIMD const & m
+)
+{
+    // Scalar implementation for now.
+
+    GLM_ALIGN(16) float m0[4];
+    GLM_ALIGN(16) float m1[4];
+    GLM_ALIGN(16) float m2[4];
+    GLM_ALIGN(16) float m3[4];
+
+    _mm_store_ps(m0, m[0].Data);
+    _mm_store_ps(m1, m[1].Data);
+    _mm_store_ps(m2, m[2].Data);
+    _mm_store_ps(m3, m[3].Data);
+
+
+    float trace = m0[0] + m1[1] + m2[2] + 1.0f;
+    if (trace > 0)
+    {
+        float s = 0.5f / sqrt(trace);
+
+        return _mm_set_ps(
+            0.25f / s,
+            (m0[1] - m1[0]) * s,
+            (m2[0] - m0[2]) * s,
+            (m1[2] - m2[1]) * s);
+    }
+    else
+    {
+        if (m0[0] > m1[1])
+        {
+            if (m0[0] > m2[2])
+            {
+                // X is biggest.
+                float s = sqrt(m0[0] - m1[1] - m2[2] + 1.0f) * 0.5f;
+
+                return _mm_set_ps(
+                    (m1[2] - m2[1]) * s,
+                    (m2[0] + m0[2]) * s,
+                    (m0[1] + m1[0]) * s,
+                    0.5f * s);
+            }
+        }
+        else
+        {
+            if (m1[1] > m2[2])
+            {
+                // Y is biggest.
+                float s = sqrt(m1[1] - m0[0] - m2[2] + 1.0f) * 0.5f;
+
+                return _mm_set_ps(
+                    (m2[0] - m0[2]) * s,
+                    (m1[2] + m2[1]) * s,
+                    0.5f * s,
+                    (m0[1] + m1[0]) * s);
+            }
+        }
+
+        // Z is biggest.
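+        // (Editorial note: branching on the largest diagonal element keeps the
+        // argument of sqrt() well away from zero, so the divisions by s below
+        // stay numerically stable.)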
+        float s = sqrt(m2[2] - m0[0] - m1[1] + 1.0f) * 0.5f;
+
+        return _mm_set_ps(
+            (m0[1] - m1[0]) * s,
+            0.5f * s,
+            (m1[2] + m2[1]) * s,
+            (m2[0] + m0[2]) * s);
+    }
+}
+
+
+GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
+(
+    detail::fquatSIMD const & q
+)
+{
+    detail::fmat4x4SIMD result;
+
+    __m128 _wwww = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
+    __m128 _xyzw = q.Data;
+    __m128 _zxyw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
+    __m128 _yzxw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));
+
+    __m128 _xyzw2 = _mm_add_ps(_xyzw, _xyzw);
+    __m128 _zxyw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 1, 0, 2));
+    __m128 _yzxw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 0, 2, 1));
+
+    __m128 _tmp0 = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_yzxw2, _yzxw));
+           _tmp0 = _mm_sub_ps(_tmp0, _mm_mul_ps(_zxyw2, _zxyw));
+
+    __m128 _tmp1 = _mm_mul_ps(_yzxw2, _xyzw);
+           _tmp1 = _mm_add_ps(_tmp1, _mm_mul_ps(_zxyw2, _wwww));
+
+    __m128 _tmp2 = _mm_mul_ps(_zxyw2, _xyzw);
+           _tmp2 = _mm_sub_ps(_tmp2, _mm_mul_ps(_yzxw2, _wwww));
+
+
+    // There's probably a better, more politically correct way of doing this...
+    result[0].Data = _mm_set_ps(
+        0.0f,
+        reinterpret_cast<float*>(&_tmp2)[0],
+        reinterpret_cast<float*>(&_tmp1)[0],
+        reinterpret_cast<float*>(&_tmp0)[0]);
+
+    result[1].Data = _mm_set_ps(
+        0.0f,
+        reinterpret_cast<float*>(&_tmp1)[1],
+        reinterpret_cast<float*>(&_tmp0)[1],
+        reinterpret_cast<float*>(&_tmp2)[1]);
+
+    result[2].Data = _mm_set_ps(
+        0.0f,
+        reinterpret_cast<float*>(&_tmp0)[2],
+        reinterpret_cast<float*>(&_tmp2)[2],
+        reinterpret_cast<float*>(&_tmp1)[2]);
+
+    result[3].Data = _mm_set_ps(
+        1.0f,
+        0.0f,
+        0.0f,
+        0.0f);
+
+
+    return result;
+}
+
+
+
+GLM_FUNC_QUALIFIER float length
+(
+    detail::fquatSIMD const & q
+)
+{
+    return glm::sqrt(dot(q, q));
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD normalize
+(
+    detail::fquatSIMD const & q
+)
+{
+    return _mm_mul_ps(q.Data, _mm_set1_ps(1.0f / length(q)));
+}
+
+GLM_FUNC_QUALIFIER float dot
+(
+    detail::fquatSIMD const & q1,
+    detail::fquatSIMD const & q2
+)
+{
+    float result;
+    _mm_store_ss(&result, detail::sse_dot_ps(q1.Data, q2.Data));
+
+    return result;
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD mix
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    float cosTheta = dot(x, y);
+
+    if (cosTheta > 1.0f - glm::epsilon<float>())
+    {
+        return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
+    }
+    else
+    {
+        float angle = glm::acos(cosTheta);
+
+
+        // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation
+        // will most likely be faster, but that'll need to be left to people much smarter than I.
+        //
+        // The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of
+        // glm::sin(). Maybe a fastMix() function would be better for that?
+
+        float s0 = glm::sin((1.0f - a) * angle);
+        float s1 = glm::sin(a * angle);
+        float d  = 1.0f / glm::sin(angle);
+
+        return (s0 * x + s1 * y) * d;
+
+
+        //__m128 s0 = _mm_set1_ps(glm::sin((1.0f - a) * angle));
+        //__m128 s1 = _mm_set1_ps(glm::sin(a * angle));
+        //__m128 d  = _mm_set1_ps(1.0f / glm::sin(angle));
+        //
+        //return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
+    }
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD lerp
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    // Lerp is only defined in [0, 1]
+    assert(a >= 0.0f);
+    assert(a <= 1.0f);
+
+    return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    detail::fquatSIMD z = y;
+
+    float cosTheta = dot(x, y);
+
+    // If cosTheta < 0, the interpolation will take the long way around the sphere.
+    // To fix this, one quat must be negated.
+    if (cosTheta < 0.0f)
+    {
+        z        = -y;
+        cosTheta = -cosTheta;
+    }
+
+    // Perform a linear interpolation when cosTheta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator
+    if(cosTheta > 1.0f - epsilon<float>())
+    {
+        return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(z.Data, x.Data)));
+    }
+    else
+    {
+        float angle = glm::acos(cosTheta);
+
+
+        // See mix() above for an explanation of the rationale behind this non-SSE implementation.
+
+        float s0 = glm::sin((1.0f - a) * angle);
+        float s1 = glm::sin(a * angle);
+        float d  = 1.0f / glm::sin(angle);
+
+        return (s0 * x + s1 * z) * d;
+    }
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate
+(
+    detail::fquatSIMD const & q
+)
+{
+    return detail::fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)));
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD inverse
+(
+    detail::fquatSIMD const & q
+)
+{
+    return conjugate(q) / dot(q, q);
+}
+
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
+(
+    float const & angle,
+    vec3 const & v
+)
+{
+#ifdef GLM_FORCE_RADIANS
+    float a(angle);
+#else
+    float a(glm::radians(angle));
+#endif
+    float s = glm::sin(a * 0.5f);
+
+    return _mm_set_ps(
+        glm::cos(a * 0.5f),
+        v.z * s,
+        v.y * s,
+        v.x * s);
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
+(
+    float const & angle,
+    float const & x,
+    float const & y,
+    float const & z
+)
+{
+    return angleAxisSIMD(angle, vec3(x, y, z));
+}
+
+
+}//namespace glm

From 197b6c96d8a068fcd1002b0731def6bcbb2af5fc Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 07:58:41 +1000
Subject: [PATCH 2/9] Improve efficiency of operator*(fquatSIMD, fquatSIMD) in
 SSE4 mode.

Now only requires 3 shuffle, 4 mul and 4 dpps.
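The trick is that _mm_dp_ps() can apply the sign flips itself: rather than
multiplying by a +/-1 mask and then dotting against a vector of ones, the mask
becomes the second dpps operand. For example, for the w component:

    // dot(mul0, (w=1, z=-1, y=-1, x=-1)) == w1*w2 - x1*x2 - y1*y2 - z1*z2,
    // i.e. the w component of the quaternion product in a single instruction.
    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);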
---
 glm/gtx/simd_quat.inl | 57 ++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index f9df7a94..25848a57 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -122,7 +122,7 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
 
     // SSE4 STATS:
     //    3 shuffle
-    //    8 mul
+    //    4 mul
     //    4 dpps
 
     __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
@@ -130,35 +130,36 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
     __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
 
-    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
-    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
-    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
-    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
-
-
 #   if((GLM_ARCH & GLM_ARCH_SSE4))
-    __m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff);
-    __m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff);
-    __m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff);
-    __m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff);
-#   elif((GLM_ARCH & GLM_ARCH_SSE3))
-    __m128 add0 = _mm_hadd_ps(mul0, mul0);
-           add0 = _mm_hadd_ps(add0, add0);
-    __m128 add1 = _mm_hadd_ps(mul1, mul1);
-           add1 = _mm_hadd_ps(add1, add1);
-    __m128 add2 = _mm_hadd_ps(mul2, mul2);
-           add2 = _mm_hadd_ps(add2, add2);
-    __m128 add3 = _mm_hadd_ps(mul3, mul3);
-           add3 = _mm_hadd_ps(add3, add3);
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
 #   else
-    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
-           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
-    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
-           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
-    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
-           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
-    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
-           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+
+#   if((GLM_ARCH & GLM_ARCH_SSE3))
+    __m128 add0 = _mm_hadd_ps(mul0, mul0);
+           add0 = _mm_hadd_ps(add0, add0);
+    __m128 add1 = _mm_hadd_ps(mul1, mul1);
+           add1 = _mm_hadd_ps(add1, add1);
+    __m128 add2 = _mm_hadd_ps(mul2, mul2);
+           add2 = _mm_hadd_ps(add2, add2);
+    __m128 add3 = _mm_hadd_ps(mul3, mul3);
+           add3 = _mm_hadd_ps(add3, add3);
+#   else
+    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
+           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
+           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
+           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
+           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+#   endif
 #endif

From 13837e107976e390515ace747362131e303c10f8 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 08:51:17 +1000
Subject: [PATCH 3/9] Remove the SSE3 implementation in operator*(fquatSIMD,
 fquatSIMD).

The SSE2 version is now running faster than the SSE3 version.
---
 glm/gtx/simd_quat.inl | 45 +++++++++++++++----------------------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 25848a57..9502a6ab 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -115,11 +115,6 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     //    8 mul
     //    8 add
 
-    // SSE3 STATS:
-    //    3 shuffle
-    //    8 mul
-    //    8 add
-
     // SSE4 STATS:
     //    3 shuffle
     //    4 mul
@@ -136,33 +131,23 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
     __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
 #   else
-    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
-    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
-    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
-    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
-
-#   if((GLM_ARCH & GLM_ARCH_SSE3))
-    __m128 add0 = _mm_hadd_ps(mul0, mul0);
-           add0 = _mm_hadd_ps(add0, add0);
-    __m128 add1 = _mm_hadd_ps(mul1, mul1);
-           add1 = _mm_hadd_ps(add1, add1);
-    __m128 add2 = _mm_hadd_ps(mul2, mul2);
-           add2 = _mm_hadd_ps(add2, add2);
-    __m128 add3 = _mm_hadd_ps(mul3, mul3);
-           add3 = _mm_hadd_ps(add3, add3);
-#   else
-    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
-           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
-    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
-           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
-    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
-           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
-    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
-           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
-#   endif
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
+    add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
+    add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
+    add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
+    add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
 #endif
 
-
     // I had tried something clever here using shuffles to produce the final result, but it turns out that using
     // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case.

From 942bf08fe30598d415a6d0bb1620e81b9fff6523 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 09:39:22 +1000
Subject: [PATCH 4/9] Add the ability to convert standard mat4s and mat3s to
 SIMD quats.
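
A rough usage sketch (hypothetical variable names; uses the new quatSIMD_cast()
overloads from this patch plus quat_cast() from patch 1):

    glm::mat4 m = glm::mat4(1.0f);                    // any rotation matrix
    glm::detail::fquatSIMD q = glm::quatSIMD_cast(m); // standard mat4 -> SIMD quat
    glm::quat back = glm::quat_cast(q);               // and back to a plain quat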
---
 glm/gtx/simd_quat.hpp | 14 +++++-
 glm/gtx/simd_quat.inl | 99 ++++++++++++++++++++++++++-----------------
 2 files changed, 73 insertions(+), 40 deletions(-)

diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp
index 962b9413..cce5ee61 100644
--- a/glm/gtx/simd_quat.hpp
+++ b/glm/gtx/simd_quat.hpp
@@ -77,7 +77,7 @@ namespace detail
         static size_type value_size();
 
         typedef fquatSIMD type;
-        typedef tquat<bool> bool_type;
+        typedef tquat<bool> bool_type;
 
 #ifdef GLM_SIMD_ENABLE_XYZW_UNION
         union
@@ -172,6 +172,18 @@ namespace detail
     detail::fquatSIMD quatSIMD_cast(
         detail::fmat4x4SIMD const & m);
 
+    //! Converts a mat4 to a simdQuat.
+    //! (From GLM_GTX_simd_quat extension)
+    template <typename T>
+    detail::fquatSIMD quatSIMD_cast(
+        detail::tmat4x4<T> const & m);
+
+    //! Converts a mat3 to a simdQuat.
+    //! (From GLM_GTX_simd_quat extension)
+    template <typename T>
+    detail::fquatSIMD quatSIMD_cast(
+        detail::tmat3x3<T> const & m);
+
     //! Convert a simdQuat to a simdMat4.
     //! (From GLM_GTX_simd_quat extension)
     detail::fmat4x4SIMD mat4SIMD_cast(
diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 9502a6ab..90b9a730 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -214,37 +214,23 @@ GLM_FUNC_QUALIFIER quat quat_cast
 {
     GLM_ALIGN(16) quat Result;
     _mm_store_ps(&Result[0], x.Data);
+
     return Result;
 }
 
-GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
-(
-    detail::fmat4x4SIMD const & m
-)
+template <typename T>
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast_impl(const T m0[], const T m1[], const T m2[])
 {
-    // Scalar implementation for now.
-
-    GLM_ALIGN(16) float m0[4];
-    GLM_ALIGN(16) float m1[4];
-    GLM_ALIGN(16) float m2[4];
-    GLM_ALIGN(16) float m3[4];
-
-    _mm_store_ps(m0, m[0].Data);
-    _mm_store_ps(m1, m[1].Data);
-    _mm_store_ps(m2, m[2].Data);
-    _mm_store_ps(m3, m[3].Data);
-
-
-    float trace = m0[0] + m1[1] + m2[2] + 1.0f;
-    if (trace > 0)
+    T trace = m0[0] + m1[1] + m2[2] + T(1.0);
+    if (trace > T(0))
     {
-        float s = 0.5f / sqrt(trace);
+        T s = T(0.5) / sqrt(trace);
 
         return _mm_set_ps(
-            0.25f / s,
-            (m0[1] - m1[0]) * s,
-            (m2[0] - m0[2]) * s,
-            (m1[2] - m2[1]) * s);
+            static_cast<float>(T(0.25) / s),
+            static_cast<float>((m0[1] - m1[0]) * s),
+            static_cast<float>((m2[0] - m0[2]) * s),
+            static_cast<float>((m1[2] - m2[1]) * s));
     }
     else
     {
@@ -253,13 +239,13 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
             if (m0[0] > m2[2])
             {
                 // X is biggest.
-                float s = sqrt(m0[0] - m1[1] - m2[2] + 1.0f) * 0.5f;
+                T s = sqrt(m0[0] - m1[1] - m2[2] + T(1.0)) * T(0.5);
 
                 return _mm_set_ps(
-                    (m1[2] - m2[1]) * s,
-                    (m2[0] + m0[2]) * s,
-                    (m0[1] + m1[0]) * s,
-                    0.5f * s);
+                    static_cast<float>((m1[2] - m2[1]) * s),
+                    static_cast<float>((m2[0] + m0[2]) * s),
+                    static_cast<float>((m0[1] + m1[0]) * s),
+                    static_cast<float>(T(0.5) * s));
             }
         }
         else
@@ -267,27 +253,62 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
             if (m1[1] > m2[2])
             {
                 // Y is biggest.
-                float s = sqrt(m1[1] - m0[0] - m2[2] + 1.0f) * 0.5f;
+                T s = sqrt(m1[1] - m0[0] - m2[2] + T(1.0)) * T(0.5);
 
                 return _mm_set_ps(
-                    (m2[0] - m0[2]) * s,
-                    (m1[2] + m2[1]) * s,
-                    0.5f * s,
-                    (m0[1] + m1[0]) * s);
+                    static_cast<float>((m2[0] - m0[2]) * s),
+                    static_cast<float>((m1[2] + m2[1]) * s),
+                    static_cast<float>(T(0.5) * s),
+                    static_cast<float>((m0[1] + m1[0]) * s));
             }
         }
 
         // Z is biggest.
-        float s = sqrt(m2[2] - m0[0] - m1[1] + 1.0f) * 0.5f;
+        T s = sqrt(m2[2] - m0[0] - m1[1] + T(1.0)) * T(0.5);
 
         return _mm_set_ps(
-            (m0[1] - m1[0]) * s,
-            0.5f * s,
-            (m1[2] + m2[1]) * s,
-            (m2[0] + m0[2]) * s);
+            static_cast<float>((m0[1] - m1[0]) * s),
+            static_cast<float>(T(0.5) * s),
+            static_cast<float>((m1[2] + m2[1]) * s),
+            static_cast<float>((m2[0] + m0[2]) * s));
     }
 }
 
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
+(
+    detail::fmat4x4SIMD const & m
+)
+{
+    // Scalar implementation for now.
+    GLM_ALIGN(16) float m0[4];
+    GLM_ALIGN(16) float m1[4];
+    GLM_ALIGN(16) float m2[4];
+
+    _mm_store_ps(m0, m[0].Data);
+    _mm_store_ps(m1, m[1].Data);
+    _mm_store_ps(m2, m[2].Data);
+
+    return quatSIMD_cast_impl(m0, m1, m2);
+}
+
+template <typename T>
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
+(
+    detail::tmat4x4<T> const & m
+)
+{
+    return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]);
+}
+
+template <typename T>
+GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
+(
+    detail::tmat3x3<T> const & m
+)
+{
+    return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]);
+}
+
 
 GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
 (

From d07496460a39d7a94846e6e8eb040bd4d3e8f698 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 09:54:28 +1000
Subject: [PATCH 5/9] Add the ability to convert a SIMD quat to a standard
 mat4.

---
 glm/gtx/simd_quat.hpp | 5 +++++
 glm/gtx/simd_quat.inl | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp
index cce5ee61..b60b5650 100644
--- a/glm/gtx/simd_quat.hpp
+++ b/glm/gtx/simd_quat.hpp
@@ -189,6 +189,11 @@ namespace detail
     detail::fmat4x4SIMD mat4SIMD_cast(
         detail::fquatSIMD const & q);
 
+    //! Converts a simdQuat to a standard mat4.
+    //! (From GLM_GTX_simd_quat extension)
+    mat4 mat4_cast(
+        detail::fquatSIMD const & q);
+
 
     /// Returns the length of the quaternion.
     ///
diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 90b9a730..d4fc4f3a 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -365,6 +365,14 @@ GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
     return result;
 }
 
+GLM_FUNC_QUALIFIER mat4 mat4_cast
+(
+    detail::fquatSIMD const & q
+)
+{
+    return mat4_cast(mat4SIMD_cast(q));
+}
+
 
 
 GLM_FUNC_QUALIFIER float length

From c1006718b354e4c354bb520f454c75a53e90a090 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 13:55:38 +1000
Subject: [PATCH 6/9] Add fastMix() and fastSlerp() implementations.

These have stricter pre-conditions than standard mix() and slerp():

1) Input quaternions must be unit length.
2) The interpolation factor (a) must be in the range [0, 1].

Neither of these restrictions should be too bad. The reason for them is that
the implementations use fastAcos() and fastSin(), both of which have a
limited allowable range.

In my contrived tests, I observed about a 10x improvement over the standard
versions. This is mostly because of the faster acos/sin operations. The
fastSin(__m128) implementation also helps here because it can do four
fastSin() operations simultaneously using SSE (mix() and slerp() each need
three).
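
A usage sketch under those pre-conditions (hypothetical values):

    glm::simdQuat q0 = glm::normalize(glm::simdQuat(1.0f, 2.0f, 3.0f, 4.0f)); // unit length
    glm::simdQuat q1 = glm::normalize(glm::simdQuat(4.0f, 3.0f, 2.0f, 1.0f)); // unit length
    glm::simdQuat q  = glm::fastMix(q0, q1, 0.5f);                            // a in [0, 1]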
---
 glm/gtx/simd_quat.hpp | 37 +++++++++++++++++-
 glm/gtx/simd_quat.inl | 88 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 121 insertions(+), 4 deletions(-)

diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp
index b60b5650..a3921cb3 100644
--- a/glm/gtx/simd_quat.hpp
+++ b/glm/gtx/simd_quat.hpp
@@ -41,6 +41,7 @@
 // Dependency:
 #include "../glm.hpp"
 #include "../gtc/quaternion.hpp"
+#include "../gtx/fast_trigonometry.hpp"
 
 #if(GLM_ARCH != GLM_ARCH_PURE)
 
@@ -223,7 +224,7 @@ namespace detail
     /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1].
     /// @tparam T Value type used to build the quaternion. Supported: half, float or double.
     /// @see gtc_quaternion
-    /// @see - slerp(detail::tquat<T> const & x, detail::tquat<T> const & y, T const & a)
+    /// @see - slerp(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a)
     detail::fquatSIMD mix(
         detail::fquatSIMD const & x,
         detail::fquatSIMD const & y,
@@ -255,6 +256,35 @@ namespace detail
         detail::fquatSIMD const & y,
         float const & a);
 
+
+    /// Faster spherical linear interpolation of two unit length quaternions.
+    ///
+    /// This is the same as mix(), except for two rules:
+    ///    1) The two quaternions must be unit length.
+    ///    2) The interpolation factor (a) must be in the range [0, 1].
+    ///
+    /// This will use the equivalent of fastAcos() and fastSin().
+    ///
+    /// @see gtc_quaternion
+    /// @see - mix(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a)
+    detail::fquatSIMD fastMix(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+    /// Identical to fastMix() except takes the shortest path.
+    ///
+    /// The same rules apply here as those in fastMix(). Both quaternions must be unit length and 'a' must be
+    /// in the range [0, 1].
+    ///
+    /// @see - fastMix(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a)
+    /// @see - slerp(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a)
+    detail::fquatSIMD fastSlerp(
+        detail::fquatSIMD const & x,
+        detail::fquatSIMD const & y,
+        float const & a);
+
+
     /// Returns the q conjugate.
     ///
     /// @see gtc_quaternion
@@ -292,6 +322,11 @@ namespace detail
         float const & z);
 
 
+    // TODO: Move this to somewhere more appropriate. Used with fastMix() and fastSlerp().
+    /// Performs the equivalent of glm::fastSin() on each component of the given __m128.
+    __m128 fastSin(__m128 x);
+
+
     /// @}
 }//namespace glm
 
diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index d4fc4f3a..3ff493fb 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -423,9 +423,6 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD mix
 
         // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation
         // will most likely be faster, but that'll need to be left to people much smarter than I.
-        //
-        // The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of
-        // glm::sin(). Maybe a fastMix() function would be better for that?
 
         float s0 = glm::sin((1.0f - a) * angle);
         float s1 = glm::sin(a * angle);
@@ -495,6 +492,73 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
     }
 }
 
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD fastMix
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    float cosTheta = dot(x, y);
+
+    if (cosTheta > 1.0f - glm::epsilon<float>())
+    {
+        return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
+    }
+    else
+    {
+        float angle = glm::fastAcos(cosTheta);
+
+
+        __m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));
+
+        __m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
+        __m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
+        __m128 d  = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));
+
+        return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
+    }
+}
+
+GLM_FUNC_QUALIFIER detail::fquatSIMD fastSlerp
+(
+    detail::fquatSIMD const & x,
+    detail::fquatSIMD const & y,
+    float const & a
+)
+{
+    detail::fquatSIMD z = y;
+
+    float cosTheta = dot(x, y);
+    if (cosTheta < 0.0f)
+    {
+        z        = -y;
+        cosTheta = -cosTheta;
+    }
+
+
+    if(cosTheta > 1.0f - epsilon<float>())
+    {
+        return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(z.Data, x.Data)));
+    }
+    else
+    {
+        float angle = glm::fastAcos(cosTheta);
+
+
+        __m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f));
+
+        __m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3));
+        __m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2));
+        __m128 d  = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1)));
+
+        return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, z.Data)), d);
+    }
+}
+
+
+
 GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate
 (
     detail::fquatSIMD const & q
@@ -544,4 +608,22 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
 }
 
 
+GLM_FUNC_QUALIFIER __m128 fastSin(__m128 x)
+{
+    static const __m128 c0 = _mm_set1_ps(0.16666666666666666666666666666667f);
+    static const __m128 c1 = _mm_set1_ps(0.00833333333333333333333333333333f);
+    static const __m128 c2 = _mm_set1_ps(0.00019841269841269841269841269841f);
+
+    __m128 x3 = _mm_mul_ps(x,  _mm_mul_ps(x, x));
+    __m128 x5 = _mm_mul_ps(x3, _mm_mul_ps(x, x));
+    __m128 x7 = _mm_mul_ps(x5, _mm_mul_ps(x, x));
+
+    __m128 y0 = _mm_mul_ps(x3, c0);
+    __m128 y1 = _mm_mul_ps(x5, c1);
+    __m128 y2 = _mm_mul_ps(x7, c2);
+
+    return _mm_sub_ps(_mm_add_ps(_mm_sub_ps(x, y0), y1), y2);
+}
+
+
 }//namespace glm

From 1eb88e4bc139c7855ddde33af08957e1753615d4 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 14:06:28 +1000
Subject: [PATCH 7/9] Cleanup.

---
 glm/gtx/simd_quat.inl | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 3ff493fb..6fce84c2 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -420,22 +420,12 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD mix
     {
         float angle = glm::acos(cosTheta);
 
-
-        // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation
-        // will most likely be faster, but that'll need to be left to people much smarter than I.
         float s0 = glm::sin((1.0f - a) * angle);
         float s1 = glm::sin(a * angle);
         float d  = 1.0f / glm::sin(angle);
 
         return (s0 * x + s1 * y) * d;
-
-
-        //__m128 s0 = _mm_set1_ps(glm::sin((1.0f - a) * angle));
-        //__m128 s1 = _mm_set1_ps(glm::sin(a * angle));
-        //__m128 d  = _mm_set1_ps(1.0f / glm::sin(angle));
-        //
-        //return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
     }
 }
 
@@ -482,8 +472,6 @@ GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
         float angle = glm::acos(cosTheta);
 
-
-        // See mix() above for an explanation of the rationale behind this non-SSE implementation.
-
         float s0 = glm::sin((1.0f - a) * angle);
         float s1 = glm::sin(a * angle);
         float d  = 1.0f / glm::sin(angle);
 
         return (s0 * x + s1 * z) * d;

From c08ea7656b5ed4117e6f651432fd6a07fa2e17e8 Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 14:22:27 +1000
Subject: [PATCH 8/9] Reorganize operator* to make things a bit clearer.

---
 glm/gtx/simd_quat.inl | 50 +++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 6fce84c2..36aae93d 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -120,54 +120,54 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     //    4 mul
     //    4 dpps
 
-    __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
-    __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
-    __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
-    __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
+    __m128 mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
+    __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
+    __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
+    __m128 mul3 = _mm_mul_ps(q1.Data, q2.Data);
 
 #   if((GLM_ARCH & GLM_ARCH_SSE4))
-    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
-    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
-    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
-    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
 #   else
-    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
     __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
     add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
 
-    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
     add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
 
-    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
     __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
     add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
 
-    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
     __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
     add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
 #endif
 
-    // I had tried something clever here using shuffles to produce the final result, but it turns out that using
-    // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case.
-
-    float w;
+    // This SIMD code is a politically correct way of doing this, but in every test I've tried it has been slower than
+    // the final code below. I'll keep this here for reference - maybe somebody else can do something better...
+    //
+    //__m128 xxyy = _mm_shuffle_ps(add0, add1, _MM_SHUFFLE(0, 0, 0, 0));
+    //__m128 zzww = _mm_shuffle_ps(add2, add3, _MM_SHUFFLE(0, 0, 0, 0));
+    //
+    //return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));
+
     float x;
     float y;
     float z;
+    float w;
 
-    _mm_store_ss(&w, add0);
-    _mm_store_ss(&x, add1);
-    _mm_store_ss(&y, add2);
-    _mm_store_ss(&z, add3);
+    _mm_store_ss(&x, add0);
+    _mm_store_ss(&y, add1);
+    _mm_store_ss(&z, add2);
+    _mm_store_ss(&w, add3);
 
     return detail::fquatSIMD(w, x, y, z);
-
-
-    //return _mm_shuffle_ps(_mm_shuffle_ps(add1, add2, 0),
-    //                      _mm_shuffle_ps(add3, add0, 0),
-    //                      _MM_SHUFFLE(2, 0, 2, 0));
 }
 
 GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)

From e0cfd7d672f4b90e733f3d52a9e6dc764320e13c Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 15:00:03 +1000
Subject: [PATCH 9/9] Attempt a small optimization in operator*(fquatSIMD,
 fvec4SIMD).

No big improvement.
---
 glm/gtx/simd_quat.inl | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index 36aae93d..d96f48d4 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -172,11 +172,22 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
 
 GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)
 {
-    __m128 uv  = detail::sse_xpd_ps(q.Data, v.Data);
-    __m128 uuv = detail::sse_xpd_ps(q.Data, uv);
+    static const __m128 two = _mm_set1_ps(2.0f);
+
+    __m128 q_wwww  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
+    __m128 q_swp0  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));
+    __m128 q_swp1  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
+    __m128 v_swp0  = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 0, 2, 1));
+    __m128 v_swp1  = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 1, 0, 2));
+
+    __m128 uv      = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
+    __m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
+    __m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
+    __m128 uuv     = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));
 
-    uv  = _mm_mul_ps(uv,  _mm_mul_ps(_mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)), _mm_set1_ps(2.0f)));
-    uuv = _mm_mul_ps(uuv, _mm_set1_ps(2.0f));
+
+    uv  = _mm_mul_ps(uv,  _mm_mul_ps(q_wwww, two));
+    uuv = _mm_mul_ps(uuv, two);
 
     return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
 }
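
Editorial note: both the old and the new operator*(fquatSIMD, fvec4SIMD) compute
the usual quaternion rotation identity v' = v + 2w(u x v) + 2(u x (u x v)), where
u is the quaternion's xyz part; the rewrite just inlines the two cross products so
the shuffles of q can be shared. A scalar sketch of the same identity (illustrative
only, not part of the patches):

    // v' = v + 2w * (u x v) + 2 * (u x (u x v)), with u = (q.x, q.y, q.z)
    glm::vec3 rotate(glm::quat const & q, glm::vec3 const & v)
    {
        glm::vec3 u(q.x, q.y, q.z);
        glm::vec3 uv  = glm::cross(u, v);
        glm::vec3 uuv = glm::cross(u, uv);
        return v + 2.0f * (q.w * uv + uuv);
    }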