You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and dots ('.'), can be up to 35 characters long. Letters must be lowercase.
518 lines
13 KiB
518 lines
13 KiB
/////////////////////////////////////////////////////////////////////////////////////////////////// |
|
// OpenGL Mathematics Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net) |
|
/////////////////////////////////////////////////////////////////////////////////////////////////// |
|
// Created : 2013-04-22 |
|
// Updated : 2013-04-22 |
|
// Licence : This source is under MIT License |
|
// File : glm/gtx/simd_quat.inl |
|
/////////////////////////////////////////////////////////////////////////////////////////////////// |
|
|
|
|
|
namespace glm{ |
|
namespace detail{ |
|
|
|
|
|
////////////////////////////////////// |
|
// Debugging |
|
#if 0 |
|
void print(__m128 v) |
|
{ |
|
GLM_ALIGN(16) float result[4]; |
|
_mm_store_ps(result, v); |
|
|
|
printf("__m128: %f %f %f %f\n", result[0], result[1], result[2], result[3]); |
|
} |
|
|
|
void print(const fvec4SIMD &v) |
|
{ |
|
printf("fvec4SIMD: %f %f %f %f\n", v.x, v.y, v.z, v.w); |
|
} |
|
#endif |
|
|
|
|
|
////////////////////////////////////// |
|
// Implicit basic constructors |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD() |
|
#ifdef GLM_SIMD_ENABLE_DEFAULT_INIT |
|
: Data(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f)) |
|
#endif |
|
{} |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(__m128 const & Data) : |
|
Data(Data) |
|
{} |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(fquatSIMD const & q) : |
|
Data(q.Data) |
|
{} |
|
|
|
|
|
////////////////////////////////////// |
|
// Explicit basic constructors |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(float const & w, float const & x, float const & y, float const & z) : |
|
Data(_mm_set_ps(w, z, y, x)) |
|
{} |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(quat const & q) : |
|
Data(_mm_set_ps(q.w, q.z, q.y, q.x)) |
|
{} |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(vec3 const & eulerAngles) |
|
{ |
|
vec3 c = glm::cos(eulerAngles * 0.5f); |
|
vec3 s = glm::sin(eulerAngles * 0.5f); |
|
|
|
Data = _mm_set_ps( |
|
(c.x * c.y * c.z) + (s.x * s.y * s.z), |
|
(c.x * c.y * s.z) - (s.x * s.y * c.z), |
|
(c.x * s.y * c.z) + (s.x * c.y * s.z), |
|
(s.x * c.y * c.z) - (c.x * s.y * s.z)); |
|
} |
|
|
|
|
|
////////////////////////////////////// |
|
// Unary arithmetic operators |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator=(fquatSIMD const & q) |
|
{ |
|
this->Data = q.Data; |
|
return *this; |
|
} |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator*=(float const & s) |
|
{ |
|
this->Data = _mm_mul_ps(this->Data, _mm_set_ps1(s)); |
|
return *this; |
|
} |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator/=(float const & s) |
|
{ |
|
this->Data = _mm_div_ps(Data, _mm_set1_ps(s)); |
|
return *this; |
|
} |
|
|
|
|
|
|
|
// negate operator |
|
GLM_FUNC_QUALIFIER fquatSIMD operator- (fquatSIMD const & q) |
|
{ |
|
return fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(-1.0f, -1.0f, -1.0f, -1.0f))); |
|
} |
|
|
|
// operator+ |
|
GLM_FUNC_QUALIFIER fquatSIMD operator+ (fquatSIMD const & q1, fquatSIMD const & q2) |
|
{ |
|
return fquatSIMD(_mm_add_ps(q1.Data, q2.Data)); |
|
} |
|
|
|
//operator* |
|
GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & q2) |
|
{ |
|
// SSE2 STATS: |
|
// 11 shuffle |
|
// 8 mul |
|
// 8 add |
|
|
|
// SSE4 STATS: |
|
// 3 shuffle |
|
// 4 mul |
|
// 4 dpps |
|
|
|
__m128 mul0 = _mm_mul_ps(q1.Data, q2.Data); |
|
__m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3))); |
|
__m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2))); |
|
__m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1))); |
|
|
|
# if((GLM_ARCH & GLM_ARCH_SSE4)) |
|
__m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff); |
|
__m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff); |
|
__m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff); |
|
__m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff); |
|
# else |
|
mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)); |
|
__m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0)); |
|
add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1)); |
|
|
|
mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f)); |
|
__m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1)); |
|
add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1)); |
|
|
|
mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f)); |
|
__m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2)); |
|
add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1)); |
|
|
|
mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f)); |
|
__m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3)); |
|
add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1)); |
|
#endif |
|
|
|
|
|
// I had tried something clever here using shuffles to produce the final result, but it turns out that using |
|
// _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case. |
|
|
|
float w; |
|
float x; |
|
float y; |
|
float z; |
|
|
|
_mm_store_ss(&w, add0); |
|
_mm_store_ss(&x, add1); |
|
_mm_store_ss(&y, add2); |
|
_mm_store_ss(&z, add3); |
|
|
|
return detail::fquatSIMD(w, x, y, z); |
|
|
|
|
|
//return _mm_shuffle_ps(_mm_shuffle_ps(add1, add2, 0), |
|
// _mm_shuffle_ps(add3, add0, 0), |
|
// _MM_SHUFFLE(2, 0, 2, 0)); |
|
} |
|
|
|
GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v) |
|
{ |
|
__m128 uv = detail::sse_xpd_ps(q.Data, v.Data); |
|
__m128 uuv = detail::sse_xpd_ps(q.Data, uv); |
|
|
|
uv = _mm_mul_ps(uv, _mm_mul_ps(_mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)), _mm_set1_ps(2.0f))); |
|
uuv = _mm_mul_ps(uuv, _mm_set1_ps(2.0f)); |
|
|
|
return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv)); |
|
} |
|
|
|
GLM_FUNC_QUALIFIER fvec4SIMD operator* (fvec4SIMD const & v, fquatSIMD const & q) |
|
{ |
|
return inverse(q) * v; |
|
} |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q, float s) |
|
{ |
|
return fquatSIMD(_mm_mul_ps(q.Data, _mm_set1_ps(s))); |
|
} |
|
|
|
GLM_FUNC_QUALIFIER fquatSIMD operator* (float s, fquatSIMD const & q) |
|
{ |
|
return fquatSIMD(_mm_mul_ps(_mm_set1_ps(s), q.Data)); |
|
} |
|
|
|
|
|
//operator/ |
|
GLM_FUNC_QUALIFIER fquatSIMD operator/ (fquatSIMD const & q, float s) |
|
{ |
|
return fquatSIMD(_mm_div_ps(q.Data, _mm_set1_ps(s))); |
|
} |
|
|
|
|
|
}//namespace detail |
|
|
|
|
|
GLM_FUNC_QUALIFIER quat quat_cast |
|
( |
|
detail::fquatSIMD const & x |
|
) |
|
{ |
|
GLM_ALIGN(16) quat Result; |
|
_mm_store_ps(&Result[0], x.Data); |
|
return Result; |
|
} |
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast |
|
( |
|
detail::fmat4x4SIMD const & m |
|
) |
|
{ |
|
// Scalar implementation for now. |
|
|
|
GLM_ALIGN(16) float m0[4]; |
|
GLM_ALIGN(16) float m1[4]; |
|
GLM_ALIGN(16) float m2[4]; |
|
GLM_ALIGN(16) float m3[4]; |
|
|
|
_mm_store_ps(m0, m[0].Data); |
|
_mm_store_ps(m1, m[1].Data); |
|
_mm_store_ps(m2, m[2].Data); |
|
_mm_store_ps(m3, m[3].Data); |
|
|
|
|
|
float trace = m0[0] + m1[1] + m2[2] + 1.0f; |
|
if (trace > 0) |
|
{ |
|
float s = 0.5f / sqrt(trace); |
|
|
|
return _mm_set_ps( |
|
0.25f / s, |
|
(m0[1] - m1[0]) * s, |
|
(m2[0] - m0[2]) * s, |
|
(m1[2] - m2[1]) * s); |
|
} |
|
else |
|
{ |
|
if (m0[0] > m1[1]) |
|
{ |
|
if (m0[0] > m2[2]) |
|
{ |
|
// X is biggest. |
|
float s = sqrt(m0[0] - m1[1] - m2[2] + 1.0f) * 0.5f; |
|
|
|
return _mm_set_ps( |
|
(m1[2] - m2[1]) * s, |
|
(m2[0] + m0[2]) * s, |
|
(m0[1] + m1[0]) * s, |
|
0.5f * s); |
|
} |
|
} |
|
else |
|
{ |
|
if (m1[1] > m2[2]) |
|
{ |
|
// Y is biggest. |
|
float s = sqrt(m1[1] - m0[0] - m2[2] + 1.0f) * 0.5f; |
|
|
|
return _mm_set_ps( |
|
(m2[0] - m0[2]) * s, |
|
(m1[2] + m2[1]) * s, |
|
0.5f * s, |
|
(m0[1] + m1[0]) * s); |
|
} |
|
} |
|
|
|
// Z is biggest. |
|
float s = sqrt(m2[2] - m0[0] - m1[1] + 1.0f) * 0.5f; |
|
|
|
return _mm_set_ps( |
|
(m0[1] - m1[0]) * s, |
|
0.5f * s, |
|
(m1[2] + m2[1]) * s, |
|
(m2[0] + m0[2]) * s); |
|
} |
|
} |
|
|
|
|
|
GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast |
|
( |
|
detail::fquatSIMD const & q |
|
) |
|
{ |
|
detail::fmat4x4SIMD result; |
|
|
|
__m128 _wwww = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)); |
|
__m128 _xyzw = q.Data; |
|
__m128 _zxyw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2)); |
|
__m128 _yzxw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1)); |
|
|
|
__m128 _xyzw2 = _mm_add_ps(_xyzw, _xyzw); |
|
__m128 _zxyw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 1, 0, 2)); |
|
__m128 _yzxw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 0, 2, 1)); |
|
|
|
__m128 _tmp0 = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_yzxw2, _yzxw)); |
|
_tmp0 = _mm_sub_ps(_tmp0, _mm_mul_ps(_zxyw2, _zxyw)); |
|
|
|
__m128 _tmp1 = _mm_mul_ps(_yzxw2, _xyzw); |
|
_tmp1 = _mm_add_ps(_tmp1, _mm_mul_ps(_zxyw2, _wwww)); |
|
|
|
__m128 _tmp2 = _mm_mul_ps(_zxyw2, _xyzw); |
|
_tmp2 = _mm_sub_ps(_tmp2, _mm_mul_ps(_yzxw2, _wwww)); |
|
|
|
|
|
// There's probably a better, more politically correct way of doing this... |
|
result[0].Data = _mm_set_ps( |
|
0.0f, |
|
reinterpret_cast<float*>(&_tmp2)[0], |
|
reinterpret_cast<float*>(&_tmp1)[0], |
|
reinterpret_cast<float*>(&_tmp0)[0]); |
|
|
|
result[1].Data = _mm_set_ps( |
|
0.0f, |
|
reinterpret_cast<float*>(&_tmp1)[1], |
|
reinterpret_cast<float*>(&_tmp0)[1], |
|
reinterpret_cast<float*>(&_tmp2)[1]); |
|
|
|
result[2].Data = _mm_set_ps( |
|
0.0f, |
|
reinterpret_cast<float*>(&_tmp0)[2], |
|
reinterpret_cast<float*>(&_tmp2)[2], |
|
reinterpret_cast<float*>(&_tmp1)[2]); |
|
|
|
result[3].Data = _mm_set_ps( |
|
1.0f, |
|
0.0f, |
|
0.0f, |
|
0.0f); |
|
|
|
|
|
return result; |
|
} |
|
|
|
|
|
|
|
GLM_FUNC_QUALIFIER float length |
|
( |
|
detail::fquatSIMD const & q |
|
) |
|
{ |
|
return glm::sqrt(dot(q, q)); |
|
} |
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD normalize |
|
( |
|
detail::fquatSIMD const & q |
|
) |
|
{ |
|
return _mm_mul_ps(q.Data, _mm_set1_ps(1.0f / length(q))); |
|
} |
|
|
|
GLM_FUNC_QUALIFIER float dot |
|
( |
|
detail::fquatSIMD const & q1, |
|
detail::fquatSIMD const & q2 |
|
) |
|
{ |
|
float result; |
|
_mm_store_ss(&result, detail::sse_dot_ps(q1.Data, q2.Data)); |
|
|
|
return result; |
|
} |
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD mix |
|
( |
|
detail::fquatSIMD const & x, |
|
detail::fquatSIMD const & y, |
|
float const & a |
|
) |
|
{ |
|
float cosTheta = dot(x, y); |
|
|
|
if (cosTheta > 1.0f - glm::epsilon<float>()) |
|
{ |
|
return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); |
|
} |
|
else |
|
{ |
|
float angle = glm::acos(cosTheta); |
|
|
|
|
|
// Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation |
|
// will most likely be faster, but that'll need to be left to people much smarter than I. |
|
// |
|
// The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of |
|
// glm::sin(). Maybe a fastMix() function would be better for that? |
|
|
|
float s0 = glm::sin((1.0f - a) * angle); |
|
float s1 = glm::sin(a * angle); |
|
float d = 1.0f / glm::sin(angle); |
|
|
|
return (s0 * x + s1 * y) * d; |
|
|
|
|
|
//__m128 s0 = _mm_set1_ps(glm::sin((1.0f - a) * angle)); |
|
//__m128 s1 = _mm_set1_ps(glm::sin(a * angle)); |
|
//__m128 d = _mm_set1_ps(1.0f / glm::sin(angle)); |
|
// |
|
//return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d); |
|
} |
|
} |
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD lerp |
|
( |
|
detail::fquatSIMD const & x, |
|
detail::fquatSIMD const & y, |
|
float const & a |
|
) |
|
{ |
|
// Lerp is only defined in [0, 1] |
|
assert(a >= 0.0f); |
|
assert(a <= 1.0f); |
|
|
|
return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); |
|
} |
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD slerp |
|
( |
|
detail::fquatSIMD const & x, |
|
detail::fquatSIMD const & y, |
|
float const & a |
|
) |
|
{ |
|
detail::fquatSIMD z = y; |
|
|
|
float cosTheta = dot(x, y); |
|
|
|
// If cosTheta < 0, the interpolation will take the long way around the sphere. |
|
// To fix this, one quat must be negated. |
|
if (cosTheta < 0.0f) |
|
{ |
|
z = -y; |
|
cosTheta = -cosTheta; |
|
} |
|
|
|
// Perform a linear interpolation when cosTheta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator |
|
if(cosTheta > 1.0f - epsilon<float>()) |
|
{ |
|
return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); |
|
} |
|
else |
|
{ |
|
float angle = glm::acos(cosTheta); |
|
|
|
|
|
// See mix() above for an explanation to the rationale behind this non-SSE implementation. |
|
|
|
float s0 = glm::sin((1.0f - a) * angle); |
|
float s1 = glm::sin(a * angle); |
|
float d = 1.0f / glm::sin(angle); |
|
|
|
return (s0 * x + s1 * y) * d; |
|
} |
|
} |
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate |
|
( |
|
detail::fquatSIMD const & q |
|
) |
|
{ |
|
return detail::fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f))); |
|
} |
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD inverse |
|
( |
|
detail::fquatSIMD const & q |
|
) |
|
{ |
|
return conjugate(q) / dot(q, q); |
|
} |
|
|
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD |
|
( |
|
float const & angle, |
|
vec3 const & v |
|
) |
|
{ |
|
#ifdef GLM_FORCE_RADIANS |
|
float a(angle); |
|
#else |
|
float a(glm::radians(angle)); |
|
#endif |
|
float s = glm::sin(a * 0.5f); |
|
|
|
return _mm_set_ps( |
|
glm::cos(a * 0.5f), |
|
v.z * s, |
|
v.y * s, |
|
v.x * s); |
|
} |
|
|
|
GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD |
|
( |
|
float const & angle, |
|
float const & x, |
|
float const & y, |
|
float const & z |
|
) |
|
{ |
|
return angleAxisSIMD(angle, vec3(x, y, z)); |
|
} |
|
|
|
|
|
}//namespace glm
|
|
|