glm/glm/gtx/simd_quat.inl

///////////////////////////////////////////////////////////////////////////////////////////////////
// OpenGL Mathematics Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Created : 2013-04-22
// Updated : 2013-04-22
// Licence : This source is under MIT License
// File    : glm/gtx/simd_quat.inl
///////////////////////////////////////////////////////////////////////////////////////////////////


namespace glm{
namespace detail{


//////////////////////////////////////
// Debugging
#if 0
void print(__m128 v)
{
    GLM_ALIGN(16) float result[4];
    _mm_store_ps(result, v);

    printf("__m128:    %f %f %f %f\n", result[0], result[1], result[2], result[3]);
}

void print(const fvec4SIMD &v)
{
    printf("fvec4SIMD: %f %f %f %f\n", v.x, v.y, v.z, v.w);
}
#endif


//////////////////////////////////////
// Implicit basic constructors

GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD()
#ifdef GLM_SIMD_ENABLE_DEFAULT_INIT
    : Data(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f))
#endif
{}

GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(__m128 const & Data) :
	Data(Data)
{}

GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(fquatSIMD const & q) :
	Data(q.Data)
{}


//////////////////////////////////////
// Explicit basic constructors

GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(float const & w, float const & x, float const & y, float const & z) :
	Data(_mm_set_ps(w, z, y, x))
{}

GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(quat const & q) :
	Data(_mm_set_ps(q.w, q.z, q.y, q.x))
{}

GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(vec3 const & eulerAngles)
{
    vec3 c = glm::cos(eulerAngles * 0.5f);
	vec3 s = glm::sin(eulerAngles * 0.5f);

    Data = _mm_set_ps(
        (c.x * c.y * c.z) + (s.x * s.y * s.z),
        (c.x * c.y * s.z) - (s.x * s.y * c.z),
        (c.x * s.y * c.z) + (s.x * c.y * s.z),
        (s.x * c.y * c.z) - (c.x * s.y * s.z));
}


//////////////////////////////////////
// Unary arithmetic operators

GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator=(fquatSIMD const & q)
{
    this->Data = q.Data;
    return *this;
}

GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator*=(float const & s)
{
	this->Data = _mm_mul_ps(this->Data, _mm_set_ps1(s));
	return *this;
}

GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator/=(float const & s)
{
	this->Data = _mm_div_ps(Data, _mm_set1_ps(s));
	return *this;
}


// negate operator
GLM_FUNC_QUALIFIER fquatSIMD operator- (fquatSIMD const & q)
{
    return fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(-1.0f, -1.0f, -1.0f, -1.0f)));
}

// operator+
GLM_FUNC_QUALIFIER fquatSIMD operator+ (fquatSIMD const & q1, fquatSIMD const & q2)
{
	return fquatSIMD(_mm_add_ps(q1.Data, q2.Data));
}

//operator*
GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & q2)
{
    // SSE2 STATS:
    //    11 shuffle
    //    8  mul
    //    8  add

    // SSE4 STATS:
    //    3 shuffle
    //    4 mul
    //    4 dpps

    __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
    __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
    __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
    __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));

#   if((GLM_ARCH & GLM_ARCH_SSE4))
    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f), 0xff);
    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f), 0xff);
    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f), 0xff);
#   else
           mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));

           mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f));
    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));

           mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f));
    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));

           mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f));
    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
#endif


    // I had tried something clever here using shuffles to produce the final result, but it turns out that using
    // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case.

    float w;
    float x;
    float y;
    float z;

    _mm_store_ss(&w, add0);
    _mm_store_ss(&x, add1);
    _mm_store_ss(&y, add2);
    _mm_store_ss(&z, add3);

    return detail::fquatSIMD(w, x, y, z);


    //return _mm_shuffle_ps(_mm_shuffle_ps(add1, add2, 0),
    //                      _mm_shuffle_ps(add3, add0, 0),
    //                      _MM_SHUFFLE(2, 0, 2, 0));
}

GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v)
{
    __m128 uv  = detail::sse_xpd_ps(q.Data, v.Data);
    __m128 uuv = detail::sse_xpd_ps(q.Data, uv);

    uv  = _mm_mul_ps(uv,  _mm_mul_ps(_mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)), _mm_set1_ps(2.0f)));
    uuv = _mm_mul_ps(uuv, _mm_set1_ps(2.0f));

    return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
}

GLM_FUNC_QUALIFIER fvec4SIMD operator* (fvec4SIMD const & v, fquatSIMD const & q)
{
	return inverse(q) * v;
}

GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q, float s)
{
	return fquatSIMD(_mm_mul_ps(q.Data, _mm_set1_ps(s)));
}

GLM_FUNC_QUALIFIER fquatSIMD operator* (float s, fquatSIMD const & q)
{
	return fquatSIMD(_mm_mul_ps(_mm_set1_ps(s), q.Data));
}


//operator/
GLM_FUNC_QUALIFIER fquatSIMD operator/ (fquatSIMD const & q, float s)
{
	return fquatSIMD(_mm_div_ps(q.Data, _mm_set1_ps(s)));
}


}//namespace detail


GLM_FUNC_QUALIFIER quat quat_cast
(
	detail::fquatSIMD const & x
)
{
	GLM_ALIGN(16) quat Result;
	_mm_store_ps(&Result[0], x.Data);
	return Result;
}

GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast
(
	detail::fmat4x4SIMD const & m
)
{
    // Scalar implementation for now.

    GLM_ALIGN(16) float m0[4];
    GLM_ALIGN(16) float m1[4];
    GLM_ALIGN(16) float m2[4];
    GLM_ALIGN(16) float m3[4];

    _mm_store_ps(m0, m[0].Data);
    _mm_store_ps(m1, m[1].Data);
    _mm_store_ps(m2, m[2].Data);
    _mm_store_ps(m3, m[3].Data);


    float trace = m0[0] + m1[1] + m2[2] + 1.0f;
    if (trace > 0)
    {
        float s = 0.5f / sqrt(trace);

        return _mm_set_ps(
            0.25f / s,
            (m0[1] - m1[0]) * s,
            (m2[0] - m0[2]) * s,
            (m1[2] - m2[1]) * s);
    }
    else
    {
        if (m0[0] > m1[1])
        {
            if (m0[0] > m2[2])
            {
                // X is biggest.
                float s = sqrt(m0[0] - m1[1] - m2[2] + 1.0f) * 0.5f;

                return _mm_set_ps(
                    (m1[2] - m2[1]) * s,
                    (m2[0] + m0[2]) * s,
                    (m0[1] + m1[0]) * s,
                    0.5f            * s);
            }
        }
        else
        {
            if (m1[1] > m2[2])
            {
                // Y is biggest.
                float s = sqrt(m1[1] - m0[0] - m2[2] + 1.0f) * 0.5f;

                return _mm_set_ps(
                    (m2[0] - m0[2]) * s,
                    (m1[2] + m2[1]) * s,
                    0.5f            * s,
                    (m0[1] + m1[0]) * s);
            }
        }

        // Z is biggest.
        float s = sqrt(m2[2] - m0[0] - m1[1] + 1.0f) * 0.5f;

        return _mm_set_ps(
            (m0[1] - m1[0]) * s,
            0.5f            * s,
            (m1[2] + m2[1]) * s,
            (m2[0] + m0[2]) * s);
    }
}


GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast
(
	detail::fquatSIMD const & q
)
{
    detail::fmat4x4SIMD result;

    __m128 _wwww  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 _xyzw  = q.Data;
    __m128 _zxyw  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2));
    __m128 _yzxw  = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1));

    __m128 _xyzw2 = _mm_add_ps(_xyzw, _xyzw);
    __m128 _zxyw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 1, 0, 2));
    __m128 _yzxw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 0, 2, 1));

    __m128 _tmp0  = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_yzxw2, _yzxw));
           _tmp0  = _mm_sub_ps(_tmp0, _mm_mul_ps(_zxyw2, _zxyw));

    __m128 _tmp1  = _mm_mul_ps(_yzxw2, _xyzw);
           _tmp1  = _mm_add_ps(_tmp1, _mm_mul_ps(_zxyw2, _wwww));

    __m128 _tmp2  = _mm_mul_ps(_zxyw2, _xyzw);
           _tmp2  = _mm_sub_ps(_tmp2, _mm_mul_ps(_yzxw2, _wwww));


    // There's probably a better, more politically correct way of doing this...
    result[0].Data = _mm_set_ps(
        0.0f,
        reinterpret_cast<float*>(&_tmp2)[0],
        reinterpret_cast<float*>(&_tmp1)[0],
        reinterpret_cast<float*>(&_tmp0)[0]);

    result[1].Data = _mm_set_ps(
        0.0f,
        reinterpret_cast<float*>(&_tmp1)[1],
        reinterpret_cast<float*>(&_tmp0)[1],
        reinterpret_cast<float*>(&_tmp2)[1]);

    result[2].Data = _mm_set_ps(
        0.0f,
        reinterpret_cast<float*>(&_tmp0)[2],
        reinterpret_cast<float*>(&_tmp2)[2],
        reinterpret_cast<float*>(&_tmp1)[2]);

   result[3].Data = _mm_set_ps(
        1.0f,
        0.0f,
        0.0f,
        0.0f);


    return result;
}


GLM_FUNC_QUALIFIER float length
(
	detail::fquatSIMD const & q
)
{
    return glm::sqrt(dot(q, q));
}

GLM_FUNC_QUALIFIER detail::fquatSIMD normalize
(
	detail::fquatSIMD const & q
)
{
    return _mm_mul_ps(q.Data, _mm_set1_ps(1.0f / length(q)));
}

GLM_FUNC_QUALIFIER float dot
(
	detail::fquatSIMD const & q1,
	detail::fquatSIMD const & q2
)
{
    float result;
    _mm_store_ss(&result, detail::sse_dot_ps(q1.Data, q2.Data));

    return result;
}

GLM_FUNC_QUALIFIER detail::fquatSIMD mix
(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
)
{
	float cosTheta = dot(x, y);

    if (cosTheta > 1.0f - glm::epsilon<float>())
    {
	    return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
    }
    else
    {
        float angle = glm::acos(cosTheta);


        // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation
        // will most likely be faster, but that'll need to be left to people much smarter than I.
        //
        // The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of
        // glm::sin(). Maybe a fastMix() function would be better for that?

        float s0 = glm::sin((1.0f - a) * angle);
        float s1 = glm::sin(a * angle);
        float d  = 1.0f / glm::sin(angle);

        return (s0 * x + s1 * y) * d;


        //__m128 s0 = _mm_set1_ps(glm::sin((1.0f - a) * angle));
        //__m128 s1 = _mm_set1_ps(glm::sin(a * angle));
        //__m128 d  = _mm_set1_ps(1.0f / glm::sin(angle));
        //
        //return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d);
    }
}

GLM_FUNC_QUALIFIER detail::fquatSIMD lerp
(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
)
{
	// Lerp is only defined in [0, 1]
	assert(a >= 0.0f);
	assert(a <= 1.0f);

    return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
}

GLM_FUNC_QUALIFIER detail::fquatSIMD slerp
(
	detail::fquatSIMD const & x,
	detail::fquatSIMD const & y,
	float const & a
)
{
	detail::fquatSIMD z = y;

	float cosTheta = dot(x, y);

	// If cosTheta < 0, the interpolation will take the long way around the sphere.
	// To fix this, one quat must be negated.
	if (cosTheta < 0.0f)
	{
		z        = -y;
		cosTheta = -cosTheta;
	}

	// Perform a linear interpolation when cosTheta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator
	if(cosTheta > 1.0f - epsilon<float>())
	{
		return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data)));
	}
	else
	{
        float angle = glm::acos(cosTheta);


        // See mix() above for an explanation to the rationale behind this non-SSE implementation.

		float s0 = glm::sin((1.0f - a) * angle);
        float s1 = glm::sin(a * angle);
        float d  = 1.0f / glm::sin(angle);

        return (s0 * x + s1 * y) * d;
	}
}

GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate
(
	detail::fquatSIMD const & q
)
{
	return detail::fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)));
}

GLM_FUNC_QUALIFIER detail::fquatSIMD inverse
(
	detail::fquatSIMD const & q
)
{
	return conjugate(q) / dot(q, q);
}


GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
(
	float const & angle,
	vec3 const & v
)
{
#ifdef GLM_FORCE_RADIANS
	float a(angle);
#else
	float a(glm::radians(angle));
#endif
	float s = glm::sin(a * 0.5f);

    return _mm_set_ps(
        glm::cos(a * 0.5f),
        v.z * s,
        v.y * s,
        v.x * s);
}

GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD
(
	float const & angle,
	float const & x,
	float const & y,
	float const & z
)
{
	return angleAxisSIMD(angle, vec3(x, y, z));
}


}//namespace glm