From 2386237528a8aecde635868804ac410acf22a892 Mon Sep 17 00:00:00 2001
From: Christophe Riccio <christopheri@unity3d.com>
Date: Sun, 29 May 2016 17:58:53 +0200
Subject: [PATCH] common function SIMD optimization

---
 glm/detail/func_common.inl      |  76 ++++++++++++------
 glm/detail/func_common_simd.inl | 136 +++++++++++++++++++++++++++-----
 2 files changed, 169 insertions(+), 43 deletions(-)
diff --git a/glm/detail/func_common.inl b/glm/detail/func_common.inl
index 283949ec..02468cc1 100644
--- a/glm/detail/func_common.inl
+++ b/glm/detail/func_common.inl
@@ -10,6 +10,23 @@
 
 namespace glm
 {
+	// min: scalar minimum; yields x when x < y, otherwise y
+	template <typename genType>
+	GLM_FUNC_QUALIFIER genType min(genType x, genType y)
+	{
+		GLM_STATIC_ASSERT(std::numeric_limits<genType>::is_iec559 || std::numeric_limits<genType>::is_integer, "'min' only accept floating-point or integer inputs");
+		return x < y ? x : y; // y is returned whenever the comparison is false (equal values, or a NaN operand)
+	}
+
+	// max: scalar maximum; yields x when x > y, otherwise y
+	template <typename genType>
+	GLM_FUNC_QUALIFIER genType max(genType x, genType y)
+	{
+		GLM_STATIC_ASSERT(std::numeric_limits<genType>::is_iec559 || std::numeric_limits<genType>::is_integer, "'max' only accept floating-point or integer inputs");
+
+		return x > y ? x : y; // y is returned whenever the comparison is false (equal values, or a NaN operand)
+	}
+
 	// abs
 	template <>
 	GLM_FUNC_QUALIFIER int32 abs(int32 x)
@@ -239,6 +256,33 @@ namespace detail
 			return a - b * floor(a / b);
 		}
 	};
+
+	template <typename T, precision P, template <typename, precision> class vecType>
+	struct compute_min_vector // generic fallback for the vector min; specializations of this struct can override it for a given T/vecType
+	{
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, vecType<T, P> const & y)
+		{
+			return detail::functor2<T, P, vecType>::call(min, x, y); // passes min to functor2 together with both vectors
+		}
+	};
+
+	template <typename T, precision P, template <typename, precision> class vecType>
+	struct compute_max_vector // generic fallback for the vector max; specializations of this struct can override it for a given T/vecType
+	{
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, vecType<T, P> const & y)
+		{
+			return detail::functor2<T, P, vecType>::call(max, x, y); // passes max to functor2 together with both vectors
+		}
+	};
+
+	template <typename T, precision P, template <typename, precision> class vecType>
+	struct compute_clamp_vector // generic fallback for the vector clamp; specializations of this struct can override it for a given T/vecType
+	{
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, vecType<T, P> const & minVal, vecType<T, P> const & maxVal)
+		{
+			return min(max(x, minVal), maxVal); // max against minVal first, then min against maxVal
+		}
+	};
 }//namespace detail
 
 	template <typename genFIType>
@@ -441,45 +485,30 @@ namespace detail
 	//CHAR_BIT - 1)));
 
 	// min
-	template <typename genType>
-	GLM_FUNC_QUALIFIER genType min(genType x, genType y)
-	{
-		GLM_STATIC_ASSERT(std::numeric_limits<genType>::is_iec559 || std::numeric_limits<genType>::is_integer, "'min' only accept floating-point or integer inputs");
-
-		return x < y ? x : y;
-	}
-
 	template <typename T, precision P, template <typename, precision> class vecType>
 	GLM_FUNC_QUALIFIER vecType<T, P> min(vecType<T, P> const & a, T b)
 	{
-		return detail::functor2_vec_sca<T, P, vecType>::call(min, a, b);
+		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_iec559 || std::numeric_limits<T>::is_integer, "'min' only accept floating-point or integer inputs");
+		return detail::compute_min_vector<T, P, vecType>::call(a, vecType<T, P>(b)); // scalar b is expanded to a vector so the vector/scalar overload shares compute_min_vector with the vector/vector one
 	}
 
 	template <typename T, precision P, template <typename, precision> class vecType>
 	GLM_FUNC_QUALIFIER vecType<T, P> min(vecType<T, P> const & a, vecType<T, P> const & b)
 	{
-		return detail::functor2<T, P, vecType>::call(min, a, b);
+		return detail::compute_min_vector<T, P, vecType>::call(a, b); // forwarded to detail::compute_min_vector so a specialization for T/vecType can be selected
 	}
 
 	// max
-	template <typename genType>
-	GLM_FUNC_QUALIFIER genType max(genType x, genType y)
-	{
-		GLM_STATIC_ASSERT(std::numeric_limits<genType>::is_iec559 || std::numeric_limits<genType>::is_integer, "'max' only accept floating-point or integer inputs");
-
-		return x > y ? x : y;
-	}
-
 	template <typename T, precision P, template <typename, precision> class vecType>
 	GLM_FUNC_QUALIFIER vecType<T, P> max(vecType<T, P> const & a, T b)
 	{
-		return detail::functor2_vec_sca<T, P, vecType>::call(max, a, b);
+		return detail::compute_max_vector<T, P, vecType>::call(a, vecType<T, P>(b)); // scalar b is expanded to a vector so the vector/scalar overload shares compute_max_vector with the vector/vector one
 	}
 
 	template <typename T, precision P, template <typename, precision> class vecType>
 	GLM_FUNC_QUALIFIER vecType<T, P> max(vecType<T, P> const & a, vecType<T, P> const & b)
 	{
-		return detail::functor2<T, P, vecType>::call(max, a, b);
+		return detail::compute_max_vector<T, P, vecType>::call(a, b); // forwarded to detail::compute_max_vector so a specialization for T/vecType can be selected
 	}
 
 	// clamp
@@ -487,7 +516,6 @@ namespace detail
 	GLM_FUNC_QUALIFIER genType clamp(genType x, genType minVal, genType maxVal)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<genType>::is_iec559 || std::numeric_limits<genType>::is_integer, "'clamp' only accept floating-point or integer inputs");
-		
 		return min(max(x, minVal), maxVal);
 	}
 
@@ -495,16 +523,14 @@ namespace detail
 	GLM_FUNC_QUALIFIER vecType<T, P> clamp(vecType<T, P> const & x, T minVal, T maxVal)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_iec559 || std::numeric_limits<T>::is_integer, "'clamp' only accept floating-point or integer inputs");
-
-		return min(max(x, minVal), maxVal);
+		return detail::compute_clamp_vector<T, P, vecType>::call(x, vecType<T, P>(minVal), vecType<T, P>(maxVal));
 	}
 
 	template <typename T, precision P, template <typename, precision> class vecType>
 	GLM_FUNC_QUALIFIER vecType<T, P> clamp(vecType<T, P> const & x, vecType<T, P> const & minVal, vecType<T, P> const & maxVal)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_iec559 || std::numeric_limits<T>::is_integer, "'clamp' only accept floating-point or integer inputs");
-
-		return min(max(x, minVal), maxVal);
+		return detail::compute_clamp_vector<T, P, vecType>::call(x, minVal, maxVal); // forwarded to detail::compute_clamp_vector so a specialization for T/vecType can be selected
 	}
 
 	template <typename genTypeT, typename genTypeU>
diff --git a/glm/detail/func_common_simd.inl b/glm/detail/func_common_simd.inl
index 0bcd6c4a..b2cb7dfc 100644
--- a/glm/detail/func_common_simd.inl
+++ b/glm/detail/func_common_simd.inl
@@ -32,24 +32,6 @@ namespace detail
 		}
 	};
 
-	template <precision P>
-	struct compute_mix_vector<float, bool, P, tvec4>
-	{
-		GLM_FUNC_QUALIFIER static tvec4<float, P> call(tvec4<float, P> const & x, tvec4<float, P> const & y, tvec4<bool, P> const & a)
-		{
-			__m128i const Load = _mm_set_epi32(-(int)a.w, -(int)a.z, -(int)a.y, -(int)a.x);
-			__m128 const Mask = _mm_castsi128_ps(Load);
-
-			tvec4<float, P> Result(uninitialize);
-#			if 0 && GLM_ARCH & GLM_ARCH_AVX
-				Result.data = _mm_blendv_ps(x.data, y.data, Mask);
-#			else
-				Result.data = _mm_or_ps(_mm_and_ps(Mask, y.data), _mm_andnot_ps(Mask, x.data));
-#			endif
-			return Result;
-		}
-	};
-
 	template <precision P>
 	struct compute_floor<float, P, tvec4>
 	{
@@ -105,6 +87,124 @@ namespace detail
 		}
 	};
 
+	template <precision P>
+	struct compute_min_vector<float, P, tvec4> // specialization for 4-component float vectors using an SSE intrinsic
+	{
+		GLM_FUNC_QUALIFIER static tvec4<float, P> call(tvec4<float, P> const & v1, tvec4<float, P> const & v2)
+		{
+			tvec4<float, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_min_ps(v1.data, v2.data); // packed single-precision minimum across the four lanes
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_min_vector<int32, P, tvec4> // specialization for 4-component signed 32-bit integer vectors
+	{
+		GLM_FUNC_QUALIFIER static tvec4<int32, P> call(tvec4<int32, P> const & v1, tvec4<int32, P> const & v2)
+		{
+			tvec4<int32, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_min_epi32(v1.data, v2.data); // NOTE(review): _mm_min_epi32 is an SSE4.1 intrinsic - confirm this is only compiled when SSE4.1 is enabled
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_min_vector<uint32, P, tvec4> // specialization for 4-component unsigned 32-bit integer vectors
+	{
+		GLM_FUNC_QUALIFIER static tvec4<uint32, P> call(tvec4<uint32, P> const & v1, tvec4<uint32, P> const & v2)
+		{
+			tvec4<uint32, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_min_epu32(v1.data, v2.data); // NOTE(review): _mm_min_epu32 is an SSE4.1 intrinsic - confirm this is only compiled when SSE4.1 is enabled
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_max_vector<float, P, tvec4> // specialization for 4-component float vectors using an SSE intrinsic
+	{
+		GLM_FUNC_QUALIFIER static tvec4<float, P> call(tvec4<float, P> const & v1, tvec4<float, P> const & v2)
+		{
+			tvec4<float, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_max_ps(v1.data, v2.data); // packed single-precision maximum across the four lanes
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_max_vector<int32, P, tvec4> // specialization for 4-component signed 32-bit integer vectors
+	{
+		GLM_FUNC_QUALIFIER static tvec4<int32, P> call(tvec4<int32, P> const & v1, tvec4<int32, P> const & v2)
+		{
+			tvec4<int32, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_max_epi32(v1.data, v2.data); // NOTE(review): _mm_max_epi32 is an SSE4.1 intrinsic - confirm this is only compiled when SSE4.1 is enabled
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_max_vector<uint32, P, tvec4> // specialization for 4-component unsigned 32-bit integer vectors
+	{
+		GLM_FUNC_QUALIFIER static tvec4<uint32, P> call(tvec4<uint32, P> const & v1, tvec4<uint32, P> const & v2)
+		{
+			tvec4<uint32, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_max_epu32(v1.data, v2.data); // NOTE(review): _mm_max_epu32 is an SSE4.1 intrinsic - confirm this is only compiled when SSE4.1 is enabled
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_clamp_vector<float, P, tvec4> // specialization for 4-component float vectors using SSE intrinsics
+	{
+		GLM_FUNC_QUALIFIER static tvec4<float, P> call(tvec4<float, P> const & x, tvec4<float, P> const & minVal, tvec4<float, P> const & maxVal)
+		{
+			tvec4<float, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_min_ps(_mm_max_ps(x.data, minVal.data), maxVal.data); // max against minVal first, then min against maxVal
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_clamp_vector<int32, P, tvec4> // specialization for 4-component signed 32-bit integer vectors
+	{
+		GLM_FUNC_QUALIFIER static tvec4<int32, P> call(tvec4<int32, P> const & x, tvec4<int32, P> const & minVal, tvec4<int32, P> const & maxVal)
+		{
+			tvec4<int32, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_min_epi32(_mm_max_epi32(x.data, minVal.data), maxVal.data); // max then min; NOTE(review): both intrinsics are SSE4.1 - confirm the build guards this
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_clamp_vector<uint32, P, tvec4> // specialization for 4-component unsigned 32-bit integer vectors
+	{
+		GLM_FUNC_QUALIFIER static tvec4<uint32, P> call(tvec4<uint32, P> const & x, tvec4<uint32, P> const & minVal, tvec4<uint32, P> const & maxVal)
+		{
+			tvec4<uint32, P> result(uninitialize); // built with the uninitialize tag; data is assigned on the next line
+			result.data = _mm_min_epu32(_mm_max_epu32(x.data, minVal.data), maxVal.data); // max then min; NOTE(review): both intrinsics are SSE4.1 - confirm the build guards this
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_mix_vector<float, bool, P, tvec4> // per-lane select between x and y driven by a boolean vector
+	{
+		GLM_FUNC_QUALIFIER static tvec4<float, P> call(tvec4<float, P> const & x, tvec4<float, P> const & y, tvec4<bool, P> const & a)
+		{
+			__m128i const Load = _mm_set_epi32(-(int)a.w, -(int)a.z, -(int)a.y, -(int)a.x); // -(int)true == -1, so a true lane becomes all ones and a false lane stays zero
+			__m128 const Mask = _mm_castsi128_ps(Load);
+
+			tvec4<float, P> Result(uninitialize);
+#			if 0 && GLM_ARCH & GLM_ARCH_AVX // the leading 0 makes this condition always false, so the blendv branch is never compiled
+				Result.data = _mm_blendv_ps(x.data, y.data, Mask);
+#			else
+				Result.data = _mm_or_ps(_mm_and_ps(Mask, y.data), _mm_andnot_ps(Mask, x.data)); // (Mask & y) | (~Mask & x): y where the lane is set, x elsewhere
+#			endif
+			return Result;
+		}
+	};
+
+
 }//namespace detail
 }//namespace glm