|
|
@ -577,7 +577,20 @@ namespace detail { |
|
|
|
static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) |
|
|
|
static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) |
|
|
|
{ |
|
|
|
{ |
|
|
|
vec<4, float, Q> Result; |
|
|
|
vec<4, float, Q> Result; |
|
|
|
|
|
|
|
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT |
|
|
|
Result.data = vdivq_f32(a.data, b.data); |
|
|
|
Result.data = vdivq_f32(a.data, b.data); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
/* Arm assembler reference: |
|
|
|
|
|
|
|
* |
|
|
|
|
|
|
|
* The Newton-Raphson iteration: x[n+1] = x[n] * (2 - d * x[n]) |
|
|
|
|
|
|
|
* converges to (1/d) if x0 is the result of VRECPE applied to d. |
|
|
|
|
|
|
|
* |
|
|
|
|
|
|
|
* Note: The precision usually improves with two iterations, but more than two iterations are not helpful. */ |
|
|
|
|
|
|
|
float32x4_t x = vrecpeq_f32(b.data); |
|
|
|
|
|
|
|
x = vmulq_f32(vrecpsq_f32(b.data, x), x); |
|
|
|
|
|
|
|
x = vmulq_f32(vrecpsq_f32(b.data, x), x); |
|
|
|
|
|
|
|
Result.data = vmulq_f32(a.data, x); |
|
|
|
|
|
|
|
#endif |
|
|
|
return Result; |
|
|
|
return Result; |
|
|
|
} |
|
|
|
} |
|
|
|
}; |
|
|
|
}; |
|
|
|