Merge pull request #972 from amaury-ml/mat4/inverse
glm-aarch64: Add neon's mat4 inverse #972master
commit
8a34283c87
3 changed files with 285 additions and 12 deletions
@ -0,0 +1,155 @@ |
||||
/// @ref simd_neon
|
||||
/// @file glm/simd/neon.h
|
||||
|
||||
#pragma once |
||||
|
||||
#if GLM_ARCH & GLM_ARCH_NEON_BIT |
||||
#include <arm_neon.h> |
||||
|
||||
namespace glm { |
||||
namespace neon { |
||||
static float32x4_t dupq_lane(float32x4_t vsrc, int lane) { |
||||
switch(lane) { |
||||
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT |
||||
case 0: return vdupq_laneq_f32(vsrc, 0); |
||||
case 1: return vdupq_laneq_f32(vsrc, 1); |
||||
case 2: return vdupq_laneq_f32(vsrc, 2); |
||||
case 3: return vdupq_laneq_f32(vsrc, 3); |
||||
#else |
||||
case 0: return vdupq_n_f32(vgetq_lane_f32(vsrc, 0)); |
||||
case 1: return vdupq_n_f32(vgetq_lane_f32(vsrc, 1)); |
||||
case 2: return vdupq_n_f32(vgetq_lane_f32(vsrc, 2)); |
||||
case 3: return vdupq_n_f32(vgetq_lane_f32(vsrc, 3)); |
||||
#endif |
||||
} |
||||
assert(!"Unreachable code executed!"); |
||||
return vdupq_n_f32(0.0f); |
||||
} |
||||
|
||||
static float32x2_t dup_lane(float32x4_t vsrc, int lane) { |
||||
switch(lane) { |
||||
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT |
||||
case 0: return vdup_laneq_f32(vsrc, 0); |
||||
case 1: return vdup_laneq_f32(vsrc, 1); |
||||
case 2: return vdup_laneq_f32(vsrc, 2); |
||||
case 3: return vdup_laneq_f32(vsrc, 3); |
||||
#else |
||||
case 0: return vdup_n_f32(vgetq_lane_f32(vsrc, 0)); |
||||
case 1: return vdup_n_f32(vgetq_lane_f32(vsrc, 1)); |
||||
case 2: return vdup_n_f32(vgetq_lane_f32(vsrc, 2)); |
||||
case 3: return vdup_n_f32(vgetq_lane_f32(vsrc, 3)); |
||||
#endif |
||||
} |
||||
assert(!"Unreachable code executed!"); |
||||
return vdup_n_f32(0.0f); |
||||
} |
||||
|
||||
static float32x4_t copy_lane(float32x4_t vdst, int dlane, float32x4_t vsrc, int slane) { |
||||
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT |
||||
switch(dlane) { |
||||
case 0: |
||||
switch(slane) { |
||||
case 0: return vcopyq_laneq_f32(vdst, 0, vsrc, 0); |
||||
case 1: return vcopyq_laneq_f32(vdst, 0, vsrc, 1); |
||||
case 2: return vcopyq_laneq_f32(vdst, 0, vsrc, 2); |
||||
case 3: return vcopyq_laneq_f32(vdst, 0, vsrc, 3); |
||||
} |
||||
assert(!"Unreachable code executed!"); |
||||
case 1: |
||||
switch(slane) { |
||||
case 0: return vcopyq_laneq_f32(vdst, 1, vsrc, 0); |
||||
case 1: return vcopyq_laneq_f32(vdst, 1, vsrc, 1); |
||||
case 2: return vcopyq_laneq_f32(vdst, 1, vsrc, 2); |
||||
case 3: return vcopyq_laneq_f32(vdst, 1, vsrc, 3); |
||||
} |
||||
assert(!"Unreachable code executed!"); |
||||
case 2: |
||||
switch(slane) { |
||||
case 0: return vcopyq_laneq_f32(vdst, 2, vsrc, 0); |
||||
case 1: return vcopyq_laneq_f32(vdst, 2, vsrc, 1); |
||||
case 2: return vcopyq_laneq_f32(vdst, 2, vsrc, 2); |
||||
case 3: return vcopyq_laneq_f32(vdst, 2, vsrc, 3); |
||||
} |
||||
assert(!"Unreachable code executed!"); |
||||
case 3: |
||||
switch(slane) { |
||||
case 0: return vcopyq_laneq_f32(vdst, 3, vsrc, 0); |
||||
case 1: return vcopyq_laneq_f32(vdst, 3, vsrc, 1); |
||||
case 2: return vcopyq_laneq_f32(vdst, 3, vsrc, 2); |
||||
case 3: return vcopyq_laneq_f32(vdst, 3, vsrc, 3); |
||||
} |
||||
assert(!"Unreachable code executed!"); |
||||
} |
||||
#else |
||||
|
||||
float l; |
||||
switch(slane) { |
||||
case 0: l = vgetq_lane_f32(vsrc, 0); break; |
||||
case 1: l = vgetq_lane_f32(vsrc, 1); break; |
||||
case 2: l = vgetq_lane_f32(vsrc, 2); break; |
||||
case 3: l = vgetq_lane_f32(vsrc, 3); break; |
||||
default:
|
||||
assert(!"Unreachable code executed!"); |
||||
} |
||||
switch(dlane) { |
||||
case 0: return vsetq_lane_f32(l, vdst, 0); |
||||
case 1: return vsetq_lane_f32(l, vdst, 1); |
||||
case 2: return vsetq_lane_f32(l, vdst, 2); |
||||
case 3: return vsetq_lane_f32(l, vdst, 3); |
||||
} |
||||
#endif |
||||
assert(!"Unreachable code executed!"); |
||||
return vdupq_n_f32(0.0f); |
||||
} |
||||
|
||||
static float32x4_t mul_lane(float32x4_t v, float32x4_t vlane, int lane) { |
||||
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT |
||||
switch(lane) {
|
||||
case 0: return vmulq_laneq_f32(v, vlane, 0); break; |
||||
case 1: return vmulq_laneq_f32(v, vlane, 1); break; |
||||
case 2: return vmulq_laneq_f32(v, vlane, 2); break; |
||||
case 3: return vmulq_laneq_f32(v, vlane, 3); break; |
||||
default:
|
||||
assert(!"Unreachable code executed!"); |
||||
} |
||||
assert(!"Unreachable code executed!"); |
||||
return vdupq_n_f32(0.0f); |
||||
#else |
||||
return vmulq_f32(v, dupq_lane(vlane, lane)); |
||||
#endif |
||||
} |
||||
|
||||
static float32x4_t madd_lane(float32x4_t acc, float32x4_t v, float32x4_t vlane, int lane) { |
||||
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT |
||||
#ifdef GLM_CONFIG_FORCE_FMA |
||||
# define FMADD_LANE(acc, x, y, L) do { asm volatile ("fmla %0.4s, %1.4s, %2.4s" : "+w"(acc) : "w"(x), "w"(dup_lane(y, L))); } while(0) |
||||
#else |
||||
# define FMADD_LANE(acc, x, y, L) do { acc = vmlaq_laneq_f32(acc, x, y, L); } while(0) |
||||
#endif |
||||
|
||||
switch(lane) {
|
||||
case 0:
|
||||
FMADD_LANE(acc, v, vlane, 0); |
||||
return acc; |
||||
case 1: |
||||
FMADD_LANE(acc, v, vlane, 1); |
||||
return acc; |
||||
case 2: |
||||
FMADD_LANE(acc, v, vlane, 2); |
||||
return acc; |
||||
case 3: |
||||
FMADD_LANE(acc, v, vlane, 3); |
||||
return acc; |
||||
default:
|
||||
assert(!"Unreachable code executed!"); |
||||
} |
||||
assert(!"Unreachable code executed!"); |
||||
return vdupq_n_f32(0.0f); |
||||
# undef FMADD_LANE |
||||
#else |
||||
return vaddq_f32(acc, vmulq_f32(v, dupq_lane(vlane, lane))); |
||||
#endif |
||||
} |
||||
} //namespace neon
|
||||
} // namespace glm
|
||||
#endif // GLM_ARCH & GLM_ARCH_NEON_BIT
|
Loading…
Reference in New Issue