// cpu_neon_fp16.cpp (1.5 KB)
  1. #include <stdio.h>
  2. #if (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) /* || (defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM64EC)) */
  3. // Windows + ARM64 case disabled: https://github.com/opencv/opencv/issues/25052
  4. #include "arm_neon.h"
  5. float16x8_t vld1q_as_f16(const float* src)
  6. {
  7. float32x4_t s0 = vld1q_f32(src), s1 = vld1q_f32(src + 4);
  8. return vcombine_f16(vcvt_f16_f32(s0), vcvt_f16_f32(s1));
  9. }
  10. void vprintreg(const char* name, const float16x8_t& r)
  11. {
  12. float data[8];
  13. vst1q_f32(data, vcvt_f32_f16(vget_low_f16(r)));
  14. vst1q_f32(data + 4, vcvt_f32_f16(vget_high_f16(r)));
  15. printf("%s: (%.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f)\n",
  16. name, data[0], data[1], data[2], data[3],
  17. data[4], data[5], data[6], data[7]);
  18. }
  19. void test()
  20. {
  21. const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
  22. const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f };
  23. float16x8_t s1 = vld1q_as_f16(src1), s2 = vld1q_as_f16(src2);
  24. float16x8_t d = vsubq_f16(s1, s1);
  25. d = vfmaq_laneq_f16(d, s1, s2, 0);
  26. d = vfmaq_laneq_f16(d, s1, s2, 1);
  27. d = vfmaq_laneq_f16(d, s1, s2, 2);
  28. d = vfmaq_laneq_f16(d, s1, s2, 3);
  29. d = vfmaq_laneq_f16(d, s1, s2, 4);
  30. d = vfmaq_laneq_f16(d, s1, s2, 5);
  31. d = vfmaq_laneq_f16(d, s1, s2, 6);
  32. d = vfmaq_laneq_f16(d, s1, s2, 7);
  33. vprintreg("s1*s2[0]+s1*s2[1] + ... + s1*s2[7]", d);
  34. }
  35. #else
  36. #error "NEON FP16 is not supported"
  37. #endif
  38. int main()
  39. {
  40. test();
  41. return 0;
  42. }