60#ifndef SSE2NEON_PRECISE_MINMAX
61#define SSE2NEON_PRECISE_MINMAX (0)
64#ifndef SSE2NEON_PRECISE_DIV
65#define SSE2NEON_PRECISE_DIV (0)
68#ifndef SSE2NEON_PRECISE_SQRT
69#define SSE2NEON_PRECISE_SQRT (0)
72#ifndef SSE2NEON_PRECISE_DP
73#define SSE2NEON_PRECISE_DP (0)
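// The SSE2NEON_PRECISE_* switches above trade a little speed for results that
// track x86 more closely (x86-style NaN handling for min/max, extra
// Newton-Raphson refinement for division and square root, full-precision dot
// products); all of them default to off.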
79#ifndef SSE2NEON_INCLUDE_WINDOWS_H
80#define SSE2NEON_INCLUDE_WINDOWS_H (0)
84#if defined(__GNUC__) || defined(__clang__)
85#pragma push_macro("FORCE_INLINE")
86#pragma push_macro("ALIGN_STRUCT")
87#define FORCE_INLINE static inline __attribute__((always_inline))
88#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
89#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
90#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
91#elif defined(_MSC_VER)
93#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
96#define FORCE_INLINE static inline
99#define ALIGN_STRUCT(x) __declspec(align(x))
101#define _sse2neon_likely(x) (x)
102#define _sse2neon_unlikely(x) (x)
104#pragma message("Macro name collisions may happen with unsupported compilers.")
109#define _sse2neon_const static const
111#define _sse2neon_const const
121#define SSE2NEON_ALLOC_DEFINED
127#if SSE2NEON_INCLUDE_WINDOWS_H
128#include <processthreadsapi.h>
132#if !defined(__cplusplus)
133#error sse2neon only supports C++ compilation with this compiler
136#ifdef SSE2NEON_ALLOC_DEFINED
140#if (defined(_M_AMD64) || defined(__x86_64__)) || \
141 (defined(_M_ARM64) || defined(__arm64__))
142#define SSE2NEON_HAS_BITSCAN64
146#if defined(__GNUC__) || defined(__clang__)
147#define _sse2neon_define0(type, s, body) \
152#define _sse2neon_define1(type, s, body) \
157#define _sse2neon_define2(type, a, b, body) \
159 type _a = (a), _b = (b); \
162#define _sse2neon_return(ret) (ret)
164#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
165#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
166#define _sse2neon_define2(type, a, b, body) \
167 [](type _a, type _b) { body }((a), (b))
168#define _sse2neon_return(ret) return ret
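// _sse2neon_define0/1/2 wrap a multi-statement body so it can be used where a
// single expression is expected: GCC/Clang get a statement-expression form,
// other compilers get an immediately invoked lambda, and _sse2neon_return
// yields the result either way.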
171#define _sse2neon_init(...) \
178#define SSE2NEON_BARRIER() _ReadWriteBarrier()
180#define SSE2NEON_BARRIER() \
182 __asm__ __volatile__("" ::: "memory"); \
192#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
193#include <stdatomic.h>
199#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
200 !defined(__STDC_NO_ATOMICS__)
201 atomic_thread_fence(memory_order_seq_cst);
202#elif defined(__GNUC__) || defined(__clang__)
203 __atomic_thread_fence(__ATOMIC_SEQ_CST);
205 __dmb(_ARM64_BARRIER_ISH);
212#if defined(__arm__) && __ARM_ARCH == 7
217#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
218#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
220#if !defined(__clang__)
221#pragma GCC push_options
222#pragma GCC target("fpu=neon")
224#elif defined(__aarch64__) || defined(_M_ARM64)
225#if !defined(__clang__) && !defined(_MSC_VER)
226#pragma GCC push_options
227#pragma GCC target("+simd")
230#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
232 "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
234#if !defined(__clang__) && !defined(_MSC_VER)
235#pragma GCC push_options
238#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
243#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
244#if defined __has_include && __has_include(<arm_acle.h>)
254#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
255#define SSE2NEON_CACHELINE_SIZE 128
257#define SSE2NEON_CACHELINE_SIZE 64
261#if !defined(__aarch64__) && !defined(_M_ARM64)
270#if (!defined(__aarch64__) && !defined(_M_ARM64))
279#if defined(__GNUC__) && (__GNUC__ <= 9)
280#define __has_builtin(x) HAS##x
281#define HAS__builtin_popcount 1
282#define HAS__builtin_popcountll 1
285#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
286#define HAS__builtin_shuffle 1
288#define HAS__builtin_shuffle 0
291#define HAS__builtin_shufflevector 0
292#define HAS__builtin_nontemporal_store 0
294#define __has_builtin(x) 0
306#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
307 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
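// _MM_SHUFFLE packs four 2-bit lane selectors into one immediate byte;
// _MM_SHUFFLE(3, 2, 1, 0) selects every lane from its own position, i.e. the
// identity permutation.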
309#if __has_builtin(__builtin_shufflevector)
310#define _sse2neon_shuffle(type, a, b, ...) \
311 __builtin_shufflevector(a, b, __VA_ARGS__)
312#elif __has_builtin(__builtin_shuffle)
313#define _sse2neon_shuffle(type, a, b, ...) \
315 type tmp = {__VA_ARGS__}; \
316 __builtin_shuffle(a, b, tmp); \
320#ifdef _sse2neon_shuffle
321#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
322#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
323#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
324#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
325#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
326#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
330#define _MM_FROUND_TO_NEAREST_INT 0x00
331#define _MM_FROUND_TO_NEG_INF 0x01
332#define _MM_FROUND_TO_POS_INF 0x02
333#define _MM_FROUND_TO_ZERO 0x03
334#define _MM_FROUND_CUR_DIRECTION 0x04
335#define _MM_FROUND_NO_EXC 0x08
336#define _MM_FROUND_RAISE_EXC 0x00
337#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
338#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
339#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
340#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
341#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
342#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
343#define _MM_ROUND_NEAREST 0x0000
344#define _MM_ROUND_DOWN 0x2000
345#define _MM_ROUND_UP 0x4000
346#define _MM_ROUND_TOWARD_ZERO 0x6000
348#define _MM_FLUSH_ZERO_MASK 0x8000
349#define _MM_FLUSH_ZERO_ON 0x8000
350#define _MM_FLUSH_ZERO_OFF 0x0000
352#define _MM_DENORMALS_ZERO_MASK 0x0040
353#define _MM_DENORMALS_ZERO_ON 0x0040
354#define _MM_DENORMALS_ZERO_OFF 0x0000
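// The rounding, flush-to-zero and denormals-are-zero constants above mirror
// the x86 MXCSR bit layout; the _MM_GET/SET_* helpers later in this file
// translate them to the corresponding AArch64 FPCR (ARMv7 FPSCR) control bits.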
357#define __constrange(a, b) const
370#if defined(__aarch64__) || defined(_M_ARM64)
379#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
380#if (defined(__x86_64__) || defined(__i386__))
381#define __int64 long long
383#define __int64 int64_t
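// Vector type mapping: __m128 is backed by float32x4_t, __m128i by int64x2_t,
// and __m128d by float64x2_t on AArch64 (float32x4_t storage on ARMv7, which
// lacks double-precision NEON). The vreinterpretq_*_m128* helpers below are
// zero-cost bitwise reinterprets between those views.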
389#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
390#define vreinterpretq_m128_f32(x) (x)
391#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
393#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
394#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
395#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
396#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
398#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
399#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
400#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
401#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
403#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
404#define vreinterpretq_f32_m128(x) (x)
405#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
407#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
408#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
409#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
410#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
412#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
413#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
414#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
415#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
417#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
418#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
419#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
420#define vreinterpretq_m128i_s64(x) (x)
422#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
423#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
424#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
425#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
427#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
428#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
430#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
431#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
432#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
433#define vreinterpretq_s64_m128i(x) (x)
435#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
436#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
437#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
438#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
440#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
441#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
442#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
443#define vreinterpret_m64_s64(x) (x)
445#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
446#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
447#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
448#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
450#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
451#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
452#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
454#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
455#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
456#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
457#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
459#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
460#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
461#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
462#define vreinterpret_s64_m64(x) (x)
464#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
466#if defined(__aarch64__) || defined(_M_ARM64)
467#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
468#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
470#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
472#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
473#define vreinterpretq_m128d_f64(x) (x)
475#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
477#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
478#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
480#define vreinterpretq_f64_m128d(x) (x)
481#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
483#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
484#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
486#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
487#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
489#define vreinterpretq_m128d_f32(x) (x)
491#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
493#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
494#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
496#define vreinterpretq_f32_m128d(x) (x)
529 uint16_t m128_u16[8];
530 uint32_t m128_u32[4];
531 uint64_t m128_u64[2];
535#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
536#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
537#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
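// SIMDVec overlays byte/word/dword/qword arrays on a 128-bit vector so the
// vreinterpretq_nth_* accessors above can read or write a single lane from
// scalar code.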
540#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
541#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
542#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
543#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
577#if defined(__GNUC__) && !defined(__clang__) && \
578 ((__GNUC__ <= 13 && defined(__arm__)) || \
579 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
580 (__GNUC__ <= 9 && defined(__aarch64__)))
584 ret.val[0] = vld1q_u8(p + 0);
585 ret.val[1] = vld1q_u8(p + 16);
586 ret.val[2] = vld1q_u8(p + 32);
587 ret.val[3] = vld1q_u8(p + 48);
594 return vld1q_u8_x4(p);
598#if !defined(__aarch64__) && !defined(_M_ARM64)
602 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
603 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
613#if !defined(__aarch64__) && !defined(_M_ARM64)
617 uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
619 for (int i = 0; i < 8; ++i)
631#if !defined(__aarch64__) && !defined(_M_ARM64)
635 uint32x4_t m = vpaddlq_u16(a);
636 uint64x2_t n = vpaddlq_u32(m);
637 uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
639 return vget_lane_u32((uint32x2_t) o, 0);
645 return vaddvq_u16(a);
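// On AArch64 these horizontal byte/halfword sums are single vaddv/vaddvq
// instructions; the ARMv7 fallbacks above build the same reduction out of
// pairwise-add chains.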
701#if defined(__aarch64__) || defined(_M_ARM64)
727 float32x2_t a21 = vget_high_f32(
729 float32x2_t b03 = vget_low_f32(
736 float32x2_t a03 = vget_low_f32(
738 float32x2_t b21 = vget_high_f32(
801 float32x2_t a02 = vset_lane_f32(a0, a22, 1);
819 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
826 float32_t b2 = vgetq_lane_f32(b, 2);
828 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
835 float32_t b2 = vgetq_lane_f32(b, 2);
837 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
845#if (defined(_M_ARM64) && !defined(__clang__)) || \
846 (defined(__ARM_FEATURE_CRYPTO) && \
847 (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
851 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
852 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
854 __n64 a1 = {a}, b1 = {b};
855 return vreinterpretq_u64_p128(vmull_p64(a1, b1));
857 return vreinterpretq_u64_p128(vmull_p64(a, b));
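// Without the Crypto extension, the 64x64 -> 128-bit carry-less multiply below
// is synthesized from 8-bit polynomial multiplies (vmull_p8): shifted partial
// products are XOR-folded together, and the k48_32/k16_00 masks clear the
// overlapping bytes before the pieces are recombined.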
876 poly8x8_t a = vreinterpret_p8_u64(_a);
877 poly8x8_t b = vreinterpret_p8_u64(_b);
880 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
881 vcreate_u8(0x00000000ffffffff));
882 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
883 vcreate_u8(0x0000000000000000));
886 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));
888 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));
890 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));
892 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));
894 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));
896 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));
898 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));
900 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));
903 uint8x16_t l = veorq_u8(e, f);
904 uint8x16_t m = veorq_u8(g, h);
905 uint8x16_t n = veorq_u8(i, j);
909#if defined(__aarch64__)
910 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
911 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
912 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
913 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
914 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
915 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
916 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
917 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
919 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
920 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
921 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
922 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
926 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
927 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
928 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
932 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
933 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
934 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
937#if defined(__aarch64__)
938 uint8x16_t t0 = vreinterpretq_u8_u64(
939 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
940 uint8x16_t t1 = vreinterpretq_u8_u64(
941 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
942 uint8x16_t t2 = vreinterpretq_u8_u64(
943 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
944 uint8x16_t t3 = vreinterpretq_u8_u64(
945 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
947 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
948 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
949 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
950 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
953 uint8x16_t t0_shift = vextq_u8(t0, t0, 15);
954 uint8x16_t t1_shift = vextq_u8(t1, t1, 14);
955 uint8x16_t t2_shift = vextq_u8(t2, t2, 13);
956 uint8x16_t t3_shift = vextq_u8(t3, t3, 12);
959 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
960 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
961 uint8x16_t mix = veorq_u8(d, cross1);
962 uint8x16_t r = veorq_u8(mix, cross2);
963 return vreinterpretq_u64_u8(r);
975#define _mm_shuffle_epi32_default(a, imm) \
976 vreinterpretq_m128i_s32(vsetq_lane_s32( \
977 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
979 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
980 vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \
981 ((imm) >> 2) & 0x3), \
982 vmovq_n_s32(vgetq_lane_s32( \
983 vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
1070#if defined(__aarch64__) || defined(_M_ARM64)
1071#define _mm_shuffle_epi32_splat(a, imm) \
1072 vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
1074#define _mm_shuffle_epi32_splat(a, imm) \
1075 vreinterpretq_m128i_s32( \
1076 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
1093#define _mm_shuffle_ps_default(a, b, imm) \
1094 vreinterpretq_m128_f32(vsetq_lane_f32( \
1095 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1097 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1099 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1101 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
1110#define _mm_shufflelo_epi16_function(a, imm) \
1111 _sse2neon_define1( \
1112 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1113 int16x4_t lowBits = vget_low_s16(ret); \
1114 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
1115 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1117 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1119 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1121 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1127#define _mm_shufflehi_epi16_function(a, imm) \
1128 _sse2neon_define1( \
1129 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1130 int16x4_t highBits = vget_high_s16(ret); \
1131 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1132 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1134 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1136 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1138 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1163 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1442 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1452 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1462 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1472 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1482 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1510#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1511 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1535#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1536 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1540 float32_t data = vgetq_lane_f32(
1542 return (int32_t) data;
1601#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1637#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1659#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1666#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1667 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1670 float32_t data = vgetq_lane_f32(
1672 return (int64_t) data;
1696#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1701#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1719#if defined(__aarch64__) || defined(_M_ARM64)
1749#define _mm_extract_pi16(a, imm) \
1750 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1754#if !defined(SSE2NEON_ALLOC_DEFINED)
1764#if defined(_MSC_VER)
1765 value = _ReadStatusReg(ARM64_FPCR);
1767 __asm__ __volatile__("mrs %0, FPCR" : "=r"(value));
1774#if defined(_MSC_VER)
1775 _WriteStatusReg(ARM64_FPCR, value);
1777 __asm__ __volatile__("msr FPCR, %0" :: "r"(value));
1789#if defined(__aarch64__) || defined(_M_ARM64)
1796#if defined(__aarch64__) || defined(_M_ARM64)
1799 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
1813#if defined(__aarch64__) || defined(_M_ARM64)
1820#if defined(__aarch64__) || defined(_M_ARM64)
1823 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
1826 if (r.field.bit22) {
1836#define _mm_insert_pi16(a, b, imm) \
1837 vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
1857#define _mm_load_ps1 _mm_load1_ps
1883 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1893 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1902 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1922 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1930 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1937#if !defined(SSE2NEON_ALLOC_DEFINED)
1942 return malloc(size);
1943 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1944 align = sizeof(void *);
1945 if (!posix_memalign(&ptr, align, size))
1962 vst1_s8((int8_t *) mem_addr, masked);
1969#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1987#if SSE2NEON_PRECISE_MINMAX
2014 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2035#if SSE2NEON_PRECISE_MINMAX
2062 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2084#if defined(__aarch64__) || defined(_M_ARM64)
2111#if defined(__aarch64__) || defined(_M_ARM64)
2112 static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
2113 uint8x8_t tmp = vshr_n_u8(input, 7);
2114 return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
2117 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2118 uint32x2_t paired16 =
2119 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2120 uint8x8_t paired32 =
2121 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2122 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
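// Gathering the sign bits with shift-and-accumulate (vsra) steps reproduces
// x86 movemask behavior without needing a horizontal reduction instruction.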
2132#if defined(__aarch64__) || defined(_M_ARM64)
2133 static const int32_t shift[4] = {0, 1, 2, 3};
2134 uint32x4_t tmp = vshrq_n_u32(input, 31);
2135 return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
2140 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2143 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2145 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2189#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2194#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2199#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2204#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2209#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2214#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2219#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2224#define _m_pminub(a, b) _mm_min_pu8(a, b)
2229#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2235#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2243#if defined(_MSC_VER)
2261 __builtin_prefetch(p, 0, 0);
2264 __builtin_prefetch(p, 0, 3);
2267 __builtin_prefetch(p, 0, 2);
2270 __builtin_prefetch(p, 0, 1);
2281#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2286#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2319 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2320 const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
2321 const uint32x4_t has_pos_zero =
2322 vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
2323 const uint32x4_t has_neg_zero =
2324 vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
2331 out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
2332 out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
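// The ±infinity fix-up above keeps _mm_rsqrt_ps(±0.0f) at ±inf, matching x86;
// the optional Newton-Raphson refinement would otherwise turn those lanes
// into NaN.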
2344 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2354 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2357 vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2370#if defined(__aarch64__) || defined(_M_ARM64)
2377#if defined(__aarch64__) || defined(_M_ARM64)
2380 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
2385#if defined(__aarch64__) || defined(_M_ARM64)
2388 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
2418#if defined(__aarch64__) || defined(_M_ARM64)
2425#if defined(__aarch64__) || defined(_M_ARM64)
2428 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
2449#if defined(__aarch64__) || defined(_M_ARM64)
2452 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
2508#ifdef _sse2neon_shuffle
2509#define _mm_shuffle_pi16(a, imm) \
2510 vreinterpret_m64_s16(vshuffle_s16( \
2511 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2512 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
2514#define _mm_shuffle_pi16(a, imm) \
2515 _sse2neon_define1( \
2516 __m64, a, int16x4_t ret; \
2518 vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
2519 ret = vset_lane_s16( \
2520 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
2522 ret = vset_lane_s16( \
2523 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
2525 ret = vset_lane_s16( \
2526 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
2528 _sse2neon_return(vreinterpret_m64_s16(ret));)
2564#ifdef _sse2neon_shuffle
2565#define _mm_shuffle_ps(a, b, imm) \
2567 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2568 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2569 float32x4_t _shuf = \
2570 vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2571 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2572 vreinterpretq_m128_f32(_shuf); \
2575#define _mm_shuffle_ps(a, b, imm) \
2576 _sse2neon_define2( \
2577 __m128, a, b, __m128 ret; switch (imm) { \
2578 case _MM_SHUFFLE(1, 0, 3, 2): \
2579 ret = _mm_shuffle_ps_1032(_a, _b); \
2581 case _MM_SHUFFLE(2, 3, 0, 1): \
2582 ret = _mm_shuffle_ps_2301(_a, _b); \
2584 case _MM_SHUFFLE(0, 3, 2, 1): \
2585 ret = _mm_shuffle_ps_0321(_a, _b); \
2587 case _MM_SHUFFLE(2, 1, 0, 3): \
2588 ret = _mm_shuffle_ps_2103(_a, _b); \
2590 case _MM_SHUFFLE(1, 0, 1, 0): \
2591 ret = _mm_movelh_ps(_a, _b); \
2593 case _MM_SHUFFLE(1, 0, 0, 1): \
2594 ret = _mm_shuffle_ps_1001(_a, _b); \
2596 case _MM_SHUFFLE(0, 1, 0, 1): \
2597 ret = _mm_shuffle_ps_0101(_a, _b); \
2599 case _MM_SHUFFLE(3, 2, 1, 0): \
2600 ret = _mm_shuffle_ps_3210(_a, _b); \
2602 case _MM_SHUFFLE(0, 0, 1, 1): \
2603 ret = _mm_shuffle_ps_0011(_a, _b); \
2605 case _MM_SHUFFLE(0, 0, 2, 2): \
2606 ret = _mm_shuffle_ps_0022(_a, _b); \
2608 case _MM_SHUFFLE(2, 2, 0, 0): \
2609 ret = _mm_shuffle_ps_2200(_a, _b); \
2611 case _MM_SHUFFLE(3, 2, 0, 2): \
2612 ret = _mm_shuffle_ps_3202(_a, _b); \
2614 case _MM_SHUFFLE(3, 2, 3, 2): \
2615 ret = _mm_movehl_ps(_b, _a); \
2617 case _MM_SHUFFLE(1, 1, 3, 3): \
2618 ret = _mm_shuffle_ps_1133(_a, _b); \
2620 case _MM_SHUFFLE(2, 0, 1, 0): \
2621 ret = _mm_shuffle_ps_2010(_a, _b); \
2623 case _MM_SHUFFLE(2, 0, 0, 1): \
2624 ret = _mm_shuffle_ps_2001(_a, _b); \
2626 case _MM_SHUFFLE(2, 0, 3, 2): \
2627 ret = _mm_shuffle_ps_2032(_a, _b); \
2630 ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
2632 } _sse2neon_return(ret);)
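// Common shuffle immediates get hand-tuned helper sequences above; any other
// immediate falls through to the generic lane-by-lane _mm_shuffle_ps_default.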
2643#if defined(__aarch64__) || defined(_M_ARM64)
2650 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2651 const uint32x4_t div_by_zero =
2652 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2653 recip = vreinterpretq_f32_u32(
2654 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2697 vst1q_f32(p, vdupq_n_f32(a0));
2712#define _mm_store1_ps _mm_store_ps1
2737 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2777#if __has_builtin(__builtin_nontemporal_store)
2778 __builtin_nontemporal_store(a, (float32x4_t *) p);
2808#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2810 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2811 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2812 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2813 vget_low_f32(ROW23.val[0])); \
2814 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2815 vget_low_f32(ROW23.val[1])); \
2816 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2817 vget_high_f32(ROW23.val[0])); \
2818 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2819 vget_high_f32(ROW23.val[1])); \
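// 4x4 transpose: vtrnq_f32 interleaves adjacent rows pairwise, then the
// vcombine_f32 calls regroup the low/high halves into the transposed rows.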
2824#define _mm_ucomieq_ss _mm_comieq_ss
2825#define _mm_ucomige_ss _mm_comige_ss
2826#define _mm_ucomigt_ss _mm_comigt_ss
2827#define _mm_ucomile_ss _mm_comile_ss
2828#define _mm_ucomilt_ss _mm_comilt_ss
2829#define _mm_ucomineq_ss _mm_comineq_ss
2835#if defined(__GNUC__) || defined(__clang__)
2836#pragma GCC diagnostic push
2837#pragma GCC diagnostic ignored "-Wuninitialized"
2840#if defined(_MSC_VER)
2844#if defined(__GNUC__) || defined(__clang__)
2845#pragma GCC diagnostic pop
2853#if defined(__GNUC__) || defined(__clang__)
2854#pragma GCC diagnostic push
2855#pragma GCC diagnostic ignored "-Wuninitialized"
2858#if defined(_MSC_VER)
2862#if defined(__GNUC__) || defined(__clang__)
2863#pragma GCC diagnostic pop
2872#if defined(__aarch64__) || defined(_M_ARM64)
2878 float32x2x2_t result = vzip_f32(a1, b1);
2888#if defined(__aarch64__) || defined(_M_ARM64)
2894 float32x2x2_t result = vzip_f32(a1, b1);
2947#if defined(__aarch64__) || defined(_M_ARM64)
2948 return vreinterpretq_m128d_f64(
2949 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2951 double *da = (double *) &a;
2952 double *db = (double *) &b;
2954 c[0] = da[0] + db[0];
2955 c[1] = da[1] + db[1];
2956 return vld1q_f32((float32_t *) c);
2966#if defined(__aarch64__) || defined(_M_ARM64)
2969 double *da = (double *) &a;
2970 double *db = (double *) &b;
2972 c[0] = da[0] + db[0];
2974 return vld1q_f32((float32_t *) c);
3081#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3086#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3125#if defined(__aarch64__) || defined(_M_ARM64)
3143#if defined(__APPLE__)
3144#include <libkern/OSCacheControl.h>
3154#if defined(__APPLE__)
3156#elif defined(__GNUC__) || defined(__clang__)
3157 uintptr_t ptr = (uintptr_t) p;
3158 __builtin___clear_cache((char *) ptr,
3160#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
3197#if defined(__aarch64__) || defined(_M_ARM64)
3199 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3204 uint32x4_t swapped = vrev64q_u32(cmp);
3223#if defined(__aarch64__) || defined(_M_ARM64)
3225 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3232 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3233 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3245#if defined(__aarch64__) || defined(_M_ARM64)
3253 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3292#if defined(__aarch64__) || defined(_M_ARM64)
3294 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3301 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3302 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3314#if defined(__aarch64__) || defined(_M_ARM64)
3322 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3334#if defined(__aarch64__) || defined(_M_ARM64)
3336 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3343 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3344 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3356#if defined(__aarch64__) || defined(_M_ARM64)
3364 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3406#if defined(__aarch64__) || defined(_M_ARM64)
3408 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3415 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3416 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3428#if defined(__aarch64__) || defined(_M_ARM64)
3435 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3447#if defined(__aarch64__) || defined(_M_ARM64)
3449 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3454 uint32x4_t swapped = vrev64q_u32(cmp);
3473#if defined(__aarch64__) || defined(_M_ARM64)
3475 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3476 vdupq_n_u64(UINT64_MAX)));
3484 !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3486 !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3506#if defined(__aarch64__) || defined(_M_ARM64)
3508 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3509 vdupq_n_u64(UINT64_MAX)));
3517 !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3519 !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3539#if defined(__aarch64__) || defined(_M_ARM64)
3541 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3542 vdupq_n_u64(UINT64_MAX)));
3550 !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3552 !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3572#if defined(__aarch64__) || defined(_M_ARM64)
3574 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3575 vdupq_n_u64(UINT64_MAX)));
3583 !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3585 !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3605#if defined(__aarch64__) || defined(_M_ARM64)
3607 uint64x2_t not_nan_a =
3608 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3609 uint64x2_t not_nan_b =
3610 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3618 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3619 (*(double *) &b0) == (*(double *) &b0))
3622 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3623 (*(double *) &b1) == (*(double *) &b1))
3637#if defined(__aarch64__) || defined(_M_ARM64)
3644 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3645 (*(double *) &b0) == (*(double *) &b0))
3659#if defined(__aarch64__) || defined(_M_ARM64)
3661 uint64x2_t not_nan_a =
3662 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3663 uint64x2_t not_nan_b =
3664 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3666 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3673 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3674 (*(double *) &b0) == (*(double *) &b0))
3677 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3678 (*(double *) &b1) == (*(double *) &b1))
3692#if defined(__aarch64__) || defined(_M_ARM64)
3699 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3700 (*(double *) &b0) == (*(double *) &b0))
3714#if defined(__aarch64__) || defined(_M_ARM64)
3715 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3720 return (*(double *) &a0 >= *(double *) &b0);
3729#if defined(__aarch64__) || defined(_M_ARM64)
3730 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3735 return (*(double *) &a0 > *(double *) &b0);
3744#if defined(__aarch64__) || defined(_M_ARM64)
3745 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3750 return (*(double *) &a0 <= *(double *) &b0);
3759#if defined(__aarch64__) || defined(_M_ARM64)
3760 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3765 return (*(double *) &a0 < *(double *) &b0);
3774#if defined(__aarch64__) || defined(_M_ARM64)
3775 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3777 uint32x4_t a_not_nan =
3779 uint32x4_t b_not_nan =
3781 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3784 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3785 vreinterpretq_u64_u32(a_eq_b));
3786 return vgetq_lane_u64(and_results, 0) & 0x1;
3803#if defined(__aarch64__) || defined(_M_ARM64)
3804 return vreinterpretq_m128d_f64(
3827#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
3828 float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
3829 int64x2_t integers = vcvtq_s64_f64(rounded);
3831 vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
3834 double d0 = ((double *) &rnd)[0];
3835 double d1 = ((double *) &rnd)[1];
3846 double d0 = ((double *) &rnd)[0];
3847 double d1 = ((double *) &rnd)[1];
3848 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3858#if defined(__aarch64__) || defined(_M_ARM64)
3859 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3862 float a0 = (float) ((double *) &a)[0];
3863 float a1 = (float) ((double *) &a)[1];
3873#if defined(__aarch64__) || defined(_M_ARM64)
3874 return vreinterpretq_m128d_f64(
3890#if defined(__ARM_FEATURE_FRINT)
3892#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
3893 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
3905 float *f = (float *) &a;
3908 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3911 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3913 int32x4_t r_trunc = vcvtq_s32_f32(
3915 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3916 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
3917 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3919 float32x4_t delta = vsubq_f32(
3921 vcvtq_f32_s32(r_trunc));
3922 uint32x4_t is_delta_half =
3923 vceqq_f32(delta, half);
3925 vbslq_s32(is_delta_half, r_even, r_normal));
3928 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3934 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3946#if defined(__aarch64__) || defined(_M_ARM64)
3947 return vreinterpretq_m128d_f64(
3960#if defined(__aarch64__) || defined(_M_ARM64)
3961 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
3963 return ((double *) &a)[0];
3972#if defined(__aarch64__) || defined(_M_ARM64)
3973 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3976 double ret = ((double *) &rnd)[0];
3977 return (int32_t) ret;
3986#if defined(__aarch64__) || defined(_M_ARM64)
3987 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3990 double ret = ((double *) &rnd)[0];
3991 return (int64_t) ret;
3998#define _mm_cvtsd_si64x _mm_cvtsd_si64
4007#if defined(__aarch64__) || defined(_M_ARM64)
4009 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4033#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4041#if defined(__aarch64__) || defined(_M_ARM64)
4042 return vreinterpretq_m128d_f64(
4043 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4045 double bf = (double) b;
4053#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4069#if defined(__aarch64__) || defined(_M_ARM64)
4070 return vreinterpretq_m128d_f64(
4071 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4073 double bf = (double) b;
4090#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4096#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4106#if defined(__aarch64__) || defined(_M_ARM64)
4107 return vreinterpretq_m128d_f64(
4108 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4120 double a0 = ((double *) &a)[0];
4121 double a1 = ((double *) &a)[1];
4130 double a0 = ((double *) &a)[0];
4131 double a1 = ((double *) &a)[1];
4132 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4149 double ret = *((double *) &a);
4150 return (int32_t) ret;
4158#if defined(__aarch64__) || defined(_M_ARM64)
4159 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4161 double ret = *((double *) &a);
4162 return (int64_t) ret;
4169#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4176#if defined(__aarch64__) || defined(_M_ARM64)
4177 return vreinterpretq_m128d_f64(
4178 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4180 double *da = (double *) &a;
4181 double *db = (double *) &b;
4183 c[0] = da[0] / db[0];
4184 c[1] = da[1] / db[1];
4185 return vld1q_f32((float32_t *) c);
4196#if defined(__aarch64__) || defined(_M_ARM64)
4198 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4199 return vreinterpretq_m128d_f64(
4200 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4210#define _mm_extract_epi16(a, imm) \
4211 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4218#define _mm_insert_epi16(a, b, imm) \
4219 vreinterpretq_m128i_s16( \
4220 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4228#if defined(__aarch64__) || defined(_M_ARM64)
4229 return vreinterpretq_m128d_f64(vld1q_f64(p));
4231 const float *fp = (const float *) p;
4232 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4240#define _mm_load_pd1 _mm_load1_pd
4248#if defined(__aarch64__) || defined(_M_ARM64)
4249 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4251 const float *fp = (const float *) p;
4270#if defined(__aarch64__) || defined(_M_ARM64)
4271 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4283#if defined(__aarch64__) || defined(_M_ARM64)
4284 return vreinterpretq_m128d_f64(
4285 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4300 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4309#if defined(__aarch64__) || defined(_M_ARM64)
4310 return vreinterpretq_m128d_f64(
4311 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4314 vcombine_f32(vld1_f32((const float *) p),
4325#if defined(__aarch64__) || defined(_M_ARM64)
4326 float64x2_t v = vld1q_f64(p);
4327 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4329 int64x2_t v = vld1q_s64((const int64_t *) p);
4354 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4365#if defined(__aarch64__) || defined(_M_ARM64)
4374 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4375 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4393 vst1q_s8((int8_t *) mem_addr, masked);
4419#if defined(__aarch64__) || defined(_M_ARM64)
4420#if SSE2NEON_PRECISE_MINMAX
4421 float64x2_t _a = vreinterpretq_f64_m128d(a);
4422 float64x2_t _b = vreinterpretq_f64_m128d(b);
4423 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4425 return vreinterpretq_m128d_f64(
4426 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4434 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4435 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4447#if defined(__aarch64__) || defined(_M_ARM64)
4450 double *da = (double *) &a;
4451 double *db = (double *) &b;
4452 double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4480#if defined(__aarch64__) || defined(_M_ARM64)
4481#if SSE2NEON_PRECISE_MINMAX
4482 float64x2_t _a = vreinterpretq_f64_m128d(a);
4483 float64x2_t _b = vreinterpretq_f64_m128d(b);
4484 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4486 return vreinterpretq_m128d_f64(
4487 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4495 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4496 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4507#if defined(__aarch64__) || defined(_M_ARM64)
4510 double *da = (double *) &a;
4511 double *db = (double *) &b;
4512 double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4566 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4581 uint32x4_t paired16 =
4582 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4595 uint64x2_t paired32 =
4596 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4609 uint8x16_t paired64 =
4610 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4617 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
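// Same sign-bit gathering idea as the 64-bit movemask helpers above, widened
// to all 16 byte lanes: successive vsra steps pack the bits so the two result
// bytes can simply be OR-ed together.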
4626 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4627 return (int) (vgetq_lane_u64(high_bits, 0) |
4628 (vgetq_lane_u64(high_bits, 1) << 1));
4663#if defined(__aarch64__) || defined(_M_ARM64)
4664 return vreinterpretq_m128d_f64(
4665 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4667 double *da = (double *) &a;
4668 double *db = (double *) &b;
4670 c[0] = da[0] * db[0];
4671 c[1] = da[1] * db[1];
4672 return vld1q_f32((float32_t *) c);
4706 int32x4_t ab3210 = vmull_s16(a3210, b3210);
4709 int32x4_t ab7654 = vmull_s16(a7654, b7654);
4711 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4723 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4724#if defined(__aarch64__) || defined(_M_ARM64)
4727 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4728 vreinterpretq_u16_u32(ab7654));
4733 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4735 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4805#if defined(_MSC_VER)
4806 __isb(_ARM64_BARRIER_SY);
4808 __asm__ __volatile__("isb\n");
4819 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4834 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4850 return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
4858 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4881 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4882 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4883 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4884 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4885 return (__m128i) vld1q_s8(data);
4894#if defined(__aarch64__) || defined(_M_ARM64)
4895 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4904#define _mm_set_pd1 _mm_set1_pd
4911#if defined(__aarch64__) || defined(_M_ARM64)
4912 return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
4958#if defined(__aarch64__) || defined(_M_ARM64)
4959 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
4976 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5015 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5016 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5017 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5018 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5019 return (__m128i) vld1q_s8(data);
5034#if defined(__aarch64__) || defined(_M_ARM64)
5035 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5053#if defined(_sse2neon_shuffle)
5054#define _mm_shuffle_epi32(a, imm) \
5056 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5058 vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5059 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5060 vreinterpretq_m128i_s32(_shuf); \
5063#define _mm_shuffle_epi32(a, imm) \
5064 _sse2neon_define1( \
5065 __m128i, a, __m128i ret; switch (imm) { \
5066 case _MM_SHUFFLE(1, 0, 3, 2): \
5067 ret = _mm_shuffle_epi_1032(_a); \
5069 case _MM_SHUFFLE(2, 3, 0, 1): \
5070 ret = _mm_shuffle_epi_2301(_a); \
5072 case _MM_SHUFFLE(0, 3, 2, 1): \
5073 ret = _mm_shuffle_epi_0321(_a); \
5075 case _MM_SHUFFLE(2, 1, 0, 3): \
5076 ret = _mm_shuffle_epi_2103(_a); \
5078 case _MM_SHUFFLE(1, 0, 1, 0): \
5079 ret = _mm_shuffle_epi_1010(_a); \
5081 case _MM_SHUFFLE(1, 0, 0, 1): \
5082 ret = _mm_shuffle_epi_1001(_a); \
5084 case _MM_SHUFFLE(0, 1, 0, 1): \
5085 ret = _mm_shuffle_epi_0101(_a); \
5087 case _MM_SHUFFLE(2, 2, 1, 1): \
5088 ret = _mm_shuffle_epi_2211(_a); \
5090 case _MM_SHUFFLE(0, 1, 2, 2): \
5091 ret = _mm_shuffle_epi_0122(_a); \
5093 case _MM_SHUFFLE(3, 3, 3, 2): \
5094 ret = _mm_shuffle_epi_3332(_a); \
5096 case _MM_SHUFFLE(0, 0, 0, 0): \
5097 ret = _mm_shuffle_epi32_splat(_a, 0); \
5099 case _MM_SHUFFLE(1, 1, 1, 1): \
5100 ret = _mm_shuffle_epi32_splat(_a, 1); \
5102 case _MM_SHUFFLE(2, 2, 2, 2): \
5103 ret = _mm_shuffle_epi32_splat(_a, 2); \
5105 case _MM_SHUFFLE(3, 3, 3, 3): \
5106 ret = _mm_shuffle_epi32_splat(_a, 3); \
5109 ret = _mm_shuffle_epi32_default(_a, (imm)); \
5111 } _sse2neon_return(ret);)
5117#ifdef _sse2neon_shuffle
5118#define _mm_shuffle_pd(a, b, imm8) \
5119 vreinterpretq_m128d_s64( \
5120 vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
5121 imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
5123#define _mm_shuffle_pd(a, b, imm8) \
5124 _mm_castsi128_pd(_mm_set_epi64x( \
5125 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5126 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5131#if defined(_sse2neon_shuffle)
5132#define _mm_shufflehi_epi16(a, imm) \
5134 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5136 vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5137 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5138 (((imm) >> 6) & 0x3) + 4); \
5139 vreinterpretq_m128i_s16(_shuf); \
5142#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5147#if defined(_sse2neon_shuffle)
5148#define _mm_shufflelo_epi16(a, imm) \
5150 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5151 int16x8_t _shuf = vshuffleq_s16( \
5152 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5153 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5154 vreinterpretq_m128i_s16(_shuf); \
5157#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5166 if (_sse2neon_unlikely(c & ~15))
5169 int16x8_t vc = vdupq_n_s16((int16_t) c);
5179 if (_sse2neon_unlikely(c & ~31))
5182 int32x4_t vc = vdupq_n_s32((int32_t) c);
5192 if (_sse2neon_unlikely(c & ~63))
5195 int64x2_t vc = vdupq_n_s64((int64_t) c);
5204 if (_sse2neon_unlikely(imm & ~15))
5215 if (_sse2neon_unlikely(imm & ~31))
5226 if (_sse2neon_unlikely(imm & ~63))
5235#define _mm_slli_si128(a, imm) \
5236 _sse2neon_define1( \
5237 __m128i, a, int8x16_t ret; \
5238 if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
5239 else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5240 else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
5241 ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
5242 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
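// Byte shifts of the whole 128-bit register are emulated with vextq_s8 against
// a zero vector; out-of-range counts collapse to all zeroes, matching x86
// PSLLDQ/PSRLDQ behavior.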
5249#if defined(__aarch64__) || defined(_M_ARM64)
5250 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5252 double a0 = sqrt(((double *) &a)[0]);
5253 double a1 = sqrt(((double *) &a)[1]);
5264#if defined(__aarch64__) || defined(_M_ARM64)
5267 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5276 int64_t c = vgetq_lane_s64(count, 0);
5277 if (_sse2neon_unlikely(c & ~15))
5280 vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
5288 int64_t c = vgetq_lane_s64(count, 0);
5289 if (_sse2neon_unlikely(c & ~31))
5292 vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
5300 const int count = (imm & ~15) ? 15 : imm;
5301 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5308#define _mm_srai_epi32(a, imm) \
5309 _sse2neon_define0( \
5310 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
5312 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5313 ret = vreinterpretq_m128i_s32( \
5314 vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
5316 ret = vreinterpretq_m128i_s32( \
5317 vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
5318 } _sse2neon_return(ret);)
5326 if (_sse2neon_unlikely(c & ~15))
5329 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5339 if (_sse2neon_unlikely(c & ~31))
5342 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5352 if (_sse2neon_unlikely(c & ~63))
5355 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5362#define _mm_srli_epi16(a, imm) \
5363 _sse2neon_define0( \
5364 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
5365 ret = _mm_setzero_si128(); \
5367 ret = vreinterpretq_m128i_u16( \
5368 vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
5369 } _sse2neon_return(ret);)
5375#define _mm_srli_epi32(a, imm) \
5376 _sse2neon_define0( \
5377 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
5378 ret = _mm_setzero_si128(); \
5380 ret = vreinterpretq_m128i_u32( \
5381 vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
5382 } _sse2neon_return(ret);)
5387#define _mm_srli_epi64(a, imm) \
5388 _sse2neon_define0( \
5389 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
5390 ret = _mm_setzero_si128(); \
5392 ret = vreinterpretq_m128i_u64( \
5393 vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
5394 } _sse2neon_return(ret);)
5399#define _mm_srli_si128(a, imm) \
5400 _sse2neon_define1( \
5401 __m128i, a, int8x16_t ret; \
5402 if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5403 else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
5404 (imm > 15 ? 0 : imm)); \
5405 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
5413#if defined(__aarch64__) || defined(_M_ARM64)
5414 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5426#if defined(__aarch64__) || defined(_M_ARM64)
5427 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5428 vst1q_f64((float64_t *) mem_addr,
5429 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5432 vst1q_f32((float32_t *) mem_addr,
5442#if defined(__aarch64__) || defined(_M_ARM64)
5443 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5461#define _mm_store1_pd _mm_store_pd1
5468#if defined(__aarch64__) || defined(_M_ARM64)
5469 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5487#if defined(__aarch64__) || defined(_M_ARM64)
5488 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5536#if __has_builtin(__builtin_nontemporal_store)
5537 __builtin_nontemporal_store(a, (__m128d *) p);
5538#elif defined(__aarch64__) || defined(_M_ARM64)
5539 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5551#if __has_builtin(__builtin_nontemporal_store)
5552 __builtin_nontemporal_store(a, p);
5564 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5573 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
5618#if defined(__aarch64__) || defined(_M_ARM64)
5619 return vreinterpretq_m128d_f64(
5620 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5622 double *da = (double *) &a;
5623 double *db = (double *) &b;
5625 c[0] = da[0] - db[0];
5626 c[1] = da[1] - db[1];
5627 return vld1q_f32((float32_t *) c);
5685#define _mm_ucomieq_sd _mm_comieq_sd
5686#define _mm_ucomige_sd _mm_comige_sd
5687#define _mm_ucomigt_sd _mm_comigt_sd
5688#define _mm_ucomile_sd _mm_comile_sd
5689#define _mm_ucomilt_sd _mm_comilt_sd
5690#define _mm_ucomineq_sd _mm_comineq_sd
5696#if defined(__GNUC__) || defined(__clang__)
5697#pragma GCC diagnostic push
5698#pragma GCC diagnostic ignored "-Wuninitialized"
5701#if defined(_MSC_VER)
5705#if defined(__GNUC__) || defined(__clang__)
5706#pragma GCC diagnostic pop
5715#if defined(__aarch64__) || defined(_M_ARM64)
5721 int16x4x2_t result = vzip_s16(a1, b1);
5731#if defined(__aarch64__) || defined(_M_ARM64)
5737 int32x2x2_t result = vzip_s32(a1, b1);
5747#if defined(__aarch64__) || defined(_M_ARM64)
5762#if defined(__aarch64__) || defined(_M_ARM64)
5770 int8x8x2_t result = vzip_s8(a1, b1);
5780#if defined(__aarch64__) || defined(_M_ARM64)
5781 return vreinterpretq_m128d_f64(
5782 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5795#if defined(__aarch64__) || defined(_M_ARM64)
5801 int16x4x2_t result = vzip_s16(a1, b1);
5811#if defined(__aarch64__) || defined(_M_ARM64)
5817 int32x2x2_t result = vzip_s32(a1, b1);
5827#if defined(__aarch64__) || defined(_M_ARM64)
5842#if defined(__aarch64__) || defined(_M_ARM64)
5848 int8x8x2_t result = vzip_s8(a1, b1);
5858#if defined(__aarch64__) || defined(_M_ARM64)
5859 return vreinterpretq_m128d_f64(
5860 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5895#if defined(__aarch64__) || defined(_M_ARM64)
5896 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
5897 vreinterpretq_f64_m128d(b),
5898 vreinterpretq_f64_m128d(mask)));
5911#if (defined(__aarch64__) || defined(_M_ARM64)) || \
5912 defined(__ARM_FEATURE_FMA)
5926#if defined(__aarch64__) || defined(_M_ARM64)
5927 return vreinterpretq_m128d_f64(
5928 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5930 double *da = (double *) &a;
5931 double *db = (double *) &b;
5932 double c[] = {da[0] + da[1], db[0] + db[1]};
5942#if defined(__aarch64__) || defined(_M_ARM64)
5951 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
5960#if defined(__aarch64__) || defined(_M_ARM64)
5961 float64x2_t a = vreinterpretq_f64_m128d(_a);
5962 float64x2_t b = vreinterpretq_f64_m128d(_b);
5963 return vreinterpretq_m128d_f64(
5964 vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
5966 double *da = (double *) &_a;
5967 double *db = (double *) &_b;
5968 double c[] = {da[0] - da[1], db[0] - db[1]};
5980#if defined(__aarch64__) || defined(_M_ARM64)
5982 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
5984 float32x4x2_t c = vuzpq_f32(a, b);
5993#define _mm_lddqu_si128 _mm_loadu_si128
5998#define _mm_loaddup_pd _mm_load1_pd
6005#if defined(__aarch64__) || defined(_M_ARM64)
6006 return vreinterpretq_m128d_f64(
6007 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6019#if defined(__aarch64__) || defined(_M_ARM64)
6022#elif defined(_sse2neon_shuffle)
6038#if defined(__aarch64__) || defined(_M_ARM64)
6041#elif defined(_sse2neon_shuffle)
6105#if defined(__GNUC__) && !defined(__clang__)
6106#define _mm_alignr_epi8(a, b, imm) \
6108 uint8x16_t _a = vreinterpretq_u8_m128i(a); \
6109 uint8x16_t _b = vreinterpretq_u8_m128i(b); \
6111 if (_sse2neon_unlikely((imm) & ~31)) \
6112 ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6113 else if (imm >= 16) \
6114 ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \
6117 vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
6122#define _mm_alignr_epi8(a, b, imm) \
6123 _sse2neon_define2( \
6124 __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
6125 uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
6126 if (_sse2neon_unlikely((imm) & ~31)) ret = \
6127 vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6128 else if (imm >= 16) ret = \
6129 _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \
6131 vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
6132 _sse2neon_return(ret);)
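/* Usage sketch (editorial): _mm_alignr_epi8 concatenates a (high) : b (low)
 * into a 32-byte value and extracts 16 bytes starting at byte offset imm,
 * e.g. with imm == 4 the result is bytes 4..19 of that concatenation:
 *
 *     __m128i lo = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
 *                               7, 6, 5, 4, 3, 2, 1, 0);
 *     __m128i hi = _mm_set1_epi8(0x20);
 *     __m128i r  = _mm_alignr_epi8(hi, lo, 4);  // bytes 4..15, then 4 x 0x20
 */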
6139#define _mm_alignr_pi8(a, b, imm) \
6140 _sse2neon_define2( \
6141 __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \
6142 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6144 uint8x8_t tmp_low; \
6145 uint8x8_t tmp_high; \
6147 const int idx = (imm) -8; \
6148 tmp_low = vreinterpret_u8_m64(_a); \
6149 tmp_high = vdup_n_u8(0); \
6150 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6152 const int idx = (imm); \
6153 tmp_low = vreinterpret_u8_m64(_b); \
6154 tmp_high = vreinterpret_u8_m64(_a); \
6155 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6157 } _sse2neon_return(ret);)
6166#if defined(__aarch64__) || defined(_M_ARM64)
6170 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6171 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
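/* Worked example (editorial): _mm_hadd_epi16 pairwise-adds within each
 * operand, so for a = {a0..a7} and b = {b0..b7} the result is
 * {a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7}, which is what the
 * vpadd_s16 forms above compute, e.g.
 *
 *     __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);  // lanes {1..8}
 *     __m128i r = _mm_hadd_epi16(a, a);   // lanes {3, 7, 11, 15, 3, 7, 11, 15}
 */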
6182#if defined(__aarch64__) || defined(_M_ARM64)
6186 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6187 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6214#if defined(__aarch64__) || defined(_M_ARM64)
6217 return vreinterpretq_s64_s16(
6218 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6225 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6226 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6239#if defined(__aarch64__) || defined(_M_ARM64)
6240 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6242 int16x4x2_t res = vuzp_s16(a, b);
6243 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6254#if defined(__aarch64__) || defined(_M_ARM64)
6256 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6258 int16x8x2_t c = vuzpq_s16(a, b);
6270#if defined(__aarch64__) || defined(_M_ARM64)
6272 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6274 int32x4x2_t c = vuzpq_s32(a, b);
6286#if defined(__aarch64__) || defined(_M_ARM64)
6289 int16x4x2_t c = vuzp_s16(a, b);
6301#if defined(__aarch64__) || defined(_M_ARM64)
6304 int32x2x2_t c = vuzp_s32(a, b);
6316#if defined(__aarch64__) || defined(_M_ARM64)
6318 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6320 int16x8x2_t c = vuzpq_s16(a, b);
6332#if defined(__aarch64__) || defined(_M_ARM64)
6335 int16x4x2_t c = vuzp_s16(a, b);
6347#if defined(__aarch64__) || defined(_M_ARM64)
6350 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6351 vmovl_s8(vget_low_s8(b)));
6352 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6353 vmovl_s8(vget_high_s8(b)));
6355 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6363 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6364 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6367 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6368 int16x8_t b_odd = vshrq_n_s16(b, 8);
6371 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6372 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6390 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6391 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6394 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6395 int16x4_t b_odd = vshr_n_s16(b, 8);
6398 int16x4_t prod1 = vmul_s16(a_even, b_even);
6399 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6422 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6423 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6435 int32x4_t mul_extend =
6449 uint8x16_t idx_masked =
6450 vandq_u8(idx, vdupq_n_u8(0x8F));
6451#if defined(__aarch64__) || defined(_M_ARM64)
6453#elif defined(__GNUC__)
6457 __asm__ __volatile__(
6458 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6459 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
6461 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6465 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6467 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6468 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
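/* Usage sketch (editorial): _mm_shuffle_epi8 treats each control byte as an
 * index into a (a set bit 7 selects zero), which the 0x8F mask above
 * preserves. A descending index vector therefore reverses the byte order:
 *
 *     __m128i rev_idx = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
 *                                    8, 9, 10, 11, 12, 13, 14, 15);
 *     __m128i reversed = _mm_shuffle_epi8(a, rev_idx);
 */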
6477 const int8x8_t controlMask =
6495 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6497#if defined(__aarch64__) || defined(_M_ARM64)
6498 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6500 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6505 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6507 int16x8_t res = vbicq_s16(masked, zeroMask);
6523 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6526#if defined(__aarch64__) || defined(_M_ARM64)
6527 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6529 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6534 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6536 int32x4_t res = vbicq_s32(masked, zeroMask);
6552 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6555#if defined(__aarch64__) || defined(_M_ARM64)
6556 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6558 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6563 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6565 int8x16_t res = vbicq_s8(masked, zeroMask);
6581 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6584#if defined(__aarch64__) || defined(_M_ARM64)
6585 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6587 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6592 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6594 int16x4_t res = vbic_s16(masked, zeroMask);
6610 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6613#if defined(__aarch64__) || defined(_M_ARM64)
6614 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6616 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6621 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6623 int32x2_t res = vbic_s32(masked, zeroMask);
6639 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
6642#if defined(__aarch64__) || defined(_M_ARM64)
6643 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
6645 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
6650 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
6652 int8x8_t res = vbic_s8(masked, zeroMask);
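/* Worked example (editorial): every _mm_sign_* variant above computes
 * out[i] = -a[i] if b[i] < 0, 0 if b[i] == 0, and a[i] otherwise, e.g.
 *
 *     __m128i a = _mm_set_epi32(10, 20, 30, 40);
 *     __m128i b = _mm_set_epi32(-1,  0,  5, -7);
 *     __m128i r = _mm_sign_epi32(a, b);   // lanes 0..3: {-40, 30, 0, -10}
 */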
6664#define _mm_blend_epi16(a, b, imm) \
6665 _sse2neon_define2( \
6667 const uint16_t _mask[8] = \
6668 _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
6669 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
6670 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
6671 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
6672 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
6673 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
6674 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
6675 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
6676 uint16x8_t _mask_vec = vld1q_u16(_mask); \
6677 uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6678 uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
6679 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
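/* Usage sketch (editorial): bit i of the immediate selects lane i from b,
 * otherwise the lane comes from a, e.g.
 *
 *     __m128i r = _mm_blend_epi16(a, b, 0x0F); // lanes 0-3 from b, 4-7 from a
 */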
6684#define _mm_blend_pd(a, b, imm) \
6685 _sse2neon_define2( \
6687 const uint64_t _mask[2] = \
6688 _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
6689 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
6690 uint64x2_t _mask_vec = vld1q_u64(_mask); \
6691 uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
6692 uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
6693 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6701 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
6702 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
6703 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
6704 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
6705 uint32x4_t mask = vld1q_u32(data);
6731#if defined(__aarch64__) || defined(_M_ARM64)
6732 float64x2_t a = vreinterpretq_f64_m128d(_a);
6733 float64x2_t b = vreinterpretq_f64_m128d(_b);
6734 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
6761#if defined(__aarch64__) || defined(_M_ARM64)
6762 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
6764 double *f = (double *) &a;
6775#if (defined(__aarch64__) || defined(_M_ARM64)) || \
6776 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
6779 float *f = (float *) &a;
6780 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
6808#if defined(__aarch64__) || defined(_M_ARM64)
6816 uint32x4_t swapped = vrev64q_u32(cmp);
6836 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
6837 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
6856 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
6866 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
6867 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
6877 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
6878 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
6879 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
6898 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
6899 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
6918 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
6928 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
6929 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
6939 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
6940 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
6941 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
6952 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
6953 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
6954#if !SSE2NEON_PRECISE_DP
6955 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
6956 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
6959#if !SSE2NEON_PRECISE_DP
6965#if defined(__aarch64__) || defined(_M_ARM64)
6966 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
6967 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
6969 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
6970 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
6973 double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
6974 double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
6979#if defined(__aarch64__) || defined(_M_ARM64)
6980 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
6982 double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
6997 float32x4_t elementwise_prod = _mm_mul_ps(a, b);
6999#if defined(__aarch64__) || defined(_M_ARM64)
7005 if ((imm & 0x0F) == 0x0F) {
7006 if (!(imm & (1 << 4)))
7007 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
7008 if (!(imm & (1 << 5)))
7009 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
7010 if (!(imm & (1 << 6)))
7011 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
7012 if (!(imm & (1 << 7)))
7013 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
7022 s += vgetq_lane_f32(elementwise_prod, 0);
7024 s += vgetq_lane_f32(elementwise_prod, 1);
7026 s += vgetq_lane_f32(elementwise_prod, 2);
7028 s += vgetq_lane_f32(elementwise_prod, 3);
7030 const float32_t res[4] = {
7031 (imm & 0x1) ? s : 0.0f,
7032 (imm & 0x2) ? s : 0.0f,
7033 (imm & 0x4) ? s : 0.0f,
7034 (imm & 0x8) ? s : 0.0f,
7043#define _mm_extract_epi32(a, imm) \
7044 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7050#define _mm_extract_epi64(a, imm) \
7051 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7057#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7061#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
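/* Usage sketch (editorial) for _mm_dp_ps above: the high nibble of the
 * immediate selects which products enter the sum, the low nibble selects
 * which output lanes receive it (the rest become zero). A full four-element
 * dot product broadcast to every lane, then read from lane 0:
 *
 *     static inline float dot4(__m128 a, __m128 b)
 *     {
 *         return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
 *     }
 */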
7069#if defined(__aarch64__) || defined(_M_ARM64)
7070 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7072 double *f = (double *) &a;
7083#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7084 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7087 float *f = (float *) &a;
7088 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7117#define _mm_insert_epi32(a, b, imm) \
7118 vreinterpretq_m128i_s32( \
7119 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7126#define _mm_insert_epi64(a, b, imm) \
7127 vreinterpretq_m128i_s64( \
7128 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7135#define _mm_insert_epi8(a, b, imm) \
7136 vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7142#define _mm_insert_ps(a, b, imm8) \
7143 _sse2neon_define2( \
7145 float32x4_t tmp1 = \
7146 vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \
7147 vreinterpretq_f32_m128(_a), 0); \
7148 float32x4_t tmp2 = \
7149 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
7150 vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
7151 const uint32_t data[4] = \
7152 _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7153 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7154 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7155 ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
7156 uint32x4_t mask = vld1q_u32(data); \
7157 float32x4_t all_zeros = vdupq_n_f32(0); \
7159 _sse2neon_return(vreinterpretq_m128_f32( \
7160 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
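/* Usage sketch (editorial): in the _mm_insert_ps immediate, bits 7:6 pick the
 * source lane of b, bits 5:4 the destination lane in a, and bits 3:0 zero out
 * result lanes; e.g. 0x10 copies b[0] into lane 1 of a, while 0x4E copies
 * b[1] into lane 0 and zeroes lanes 1, 2, 3:
 *
 *     __m128 r = _mm_insert_ps(a, b, 0x10);
 */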
7240 uint16_t min, idx = 0;
7241#if defined(__aarch64__) || defined(_M_ARM64)
7246 static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
7247 uint16x8_t minv = vdupq_n_u16(min);
7249 idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
7263 for (i = 0; i < 8; i++) {
7291 switch (imm & 0x4) {
7301#if defined(__GNUC__) || defined(__clang__)
7302 __builtin_unreachable();
7303#elif defined(_MSC_VER)
7309 switch (imm & 0x3) {
7311 _b = vreinterpretq_u8_u32(
7315 _b = vreinterpretq_u8_u32(
7319 _b = vreinterpretq_u8_u32(
7323 _b = vreinterpretq_u8_u32(
7327#if defined(__GNUC__) || defined(__clang__)
7328 __builtin_unreachable();
7329#elif defined(_MSC_VER)
7335 int16x8_t c04, c15, c26, c37;
7336 uint8x8_t low_b = vget_low_u8(_b);
7337 c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
7338 uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
7339 c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
7340 uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
7341 c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
7342 uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
7343 c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
7344#if defined(__aarch64__) || defined(_M_ARM64)
7346 c04 = vpaddq_s16(c04, c26);
7348 c15 = vpaddq_s16(c15, c37);
7351 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7353 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7355 vreinterpretq_s16_s32(trn2_c)));
7357 int16x4_t c01, c23, c45, c67;
7358 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
7359 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
7360 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
7361 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
7364 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
7404#if defined(__aarch64__) || defined(_M_ARM64)
7407 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7413 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7415 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7418 double *v_double = (double *) &a;
7424 for (int i = 0; i < 2; i++) {
7425 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7426 double roundDown = floor(tmp);
7427 double roundUp = ceil(tmp);
7428 double diffDown = tmp - roundDown;
7429 double diffUp = roundUp - tmp;
7430 if (diffDown < diffUp) {
7433 } else if (diffDown > diffUp) {
7439 double half = roundDown / 2;
7440 if (half != floor(half)) {
7450 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7462 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7463 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7473#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7474 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7488 float *v_float = (float *) &a;
7493 uint32x4_t signmask = vdupq_n_u32(0x80000000);
7496 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7498 int32x4_t r_trunc = vcvtq_s32_f32(
7500 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7501 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
7502 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7504 float32x4_t delta = vsubq_f32(
7506 vcvtq_f32_s32(r_trunc));
7507 uint32x4_t is_delta_half =
7508 vceqq_f32(delta, half);
7510 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7520 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7521 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7522 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7523 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7563#if __has_builtin(__builtin_nontemporal_store)
7564 return __builtin_nontemporal_load(p);
7575 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7584 int64x2_t a_and_mask =
7586 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7602 uint64x2_t ones = vandq_u64(m, v);
7603 uint64x2_t zeros = vbicq_u64(m, v);
7607 uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
7610 return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
7622 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7631#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7642 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7647static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
7648 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7650static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
7651 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7652 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7656#define _SIDD_UBYTE_OPS 0x00
7657#define _SIDD_UWORD_OPS 0x01
7658#define _SIDD_SBYTE_OPS 0x02
7659#define _SIDD_SWORD_OPS 0x03
7662#define _SIDD_CMP_EQUAL_ANY 0x00
7663#define _SIDD_CMP_RANGES 0x04
7664#define _SIDD_CMP_EQUAL_EACH 0x08
7665#define _SIDD_CMP_EQUAL_ORDERED 0x0C
7668#define _SIDD_POSITIVE_POLARITY 0x00
7669#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
7670#define _SIDD_NEGATIVE_POLARITY 0x10
7671#define _SIDD_MASKED_NEGATIVE_POLARITY \
7675#define _SIDD_LEAST_SIGNIFICANT 0x00
7676#define _SIDD_MOST_SIGNIFICANT 0x40
7679#define _SIDD_BIT_MASK 0x00
7680#define _SIDD_UNIT_MASK 0x40
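/* Usage sketch (editorial): the flags above are OR'd together to form the
 * imm8 of the SSE4.2 string intrinsics. For example, a strpbrk-style scan for
 * the first byte of `hay` that appears anywhere in `set`:
 *
 *     int idx = _mm_cmpistri(set, hay,
 *                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
 *                            _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT);
 *     // idx == 16 means no match within this 16-byte block
 */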
7687#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
7688#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
7690#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
7692#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
7694#define SSE2NEON_IIF_1(t, ...) t
7696#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
7697#define SSE2NEON_COMPL_0 1
7698#define SSE2NEON_COMPL_1 0
7700#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
7701#define SSE2NEON_DEC_1 0
7702#define SSE2NEON_DEC_2 1
7703#define SSE2NEON_DEC_3 2
7704#define SSE2NEON_DEC_4 3
7705#define SSE2NEON_DEC_5 4
7706#define SSE2NEON_DEC_6 5
7707#define SSE2NEON_DEC_7 6
7708#define SSE2NEON_DEC_8 7
7709#define SSE2NEON_DEC_9 8
7710#define SSE2NEON_DEC_10 9
7711#define SSE2NEON_DEC_11 10
7712#define SSE2NEON_DEC_12 11
7713#define SSE2NEON_DEC_13 12
7714#define SSE2NEON_DEC_14 13
7715#define SSE2NEON_DEC_15 14
7716#define SSE2NEON_DEC_16 15
7719#define SSE2NEON_CHECK_N(x, n, ...) n
7720#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
7721#define SSE2NEON_PROBE(x) x, 1,
7723#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
7724#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
7726#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
7727#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
7729#define SSE2NEON_EAT(...)
7730#define SSE2NEON_EXPAND(...) __VA_ARGS__
7731#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
7735#define SSE2NEON_EMPTY()
7736#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
7737#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
7738#define SSE2NEON_EXPAND(...) __VA_ARGS__
7740#define SSE2NEON_EVAL(...) \
7741 SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
7742#define SSE2NEON_EVAL1(...) \
7743 SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
7744#define SSE2NEON_EVAL2(...) \
7745 SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
7746#define SSE2NEON_EVAL3(...) __VA_ARGS__
7748#define SSE2NEON_REPEAT(count, macro, ...) \
7749 SSE2NEON_WHEN(count) \
7750 (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
7751 SSE2NEON_DEC(count), macro, \
7752 __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
7754#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
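/* Sketch of how the machinery above expands (editorial): SSE2NEON_REPEAT
 * applies a macro once per index, counting down and re-expanding through
 * SSE2NEON_EVAL, so that, roughly,
 *
 *     SSE2NEON_EVAL(SSE2NEON_REPEAT(3, M, int16_t))
 *
 * becomes M(0, int16_t) M(1, int16_t) M(2, int16_t). This is what stamps out
 * the per-lane bodies in PCMPSTR_EQ and PCMPSTR_RANGES below. */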
7756#define SSE2NEON_SIZE_OF_byte 8
7757#define SSE2NEON_NUMBER_OF_LANES_byte 16
7758#define SSE2NEON_SIZE_OF_word 16
7759#define SSE2NEON_NUMBER_OF_LANES_word 8
7761#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
7762 mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
7763 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
7764 vreinterpretq_##type##_m128i(a)));
7766#define SSE2NEON_FILL_LANE(i, type) \
7768 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
7770#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
7771 number_of_lanes, byte_or_word) \
7775 SSE2NEON_CAT(size, \
7776 SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
7777 vec_b[number_of_lanes]; \
7778 __m128i mask = SSE2NEON_IIF(byte_or_word)( \
7779 vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
7780 vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
7781 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
7782 SSE2NEON_CAT(type_prefix, size))) \
7783 for (int i = 0; i < number_of_lanes; i++) { \
7784 mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
7785 size)(SSE2NEON_CAT(vbslq_u, size)( \
7786 SSE2NEON_CAT(vreinterpretq_u, \
7787 SSE2NEON_CAT(size, _m128i))(mask), \
7788 SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
7792 SSE2NEON_CAT(type_prefix, \
7793 SSE2NEON_CAT(size, _m128i(a))))), \
7794 SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
7798 SSE2NEON_CAT(type_prefix, \
7799 SSE2NEON_CAT(size, _m128i(a))))))); \
7803#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
7805 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
7806 SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
7807 SSE2NEON_CAT(u, size))) \
7810#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
7811 static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
7815 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7816 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7817 return SSE2NEON_CAT( \
7818 _sse2neon_aggregate_equal_any_, \
7820 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7821 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7822 type))))(la, lb, mtx); \
7825#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
7826 static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
7831 a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7832 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
7833 return SSE2NEON_CAT( \
7834 _sse2neon_aggregate_ranges_, \
7836 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7837 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7838 type))))(la, lb, mtx); \
7841#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
7842 static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
7843 __m128i b, int lb) \
7846 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7847 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7848 return SSE2NEON_CAT( \
7849 _sse2neon_aggregate_equal_ordered_, \
7851 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7853 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
7854 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
7860 int m = (1 << la) - 1;
7861 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7862 uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
7863 uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
7864 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
7865 for (int j = 0; j < lb; j++) {
7879 int m = (1 << la) - 1;
7881 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
7882 for (int j = 0; j < lb; j++) {
7894#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
7895 prefix##IMPL(byte) \
7904 int m = (1 << la) - 1;
7906 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
7907 for (int j = 0; j < lb; j++) {
7916#if defined(__aarch64__) || defined(_M_ARM64)
7917 int t = vaddvq_u32(vec_res) ? 1 : 0;
7919 uint64x2_t sumh = vpaddlq_u32(vec_res);
7920 int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
7930 int m = (1 << la) - 1;
7931 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7932 uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
7933 uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
7934 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
7935 for (int j = 0; j < lb; j++) {
7950#define SSE2NEON_CMP_RANGES_IS_BYTE 1
7951#define SSE2NEON_CMP_RANGES_IS_WORD 0
7954#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
7955 prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
7956 prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
7957 prefix##IMPL(word, uint, u, prefix##IS_WORD) \
7958 prefix##IMPL(word, int, s, prefix##IS_WORD)
7963#undef SSE2NEON_CMP_RANGES_IS_BYTE
7964#undef SSE2NEON_CMP_RANGES_IS_WORD
7970 int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
7971 int m1 = 0x10000 - (1 << la);
7972 int tb = 0x10000 - (1 << lb);
7973 uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
7974 uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
7975 vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7976 vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
7977 vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
7978 vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
7979 vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
7980 tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
7981 tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
7983 res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
7984 res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
7985 res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
7986 res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
7987 res_lo = vand_u8(res_lo, vec_mask);
7988 res_hi = vand_u8(res_hi, vec_mask);
7998 int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
7999 int m1 = 0x100 - (1 << la);
8000 int tb = 0x100 - (1 << lb);
8001 uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
8002 uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
8003 uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
8004 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
8005 mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
8006 mtx = vbslq_u16(vec1, tmp, mtx);
8007 mtx = vandq_u16(mtx, vec_mask);
8011#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
8012#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
8014#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
8015 static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
8016 int bound, int la, int lb, __m128i mtx[16]) \
8019 int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \
8020 uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
8021 vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
8022 vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
8023 uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
8024 vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \
8025 vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
8026 vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \
8027 uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
8028 uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
8029 for (int j = 0; j < lb; j++) { \
8030 mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
8031 vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
8033 for (int j = lb; j < bound; j++) { \
8034 mtx[j] = vreinterpretq_m128i_u##size( \
8035 vbslq_u##size(vec1, vec_minusone, vec_zero)); \
8037 unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
8038 (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
8039 for (int i = 0; i < bound; i++) { \
8041 for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
8042 val &= ptr[k * bound + j]; \
8049#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
8050 prefix##IMPL(8, 16, prefix##IS_UBYTE) \
8051 prefix##IMPL(16, 8, prefix##IS_UWORD)
8056#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
8057#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
8060#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
8061 prefix##IMPL(byte) \
8067#define SSE2NEON_CMPESTR_LIST \
8068 _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8069 _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
8070 _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8071 _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
8072 _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
8073 _(CMP_UWORD_RANGES, cmp_uword_ranges) \
8074 _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
8075 _(CMP_SWORD_RANGES, cmp_sword_ranges) \
8076 _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8077 _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
8078 _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8079 _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
8080 _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8081 _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
8082 _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8083 _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
8086#define _(name, func_suffix) name,
8092#define _(name, func_suffix) _sse2neon_##func_suffix,
8099 switch (imm8 & 0x30) {
8104 res ^= (1 << lb) - 1;
8110 return res & ((bound == 8) ? 0xFF : 0xFFFF);
8116 unsigned long cnt = 0;
8117 if (_BitScanReverse(&cnt, x))
8121 return x != 0 ? __builtin_clz(x) : 32;
8128 unsigned long cnt = 0;
8129 if (_BitScanForward(&cnt, x))
8133 return x != 0 ? __builtin_ctz(x) : 32;
8141#if defined(SSE2NEON_HAS_BITSCAN64)
8142 if (_BitScanForward64(&cnt, x))
8145 if (_BitScanForward(&cnt, (unsigned long) (x)))
8147 if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
8148 return (int) (cnt + 32);
8152 return x != 0 ? __builtin_ctzll(x) : 64;
8156#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
8158#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
8159 const int var = (imm & 0x01) ? 8 : 16
8161#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
8162 int tmp1 = la ^ (la >> 31); \
8163 la = tmp1 - (la >> 31); \
8164 int tmp2 = lb ^ (lb >> 31); \
8165 lb = tmp2 - (lb >> 31); \
8166 la = SSE2NEON_MIN(la, bound); \
8167 lb = SSE2NEON_MIN(lb, bound)
8174#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
8175 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
8176 SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
8177 int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
8178 r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
8180#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
8181 return (r2 == 0) ? bound \
8182 : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
8183 : _sse2neon_ctz(r2))
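/* Worked example (editorial) for the macro above: with bound == 16 and a
 * result mask r2 == 0b0110, the least-significant policy returns
 * _sse2neon_ctz(r2) == 1, the most-significant policy (imm8 & 0x40) returns
 * 31 - _sse2neon_clz(r2) == 2, and r2 == 0 returns bound (i.e. "no match"). */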
8185#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
8186 __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
8187 if (imm8 & 0x40) { \
8189 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
8190 vld1q_u16(_sse2neon_cmpestr_mask16b)); \
8191 dst = vreinterpretq_m128i_u16(vbslq_u16( \
8192 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
8194 uint8x16_t vec_r2 = \
8195 vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \
8197 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
8198 dst = vreinterpretq_m128i_u8( \
8199 vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
8202 if (bound == 16) { \
8203 dst = vreinterpretq_m128i_u16( \
8204 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
8206 dst = vreinterpretq_m128i_u8( \
8207 vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \
8224 return !r2 & (lb_cpy > bound);
8289 return la <= (bound - 1);
8305 return lb <= (bound - 1);
8308#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
8310 if (imm8 & 0x01) { \
8311 uint16x8_t equal_mask_##str = \
8312 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
8313 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8314 uint64_t matches_##str = \
8315 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8316 len = _sse2neon_ctzll(matches_##str) >> 3; \
8318 uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
8319 vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
8320 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8321 uint64_t matches_##str = \
8322 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8323 len = _sse2neon_ctzll(matches_##str) >> 2; \
8327#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
8330 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
8331 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
8341 return !r2 & (lb >= bound);
8389 return la <= (bound - 1);
8401 return lb <= (bound - 1);
8408#if defined(__aarch64__) || defined(_M_ARM64)
8423#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8424 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8427#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8428 (defined(_M_ARM64) && !defined(__clang__))
8429 crc = __crc32ch(crc, v);
8442#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8443 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8446#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8447 (defined(_M_ARM64) && !defined(__clang__))
8448 crc = __crc32cw(crc, v);
8461#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8462 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8465#elif (defined(_M_ARM64) && !defined(__clang__))
8466 crc = __crc32cd((uint32_t) crc, v);
8469 crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
8479#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8480 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8483#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8484 (defined(_M_ARM64) && !defined(__clang__))
8485 crc = __crc32cb(crc, v);
8488 for (int bit = 0; bit < 8; bit++) {
8490 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
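/* Usage sketch (editorial): CRC-32C over a buffer, one byte at a time, with
 * the usual pre/post inversion (the intrinsic itself performs neither):
 *
 *     static inline uint32_t crc32c_buf(const uint8_t *p, size_t n)
 *     {
 *         uint32_t crc = 0xFFFFFFFF;
 *         for (size_t i = 0; i < n; i++)
 *             crc = _mm_crc32_u8(crc, p[i]);
 *         return crc ^ 0xFFFFFFFF;
 *     }
 */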
8500#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
8502#define SSE2NEON_AES_SBOX(w) \
8504 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8505 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8506 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8507 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8508 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8509 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8510 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8511 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8512 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8513 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8514 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8515 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8516 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8517 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8518 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8519 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8520 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8521 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8522 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8523 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8524 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8525 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8526 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8527 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8528 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8529 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8530 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8531 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8532 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8533 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8534 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8535 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8536 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8537 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8538 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8539 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8540 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8542#define SSE2NEON_AES_RSBOX(w) \
8544 w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
8545 w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
8546 w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
8547 w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
8548 w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
8549 w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
8550 w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
8551 w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
8552 w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
8553 w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
8554 w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
8555 w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
8556 w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
8557 w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
8558 w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
8559 w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
8560 w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
8561 w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
8562 w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
8563 w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
8564 w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
8565 w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
8566 w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
8567 w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
8568 w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
8569 w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
8570 w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
8571 w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
8572 w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
8573 w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
8574 w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
8575 w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
8576 w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
8577 w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
8578 w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
8579 w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
8580 w(0x55), w(0x21), w(0x0c), w(0x7d) \
8585#define SSE2NEON_AES_H0(x) (x)
8588#undef SSE2NEON_AES_H0
8591#if !defined(__aarch64__) && !defined(_M_ARM64)
8592#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
8593#define SSE2NEON_MULTIPLY(x, y) \
8594 (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
8595 ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
8596 ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
8597 ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
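/* Worked example (editorial): SSE2NEON_XT doubles a field element in GF(2^8)
 * modulo the AES polynomial 0x11b, and SSE2NEON_MULTIPLY combines the
 * doublings selected by the bits of y. Truncated to a byte this matches the
 * FIPS-197 example:
 *
 *     (uint8_t) SSE2NEON_MULTIPLY(0x57, 0x13) == 0xfe
 */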
8607#if defined(__aarch64__) || defined(_M_ARM64)
8608 static const uint8_t shift_rows[] = {
8609 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8610 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8612 static const uint8_t ror32by8[] = {
8613 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8614 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8621 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8636 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8637 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8638 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8644#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8645 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8646 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8648#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b ))
8650#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8651#define SSE2NEON_AES_U0(p) \
8652 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8653#define SSE2NEON_AES_U1(p) \
8654 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8655#define SSE2NEON_AES_U2(p) \
8656 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8657#define SSE2NEON_AES_U3(p) \
8658 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8662 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8668#undef SSE2NEON_AES_B2W
8669#undef SSE2NEON_AES_F2
8670#undef SSE2NEON_AES_F3
8671#undef SSE2NEON_AES_U0
8672#undef SSE2NEON_AES_U1
8673#undef SSE2NEON_AES_U2
8674#undef SSE2NEON_AES_U3
8686 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8687 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8688 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8689 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8690 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8691 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8692 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8693 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8704#if defined(__aarch64__)
8705 static const uint8_t inv_shift_rows[] = {
8706 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8707 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8709 static const uint8_t ror32by8[] = {
8710 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8711 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8718 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8728 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8729 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8731 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8733 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8735 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8736 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8743 uint8_t i, e, f, g, h, v[4][4];
8744 uint8_t *_a = (uint8_t *) &a;
8745 for (i = 0; i < 16; ++i) {
8750 for (i = 0; i < 4; ++i) {
8775#if defined(__aarch64__)
8776 static const uint8_t shift_rows[] = {
8777 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8778 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8785 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8825#if defined(__aarch64__)
8826 static const uint8_t inv_shift_rows[] = {
8827 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8828 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8835 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8849 uint8_t *_a = (uint8_t *) &a;
8850 for (int i = 0; i < 16; ++i) {
8862#if defined(__aarch64__)
8863 static const uint8_t ror32by8[] = {
8864 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8865 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8871 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8872 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8874 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8877 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8878 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8879 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8883 uint8_t i, e, f, g, h, v[4][4];
8885 for (i = 0; i < 4; ++i) {
8916#if defined(__aarch64__)
8923 uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
8924 uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
8925 uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
8932 for (int i = 0; i < 4; ++i) {
8937 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8940#undef SSE2NEON_AES_SBOX
8941#undef SSE2NEON_AES_RSBOX
8943#if defined(__aarch64__)
8945#undef SSE2NEON_MULTIPLY
9011 u8[0x4], u8[0x1], u8[0xE], u8[0xB],
9012 u8[0x1], u8[0xE], u8[0xB], u8[0x4],
9013 u8[0xC], u8[0x9], u8[0x6], u8[0x3],
9014 u8[0x9], u8[0x6], u8[0x3], u8[0xC],
9016 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
9025 ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
9026 ((uint64_t) u8.n128_u8[0xE] << 16) |
9027 ((uint64_t) u8.n128_u8[0xB] << 24) |
9028 ((uint64_t) u8.n128_u8[0x1] << 32) |
9029 ((uint64_t) u8.n128_u8[0xE] << 40) |
9030 ((uint64_t) u8.n128_u8[0xB] << 48) |
9031 ((uint64_t) u8.n128_u8[0x4] << 56),
9032 ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
9033 ((uint64_t) u8.n128_u8[0x6] << 16) |
9034 ((uint64_t) u8.n128_u8[0x3] << 24) |
9035 ((uint64_t) u8.n128_u8[0x9] << 32) |
9036 ((uint64_t) u8.n128_u8[0x6] << 40) |
9037 ((uint64_t) u8.n128_u8[0x3] << 48) |
9038 ((uint64_t) u8.n128_u8[0xC] << 56)};
9040 dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
9041 dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
9057 switch (imm & 0x11) {
9079#if defined(__aarch64__) || defined(_M_ARM64)
9086#if defined(__aarch64__) || defined(_M_ARM64)
9089 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
9100#if defined(__aarch64__) || defined(_M_ARM64)
9101#if __has_builtin(__builtin_popcount)
9102 return __builtin_popcount(a);
9103#elif defined(_MSC_VER)
9104 return _CountOneBits(a);
9106 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
9110 uint8x8_t input_val, count8x8_val;
9111 uint16x4_t count16x4_val;
9112 uint32x2_t count32x2_val;
9114 input_val = vld1_u8((uint8_t *) &a);
9115 count8x8_val = vcnt_u8(input_val);
9116 count16x4_val = vpaddl_u8(count8x8_val);
9117 count32x2_val = vpaddl_u16(count16x4_val);
9119 vst1_u32(&count, count32x2_val);
9129#if defined(__aarch64__) || defined(_M_ARM64)
9130#if __has_builtin(__builtin_popcountll)
9131 return __builtin_popcountll(a);
9132#elif defined(_MSC_VER)
9133 return _CountOneBits64(a);
9135 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
9139 uint8x8_t input_val, count8x8_val;
9140 uint16x4_t count16x4_val;
9141 uint32x2_t count32x2_val;
9142 uint64x1_t count64x1_val;
9144 input_val = vld1_u8((uint8_t *) &a);
9145 count8x8_val = vcnt_u8(input_val);
9146 count16x4_val = vpaddl_u8(count8x8_val);
9147 count32x2_val = vpaddl_u16(count16x4_val);
9148 count64x1_val = vpaddl_u32(count32x2_val);
9149 vst1_u64(&count, count64x1_val);
9160#if defined(__aarch64__) || defined(_M_ARM64)
9167#if defined(__aarch64__) || defined(_M_ARM64)
9170 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
9175#if defined(__aarch64__) || defined(_M_ARM64)
9178 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
9186#if defined(__aarch64__) || defined(_M_ARM64)
9195#if defined(_MSC_VER)
9196 val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
9198 __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
9203 uint32_t pmccntr, pmuseren, pmcntenset;
9206 __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
9208 __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
9209 if (pmcntenset & 0x80000000UL) {
9210 __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
9212 return (uint64_t) (pmccntr) << 6;
9219 return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
9223#if defined(__GNUC__) || defined(__clang__)
9224#pragma pop_macro("ALIGN_STRUCT")
9225#pragma pop_macro("FORCE_INLINE")
9228#if defined(__GNUC__) && !defined(__clang__)
9229#pragma GCC pop_options
Definition sse2neon.h:3772
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
Definition sse2neon.h:1218
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
Definition sse2neon.h:3824
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
Definition sse2neon.h:1226
#define SSE2NEON_AES_U0(p)
#define _sse2neon_const
Definition sse2neon.h:111
#define vreinterpretq_s32_m128i(x)
Definition sse2neon.h:432
#define vreinterpret_s64_m64(x)
Definition sse2neon.h:462
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
Definition sse2neon.h:7638
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
Definition sse2neon.h:6065
#define vreinterpretq_m128i_s16(x)
Definition sse2neon.h:418
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
Definition sse2neon.h:4688
#define vreinterpretq_f32_m128d(x)
Definition sse2neon.h:496
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition sse2neon.h:3034
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
Definition sse2neon.h:3007
FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
Definition sse2neon.h:1608
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
Definition sse2neon.h:6995
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
Definition sse2neon.h:2673
#define vreinterpretq_m128d_s64(x)
Definition sse2neon.h:484
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition sse2neon.h:4927
FORCE_INLINE void _mm_setcsr(unsigned int a)
Definition sse2neon.h:2476
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
Definition sse2neon.h:990
float32x4_t __m128d
Definition sse2neon.h:373
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition sse2neon.h:2870
#define SSE2NEON_AES_RSBOX(w)
Definition sse2neon.h:8542
FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
Definition sse2neon.h:1056
FORCE_INLINE int _mm_cmpestrc(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8230
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
Definition sse2neon.h:2292
#define vreinterpretq_s8_m128i(x)
Definition sse2neon.h:430
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
Definition sse2neon.h:4408
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
Definition sse2neon.h:4351
FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
Definition sse2neon.h:5571
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
Definition sse2neon.h:718
FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
Definition sse2neon.h:3496
FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
Definition sse2neon.h:750
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
Definition sse2neon.h:3871
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4361
#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix)
Definition sse2neon.h:8060
FORCE_INLINE __m128d _mm_floor_pd(__m128d)
Definition sse2neon.h:7067
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
Definition sse2neon.h:1353
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
Definition sse2neon.h:6789
FORCE_INLINE uint64_t _rdtsc(void)
Definition sse2neon.h:9184
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
Definition sse2neon.h:2912
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3404
FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
Definition sse2neon.h:4147
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition sse2neon.h:2745
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6282
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
Definition sse2neon.h:3603
#define vreinterpretq_s64_m128d(x)
Definition sse2neon.h:491
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
Definition sse2neon.h:4982
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
Definition sse2neon.h:6915
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
Definition sse2neon.h:6036
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
Definition sse2neon.h:1025
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
Definition sse2neon.h:4752
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
Definition sse2neon.h:6925
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
Definition sse2neon.h:6632
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
Definition sse2neon.h:4268
#define _MM_DENORMALS_ZERO_ON
Definition sse2neon.h:353
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
Definition sse2neon.h:2936
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
Definition sse2neon.h:1679
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
Definition sse2neon.h:766
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition sse2neon.h:2152
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
Definition sse2neon.h:4948
#define vreinterpretq_m128i_u8(x)
Definition sse2neon.h:422
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
Definition sse2neon.h:1468
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition sse2neon.h:4139
FORCE_INLINE __m128d _mm_round_pd(__m128d, int)
Definition sse2neon.h:7402
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
Definition sse2neon.h:1549
#define _MM_ROUND_NEAREST
Definition sse2neon.h:343
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
Definition sse2neon.h:1890
FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
Definition sse2neon.h:743
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
Definition sse2neon.h:7582
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
Definition sse2neon.h:9053
#define vreinterpret_u32_m64(x)
Definition sse2neon.h:456
FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
Definition sse2neon.h:4633
#define vreinterpretq_nth_u64_m128i(x, n)
Definition sse2neon.h:535
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
Definition sse2neon.h:1417
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
Definition sse2neon.h:3445
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
Definition sse2neon.h:7210
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
Definition sse2neon.h:7228
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
Definition sse2neon.h:1559
#define vreinterpretq_m128d_f32(x)
Definition sse2neon.h:489
FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3570
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
Definition sse2neon.h:3984
#define vreinterpretq_f64_m128i(x)
Definition sse2neon.h:428
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
Definition sse2neon.h:1262
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
Definition sse2neon.h:5778
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
Definition sse2neon.h:4118
union ALIGN_STRUCT(16) SIMDVec
Definition sse2neon.h:522
#define vreinterpret_m64_s16(x)
Definition sse2neon.h:441
FORCE_INLINE int _mm_movemask_pd(__m128d a)
Definition sse2neon.h:4623
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition sse2neon.h:2404
FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
Definition sse2neon.h:1785
#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)
Definition sse2neon.h:8180
FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
Definition sse2neon.h:3562
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
Definition sse2neon.h:7192
#define _SIDD_NEGATIVE_POLARITY
Definition sse2neon.h:7670
#define _MM_ROUND_DOWN
Definition sse2neon.h:344
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
Definition sse2neon.h:7201
FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
Definition sse2neon.h:814
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
Definition sse2neon.h:4934
FORCE_INLINE __m128 _mm_undefined_ps(void)
Definition sse2neon.h:2851
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
Definition sse2neon.h:3690
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
Definition sse2neon.h:5924
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition sse2neon.h:2467
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
Definition sse2neon.h:5224
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
Definition sse2neon.h:2799
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
Definition sse2neon.h:4790
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
Definition sse2neon.h:6844
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
Definition sse2neon.h:5298
FORCE_INLINE __m128d _mm_set_sd(double a)
Definition sse2neon.h:4909
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
Definition sse2neon.h:3843
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
Definition sse2neon.h:6799
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition sse2neon.h:6445
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
Definition sse2neon.h:6097
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
Definition sse2neon.h:5892
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1345
FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3757
FORCE_INLINE __m128i _mm_undefined_si128(void)
Definition sse2neon.h:2833
FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
Definition sse2neon.h:3742
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
Definition sse2neon.h:4417
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
Definition sse2neon.h:773
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
Definition sse2neon.h:2048
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
Definition sse2neon.h:3123
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
Definition sse2neon.h:4661
#define _MM_FROUND_TO_NEG_INF
Definition sse2neon.h:331
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
Definition sse2neon.h:6949
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
Definition sse2neon.h:7596
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5485
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
Definition sse2neon.h:2725
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
Definition sse2neon.h:9098
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition sse2neon.h:1244
static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
Definition sse2neon.h:7876
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
Definition sse2neon.h:1627
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
Definition sse2neon.h:2012
FORCE_INLINE void _mm_empty(void)
Definition sse2neon.h:1143
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
Definition sse2neon.h:5588
#define vreinterpretq_u64_m128(x)
Definition sse2neon.h:410
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
Definition sse2neon.h:5524
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
Definition sse2neon.h:5760
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
Definition sse2neon.h:5549
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t)
Definition sse2neon.h:8477
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
Definition sse2neon.h:1308
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
Definition sse2neon.h:6863
FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3504
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5508
FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
Definition sse2neon.h:8137
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1381
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
Definition sse2neon.h:1664
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
Definition sse2neon.h:1430
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4770
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
Definition sse2neon.h:2980
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition sse2neon.h:1910
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
Definition sse2neon.h:6178
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1478
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
Definition sse2neon.h:4026
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition sse2neon.h:5880
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6345
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
Definition sse2neon.h:780
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
Definition sse2neon.h:3186
#define vreinterpretq_s32_m128(x)
Definition sse2neon.h:414
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
Definition sse2neon.h:2703
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
Definition sse2neon.h:6433
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
Definition sse2neon.h:1363
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6328
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
Definition sse2neon.h:3099
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
Definition sse2neon.h:3537
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
Definition sse2neon.h:5176
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
Definition sse2neon.h:6516
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
Definition sse2neon.h:600
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
Definition sse2neon.h:6824
#define vreinterpretq_u32_m128i(x)
Definition sse2neon.h:437
FORCE_INLINE void * _mm_malloc(size_t size, size_t align)
Definition sse2neon.h:1938
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6235
static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
Definition sse2neon.h:7901
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
Definition sse2neon.h:4997
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition sse2neon.h:4260
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition sse2neon.h:1180
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
Definition sse2neon.h:2098
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
Definition sse2neon.h:3657
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5579
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
Definition sse2neon.h:6874
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
Definition sse2neon.h:1063
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
Definition sse2neon.h:2342
#define vreinterpretq_m128i_s64(x)
Definition sse2neon.h:420
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition sse2neon.h:2500
FORCE_INLINE int _sse2neon_clz(unsigned int x)
Definition sse2neon.h:8113
static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
Definition sse2neon.h:7857
#define vreinterpretq_m128_s64(x)
Definition sse2neon.h:401
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition sse2neon.h:5516
FORCE_INLINE void _mm_free(void *addr)
Definition sse2neon.h:1755
#define vreinterpretq_u32_m128(x)
Definition sse2neon.h:409
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)
Definition sse2neon.h:8174
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
Definition sse2neon.h:4174
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
Definition sse2neon.h:1900
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
Definition sse2neon.h:4817
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3727
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
Definition sse2neon.h:6936
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition sse2neon.h:1171
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
Definition sse2neon.h:8914
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
Definition sse2neon.h:3385
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
Definition sse2neon.h:3332
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6250
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
Definition sse2neon.h:1488
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
Definition sse2neon.h:4941
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
Definition sse2neon.h:4520
#define _MM_ROUND_UP
Definition sse2neon.h:345
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
Definition sse2neon.h:5958
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition sse2neon.h:5202
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
Definition sse2neon.h:1738
FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8347
int64x1_t __m64
Definition sse2neon.h:365
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
Definition sse2neon.h:7219
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
Definition sse2neon.h:4469
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
Definition sse2neon.h:5286
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition sse2neon.h:3135
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
Definition sse2neon.h:3395
#define vreinterpret_s32_m64(x)
Definition sse2neon.h:461
FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8383
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
Definition sse2neon.h:6475
#define vreinterpretq_m128d_u32(x)
Definition sse2neon.h:486
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
Definition sse2neon.h:3043
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
Definition sse2neon.h:4103
#define vreinterpret_m64_u64(x)
Definition sse2neon.h:448
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
Definition sse2neon.h:1974
#define __int64
Definition sse2neon.h:383
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
Definition sse2neon.h:796
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3263
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition sse2neon.h:7081
static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
Definition sse2neon.h:7927
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3290
#define vreinterpretq_u64_m128i(x)
Definition sse2neon.h:438
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
Definition sse2neon.h:4082
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
Definition sse2neon.h:6886
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
Definition sse2neon.h:4294
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
Definition sse2neon.h:9154
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition sse2neon.h:6745
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t)
Definition sse2neon.h:4855
#define _MM_ROUND_TOWARD_ZERO
Definition sse2neon.h:346
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
Definition sse2neon.h:757
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
Definition sse2neon.h:1208
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
Definition sse2neon.h:8860
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
Definition sse2neon.h:1809
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
Definition sse2neon.h:5643
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
Definition sse2neon.h:4386
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1290
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
Definition sse2neon.h:7165
FORCE_INLINE int _mm_cmpestra(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8216
#define SSE2NEON_CMPESTR_LIST
Definition sse2neon.h:8067
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
Definition sse2neon.h:5661
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition sse2neon.h:1150
FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
Definition sse2neon.h:1772
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
Definition sse2neon.h:1409
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition sse2neon.h:1280
FORCE_INLINE int _mm_movemask_ps(__m128 a)
Definition sse2neon.h:2129
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
Definition sse2neon.h:7097
FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
Definition sse2neon.h:4967
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition sse2neon.h:4761
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
Definition sse2neon.h:3463
FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
Definition sse2neon.h:9075
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:8823
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition sse2neon.h:6853
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
Definition sse2neon.h:6906
#define _mm_set_pd1
Definition sse2neon.h:4904
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4399
FORCE_INLINE void _mm_stream_si32(int *p, int a)
Definition sse2neon.h:5562
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
Definition sse2neon.h:7183
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
Definition sse2neon.h:6266
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3595
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:8702
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
Definition sse2neon.h:2989
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
Definition sse2neon.h:3970
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
Definition sse2neon.h:5477
FORCE_INLINE __m128d _mm_set1_pd(double d)
Definition sse2neon.h:4956
#define _MM_FROUND_TO_NEAREST_INT
Definition sse2neon.h:330
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition sse2neon.h:2886
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
Definition sse2neon.h:1643
#define vreinterpretq_m128d_u64(x)
Definition sse2neon.h:487
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
Definition sse2neon.h:4058
static const uint8_t _sse2neon_sbox[256]
Definition sse2neon.h:8586
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
Definition sse2neon.h:1272
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
Definition sse2neon.h:5616
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
Definition sse2neon.h:8459
#define SSE2NEON_AES_U3(p)
#define vreinterpret_s16_m64(x)
Definition sse2neon.h:460
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
Definition sse2neon.h:6057
FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
Definition sse2neon.h:4825
FORCE_INLINE __m128d _mm_load_sd(const double *p)
Definition sse2neon.h:4246
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
Definition sse2neon.h:1327
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
Definition sse2neon.h:3856
#define vreinterpretq_m128i_u32(x)
Definition sse2neon.h:424
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
Definition sse2neon.h:5679
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
Definition sse2neon.h:2022
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
Definition sse2neon.h:1043
FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8356
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
Definition sse2neon.h:2082
#define SSE2NEON_AES_U1(p)
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
Definition sse2neon.h:5636
FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8257
FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
Definition sse2neon.h:8097
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
Definition sse2neon.h:3063
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
Definition sse2neon.h:1017
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
Definition sse2neon.h:1160
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
Definition sse2neon.h:2767
FORCE_INLINE __m128 _mm_ceil_ps(__m128)
Definition sse2neon.h:6773
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
Definition sse2neon.h:7287
#define vreinterpretq_m128i_s32(x)
Definition sse2neon.h:419
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
Definition sse2neon.h:7382
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
Definition sse2neon.h:6698
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:8605
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
Definition sse2neon.h:734
FORCE_INLINE __m128 _mm_set_ss(float a)
Definition sse2neon.h:2459
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
Definition sse2neon.h:832
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4743
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
Definition sse2neon.h:823
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
Definition sse2neon.h:9127
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5652
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
Definition sse2neon.h:7237
#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix)
Definition sse2neon.h:7894
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
Definition sse2neon.h:3801
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
Definition sse2neon.h:5262
FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8374
#define SSE2NEON_BARRIER()
Definition sse2neon.h:180
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
Definition sse2neon.h:1880
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1254
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
Definition sse2neon.h:2304
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
Definition sse2neon.h:3091
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128)
Definition sse2neon.h:2071
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
Definition sse2neon.h:4128
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6488
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3375
#define vreinterpretq_f32_m128(x)
Definition sse2neon.h:404
FORCE_INLINE __m128 _mm_round_ps(__m128, int)
Definition sse2neon.h:7471
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
Definition sse2neon.h:788
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
Definition sse2neon.h:4650
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
Definition sse2neon.h:3471
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
Definition sse2neon.h:2171
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
Definition sse2neon.h:7107
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
Definition sse2neon.h:4848
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5411
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
Definition sse2neon.h:4719
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition sse2neon.h:1298
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5498
FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8338
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
Definition sse2neon.h:1033
FORCE_INLINE __m128d _mm_undefined_pd(void)
Definition sse2neon.h:5694
#define _MM_FROUND_TO_ZERO
Definition sse2neon.h:333
FORCE_INLINE __m128d _mm_set_pd(double, double)
Definition sse2neon.h:4891
#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix)
Definition sse2neon.h:8049
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
Definition sse2neon.h:3107
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
Definition sse2neon.h:806
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
Definition sse2neon.h:1871
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
Definition sse2neon.h:6895
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
Definition sse2neon.h:6833
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
Definition sse2neon.h:5976
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
Definition sse2neon.h:1572
#define _MM_FLUSH_ZERO_ON
Definition sse2neon.h:349
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
Definition sse2neon.h:8406
FORCE_INLINE void _mm_prefetch(char const *p, int i)
Definition sse2neon.h:2240
#define _MM_DENORMALS_ZERO_MASK
Definition sse2neon.h:352
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
Definition sse2neon.h:1617
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
Definition sse2neon.h:7618
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition sse2neon.h:3115
FORCE_INLINE void _mm_lfence(void)
Definition sse2neon.h:2557
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition sse2neon.h:1843
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
Definition sse2neon.h:4505
#define vreinterpretq_m128i_u16(x)
Definition sse2neon.h:423
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
Definition sse2neon.h:1592
_mm_hint
Definition sse2neon.h:686
@ _MM_HINT_T1
Definition sse2neon.h:689
@ _MM_HINT_T0
Definition sse2neon.h:688
@ _MM_HINT_T2
Definition sse2neon.h:690
@ _MM_HINT_NTA
Definition sse2neon.h:687
#define _MM_FROUND_CUR_DIRECTION
Definition sse2neon.h:334
int(* cmpestr_func_t)(__m128i a, int la, __m128i b, int lb)
Definition sse2neon.h:8090
int64x2_t __m128i
Definition sse2neon.h:375
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
Definition sse2neon.h:3072
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
Definition sse2neon.h:7552
#define SSE2NEON_AES_SBOX(w)
Definition sse2neon.h:8502
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
Definition sse2neon.h:2313
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
Definition sse2neon.h:5440
#define vreinterpretq_u16_m128i(x)
Definition sse2neon.h:436
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
Definition sse2neon.h:4005
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
Definition sse2neon.h:2108
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
Definition sse2neon.h:1049
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
Definition sse2neon.h:1688
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
Definition sse2neon.h:3221
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
Definition sse2neon.h:5534
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
Definition sse2neon.h:2945
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition sse2neon.h:2685
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
Definition sse2neon.h:4156
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
Definition sse2neon.h:5825
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
Definition sse2neon.h:5247
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
Definition sse2neon.h:6194
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
Definition sse2neon.h:2414
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
Definition sse2neon.h:4281
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
Definition sse2neon.h:5606
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
Definition sse2neon.h:5189
#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)
Definition sse2neon.h:8308
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
Definition sse2neon.h:4445
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
Definition sse2neon.h:1524
#define vreinterpret_s8_m64(x)
Definition sse2neon.h:459
FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
Definition sse2neon.h:1761
#define _mm_srli_si128(a, imm)
Definition sse2neon.h:5399
static const uint8_t _sse2neon_rsbox[256]
Definition sse2neon.h:8587
#define _mm_shuffle_epi32(a, imm)
Definition sse2neon.h:5063
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition sse2neon.h:2641
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
Definition sse2neon.h:5163
#define vreinterpret_u8_m64(x)
Definition sse2neon.h:454
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
Definition sse2neon.h:2717
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5793
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1458
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6574
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
Definition sse2neon.h:1236
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition sse2neon.h:2033
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
Definition sse2neon.h:874
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
Definition sse2neon.h:6003
FORCE_INLINE void _mm_pause(void)
Definition sse2neon.h:4803
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
Definition sse2neon.h:1927
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
Definition sse2neon.h:5871
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
Definition sse2neon.h:7561
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
Definition sse2neon.h:5424
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition sse2neon.h:3816
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition sse2neon.h:2180
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
Definition sse2neon.h:6727
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
Definition sse2neon.h:5856
FORCE_INLINE int _sse2neon_ctz(unsigned int x)
Definition sse2neon.h:8125
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
Definition sse2neon.h:2060
static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
Definition sse2neon.h:7966
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
Definition sse2neon.h:3354
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3529
#define SSE2NEON_AES_H0(x)
Definition sse2neon.h:8585
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
Definition sse2neon.h:6089
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
Definition sse2neon.h:5349
FORCE_INLINE int _mm_cmpestro(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8266
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition sse2neon.h:1985
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
Definition sse2neon.h:6203
FORCE_INLINE void _mm_mfence(void)
Definition sse2neon.h:2547
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
Definition sse2neon.h:1651
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
Definition sse2neon.h:1393
#define _MM_FROUND_NO_EXC
Definition sse2neon.h:335
#define vreinterpretq_s64_m128i(x)
Definition sse2neon.h:433
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
Definition sse2neon.h:4019
#define _MM_FLUSH_ZERO_OFF
Definition sse2neon.h:350
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6384
SIMDVec
Definition sse2neon.h:532
FORCE_INLINE void _mm_clflush(void const *p)
Definition sse2neon.h:3146
Definition sse2neon.h:694
uint16_t res0
Definition sse2neon.h:695
uint8_t bit23
Definition sse2neon.h:698
uint8_t res1
Definition sse2neon.h:696
uint8_t bit24
Definition sse2neon.h:699
uint8_t bit22
Definition sse2neon.h:697
uint8_t res2
Definition sse2neon.h:700
static int gettimeofday(struct timeval *tv, struct timezone *tz)
Definition time.h:48
for i
Definition volk_config_fixed.tmpl.h:13