#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_X86INTRIN
#error \
  "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE
// __to_masktype: reinterpret the given wrapper/vector as the equally sized
// signed-integer vector type (the mask representation).
template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
  {
    return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(
      __x._M_data);
  }

template <typename _TV,
          typename _TVT
          = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
          typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
  __to_masktype(_TV __x)
  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
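
// Illustrative usage sketch (not part of the original header): __to_masktype
// only reinterprets bits, it does not convert values. E.g., assuming a
// 4-element float wrapper __v:
//
//   _SimdWrapper<float, 4> __v = ...;
//   _SimdWrapper<int, 4> __m = __to_masktype(__v);  // same 128 bits, int lanes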
// __interleave128_lo: interleave the low halves of each 128-bit lane of
// __a and __b.
template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
          typename _Trait = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __interleave128_lo(const _Ap& __av, const _Bp& __bv)
  {
    const _Tp __a(__av);
    const _Tp __b(__bv);
    if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
      return _Tp{__a[0], __b[0]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[1], __b[1]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[2], __b[2], __a[3], __b[3]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
                 __a[6], __b[6], __a[7], __b[7]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[2], __b[2]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[4], __b[4], __a[5], __b[5]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
                 __a[10], __b[10], __a[11], __b[11]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[2], __b[2],
                 __a[4], __b[4], __a[6], __b[6]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
                 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
                 __a[12], __b[12], __a[13], __b[13]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
                 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
                 __a[26], __b[26], __a[27], __b[27]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
                 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
                 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
                 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
                 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
                 __b[55]};
    else
      __assert_unreachable<_Tp>();
  }
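
// Illustrative sketch (not part of the original header): for a 32-byte vector
// of 8 floats the interleave stays within each 128-bit lane, matching
// _mm256_unpacklo_ps:
//
//   __vector_type_t<float, 8> __a = {a0, a1, a2, a3, a4, a5, a6, a7};
//   __vector_type_t<float, 8> __b = {b0, b1, b2, b3, b4, b5, b6, b7};
//   // __interleave128_lo(__a, __b) == {a0, b0, a1, b1, a4, b4, a5, b5}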
// __is_zero: test whether the given (mask) vector has no bits set.
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr bool
  __is_zero(_Tp __a)
  {
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (__have_avx)
          {
            if constexpr (_TVT::template _S_is<float, 8>)
              return _mm256_testz_ps(__a, __a);
            else if constexpr (_TVT::template _S_is<double, 4>)
              return _mm256_testz_pd(__a, __a);
            else if constexpr (sizeof(_Tp) == 32)
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<float>)
              return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<double, 2>)
              return _mm_testz_pd(__a, __a);
            else
              return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
          }
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
                                 __intrin_bitcast<__m128i>(__a));
      }
    else if constexpr (sizeof(_Tp) <= 8)
      return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;

    const auto __b = __vector_bitcast<_LLong>(__a);
    if constexpr (sizeof(__b) == 16)
      return (__b[0] | __b[1]) == 0;
    else if constexpr (sizeof(__b) == 32)
      return __is_zero(__lo128(__b) | __hi128(__b));
    else if constexpr (sizeof(__b) == 64)
      return __is_zero(__lo256(__b) | __hi256(__b));
    else
      __assert_unreachable<_Tp>();
  }
// __movemask: collect sign bits via the movmsk instructions.
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
  __movemask(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_movemask_ps(__to_intrin(__a));
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_movemask_pd(__to_intrin(__a));
        else
          return _mm256_movemask_epi8(__to_intrin(__a));
      }
    else if constexpr (_TVT::template _S_is<float>)
      return _mm_movemask_ps(__to_intrin(__a));
    else if constexpr (_TVT::template _S_is<double>)
      return _mm_movemask_pd(__to_intrin(__a));
    else
      return _mm_movemask_epi8(__to_intrin(__a));
  }
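
// Illustrative sketch (not part of the original header): __movemask gathers
// the most significant bit of each element (each byte for integer vectors):
//
//   __m128 __v = _mm_setr_ps(-1.f, 2.f, -3.f, 4.f);
//   // __movemask(__v) == 0b0101  (elements 0 and 2 have their sign bit set)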
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testz(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
            else
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                 __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          return __movemask(0 == __and(__a, __b)) != 0;
      }
    else
      return __is_zero(__and(__a, __b));
  }
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (__builtin_is_constant_evaluated())
      return __is_zero(__andnot(__a, __b));

    if constexpr (sizeof(_TI) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_testc_ps(__a, __b);
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_testc_pd(__a, __b);
        else
          return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
      }
    else if constexpr (_TVT::template _S_is<float> && __have_avx)
      return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
    else if constexpr (_TVT::template _S_is<double> && __have_avx)
      return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
    else
      {
        static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
        return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                               __intrin_bitcast<__m128i>(__to_intrin(__b)));
      }
  }
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testnzc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testnzc_ps(__a, __b);
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testnzc_pd(__a, __b);
            else
              return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                   __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          return __movemask(0 == __and(__a, __b)) == 0
                 && __movemask(0 == __andnot(__a, __b)) == 0;
      }
    else
      return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
  }
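
// Illustrative sketch (not part of the original header) of the three PTEST
// flavors, expressed with the helpers used on the constant-evaluation paths:
//
//   __testz(__a, __b)   != 0  iff  __and(__a, __b)    is all zeros  (ZF)
//   __testc(__a, __b)   != 0  iff  __andnot(__a, __b) is all zeros  (CF)
//   __testnzc(__a, __b) != 0  iff  neither of the above holds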
// __xzyw: swap the two inner quarters of the vector.
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _Tp
  __xzyw(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 16)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
        return reinterpret_cast<_Tp>(
          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 32)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(
          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 64)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
                                                   __x[5], __x[2], __x[3],
                                                   __x[6], __x[7]});
      }
    else
      __assert_unreachable<_Tp>();
  }
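
// Illustrative sketch (not part of the original header): __xzyw on a 32-byte
// vector of 4 doubles swaps the two middle elements:
//
//   __vector_type_t<double, 4> __v = {__x, __y, __z, __w};
//   // __xzyw(__v) == {__x, __z, __y, __w}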
// Wrappers dispatching to the 128- or 256-bit maskload intrinsics, selected by
// the width of the mask __k.
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi32(const int* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi32(__ptr, __k);
    else
      return _mm256_maskload_epi32(__ptr, __k);
  }

template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi64(const _LLong* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi64(__ptr, __k);
    else
      return _mm256_maskload_epi64(__ptr, __k);
  }

template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_ps(const float* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_ps(__ptr, __k);
    else
      return _mm256_maskload_ps(__ptr, __k);
  }

template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_pd(const double* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_pd(__ptr, __k);
    else
      return _mm256_maskload_pd(__ptr, __k);
  }
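
// Illustrative sketch (not part of the original header): the __maskload_*
// helpers only dispatch on the mask width. With AVX2 available:
//
//   alignas(16) const int __data[4] = {1, 2, 3, 4};
//   __m128i __k = _mm_setr_epi32(-1, 0, -1, 0);   // load elements 0 and 2
//   __m128i __r = __maskload_epi32(__data, __k);  // {1, 0, 3, 0}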
// __movm<_Np, _Tp>: expand the bitmask __k into a vector mask (one element per
// bit) using the AVX-512 mask-to-vector builtins.
template <size_t _Np, typename _Tp, typename _Kp>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __movm(_Kp __k) noexcept
  {
    static_assert(is_unsigned_v<_Kp>);
    if constexpr (sizeof(_Tp) == 1 && __have_avx512bw)
      {
        if constexpr (_Np <= 16 && __have_avx512vl)
          return __builtin_ia32_cvtmask2b128(__k);
        else if constexpr (_Np <= 32 && __have_avx512vl)
          return __builtin_ia32_cvtmask2b256(__k);
        else
          return __builtin_ia32_cvtmask2b512(__k);
      }
    else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw)
      {
        if constexpr (_Np <= 8 && __have_avx512vl)
          return __builtin_ia32_cvtmask2w128(__k);
        else if constexpr (_Np <= 16 && __have_avx512vl)
          return __builtin_ia32_cvtmask2w256(__k);
        else
          return __builtin_ia32_cvtmask2w512(__k);
      }
    else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq)
      {
        if constexpr (_Np <= 4 && __have_avx512vl)
          return __builtin_ia32_cvtmask2d128(__k);
        else if constexpr (_Np <= 8 && __have_avx512vl)
          return __builtin_ia32_cvtmask2d256(__k);
        else
          return __builtin_ia32_cvtmask2d512(__k);
      }
    else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq)
      {
        if constexpr (_Np <= 2 && __have_avx512vl)
          return __builtin_ia32_cvtmask2q128(__k);
        else if constexpr (_Np <= 4 && __have_avx512vl)
          return __builtin_ia32_cvtmask2q256(__k);
        else
          return __builtin_ia32_cvtmask2q512(__k);
      }
    else
      __assert_unreachable<_Tp>();
  }
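
// Illustrative sketch (not part of the original header): with AVX-512BW+VL,
// __movm expands one mask bit per element, like the _mm*_movm_* intrinsics:
//
//   auto __m = __movm<16, char>(0x00ffu);  // == _mm_movm_epi8(0x00ff)
//   // elements 0..7 are -1 (all bits set), elements 8..15 are 0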
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
#include "simd_x86_conversions.h"
#endif

// Helpers identifying which intrinsic flavor (SSE/AVX/AVX-512, ps/pd) matches
// a given <_Tp, _Np> combination.
template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_ps()
  {
    return __have_sse && is_same_v<_Tp, float>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_pd()
  {
    return __have_sse2 && is_same_v<_Tp, double>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_ps()
  {
    return __have_avx && is_same_v<_Tp, float>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_pd()
  {
    return __have_avx && is_same_v<_Tp, double>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_ps()
  {
    return __have_avx512f && is_same_v<_Tp, float>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_pd()
  {
    return __have_avx512f && is_same_v<_Tp, double>
           && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

struct _MaskImplX86Mixin;
// _CommonImplX86: x86-specific overrides of _CommonImplBuiltin.
struct _CommonImplX86 : _CommonImplBuiltin
{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
  // _S_converts_via_decomposition: true if the conversion _From -> _To (into a
  // destination of _ToSize bytes) is known to decompose into scalar
  // conversions.
  template <typename _From, typename _To, size_t _ToSize>
    static constexpr bool
    _S_converts_via_decomposition()
    {
      if constexpr (is_integral_v<_From> && is_integral_v<_To>
                    && sizeof(_From) == 8 && _ToSize == 16)
        return (sizeof(_To) == 2 && !__have_ssse3)
               || (sizeof(_To) == 1 && !__have_avx512f);
      else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
        return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
                && _ToSize == 16)
               || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
                   && _ToSize == 16);
      else if constexpr (is_integral_v<_From> && is_floating_point_v<_To>
                         && sizeof(_From) == 8 && !__have_avx512dq)
        return (sizeof(_To) == 4 && _ToSize == 16)
               || (sizeof(_To) == 8 && _ToSize < 64);
      else
        return false;
    }

  template <typename _From, typename _To, size_t _ToSize>
    static inline constexpr bool __converts_via_decomposition_v
      = _S_converts_via_decomposition<_From, _To, _ToSize>();
#endif // _GLIBCXX_SIMD_WORKAROUND_PR85048
  using _CommonImplBuiltin::_S_store;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
    {
      constexpr size_t _Bytes = _Np * sizeof(_Tp);

      if (__builtin_is_constant_evaluated())
        _CommonImplBuiltin::_S_store(__x, __addr);
      else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
        {
          const auto __v = __to_intrin(__x);

          if constexpr (_Bytes & 1)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
                                     __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
                                        __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi8(__addr,
                                        0xffffffffffffffffull >> (64 - _Bytes),
                                        __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 2)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi16(__addr,
                                         0xffffffffull >> (32 - _Bytes / 2),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 4)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else
            {
              static_assert(
                _Bytes > 16,
                "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
                "- 1)) != 0 is impossible");
              if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
                                         __intrin_bitcast<__m512i>(__v));
            }
        }
      else
        _CommonImplBuiltin::_S_store(__x, __addr);
    }
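
  // Illustrative sketch (not part of the original header): for _Np = 3 chars
  // (_Bytes == 3) the AVX-512BW+VL branch above emits one masked store instead
  // of a byte-wise loop:
  //
  //   _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - 3) /* == 0b111 */,
  //                        __intrin_bitcast<__m128i>(__v));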
  template <size_t _Np, bool _Sanitized>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
    {
      if (__builtin_is_constant_evaluated())
        _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
      else if constexpr (__have_avx512bw_vl)
        _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
                            [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                              if constexpr (_Np <= 16)
                                return _mm_movm_epi8(__x._M_to_bits());
                              else if constexpr (_Np <= 32)
                                return _mm256_movm_epi8(__x._M_to_bits());
                              else if constexpr (_Np <= 64)
                                return _mm512_movm_epi8(__x._M_to_bits());
                              else
                                __assert_unreachable<_SizeConstant<_Np>>();
                            }()),
                      __mem);
      else if constexpr (__have_bmi2)
        {
          if constexpr (_Np <= 4)
            _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
          else
            __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
              [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                constexpr size_t __offset = __i * sizeof(size_t);
                constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
                if constexpr (__todo == 1)
                  __mem[__offset] = __x[__offset];
                else
                  {
                    const auto __bools =
#ifdef __x86_64__
                      _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
                                0x0101010101010101ULL);
#else
                      _pdep_u32(
                        __x.template _M_extract<__offset>()._M_to_bits(),
                        0x01010101U);
#endif
                    _S_store<__todo>(__bools, __mem + __offset);
                  }
              });
        }
      else if constexpr (__have_sse2 && _Np > 7)
        __execute_n_times<__div_roundup(_Np, 16)>(
          [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
            constexpr int __offset = __i * 16;
            constexpr int __todo = std::min(16, int(_Np) - __offset);
            const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
            __vector_type16_t<_UChar> __bools;
            if constexpr (__have_avx512f)
              {
                auto __as32bits
                  = _mm512_maskz_mov_epi32(__bits, __to_intrin(
                                             __vector_broadcast<16>(1)));
                auto __as16bits
                  = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
                                              __todo > 8 ? __hi256(__as32bits)
                                                         : __m256i()));
                __bools = __vector_bitcast<_UChar>(
                  _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
              }
            else
              {
                using _V = __vector_type_t<_UChar, 16>;
                auto __tmp = _mm_cvtsi32_si128(__bits);
                __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
                __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
                __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
                _V __tmp2 = reinterpret_cast<_V>(__tmp);
                __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
                             1, 2, 4, 8, 16, 32, 64, 128};
                __bools = (__tmp2 == 0) + 1;
              }
            _S_store<__todo>(__bools, __mem + __offset);
          });
      else
        _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
    }
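
  // Illustrative sketch (not part of the original header): the BMI2 branch
  // above spreads one mask bit into each byte via pdep, e.g. for 4 bools:
  //
  //   unsigned __bytes = _pdep_u32(0b0101u, 0x01010101u);  // 0x00010001
  //   // storing __bytes yields {true, false, true, false} in __mem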
  // _S_blend_avx512: returns __k ? __b : __a (per element), with __k an
  // AVX-512 bitmask.
  template <typename _Kp, typename _TV>
    _GLIBCXX_SIMD_INTRINSIC static _TV
    _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
    {
      static_assert(__is_vector_type_v<_TV>);
      using _Tp = typename _VectorTraits<_TV>::value_type;
      static_assert(sizeof(_TV) >= 16);
      static_assert(sizeof(_Tp) <= 8);
#ifdef __clang__
      return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a;
#else
      using _IntT
        = conditional_t<(sizeof(_Tp) > 2),
                        conditional_t<sizeof(_Tp) == 4, int, long long>,
                        conditional_t<sizeof(_Tp) == 1, char, short>>;
      [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
      [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
      if constexpr (sizeof(_TV) == 64)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 32)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 16)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
        }
#endif
    }
  // _S_blend_intrin: returns __k ? __b : __a, selecting on the sign bit of
  // each element (each byte for the integer variants) of __k.
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static _Tp
    _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
    {
      static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
      constexpr struct
      {
        _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
                                                  __m128 __k) const noexcept
        {
          return __builtin_ia32_blendvps(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
                                                   __m128d __k) const noexcept
        {
          return __builtin_ia32_blendvpd(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
                                                   __m128i __k) const noexcept
        {
          return reinterpret_cast<__m128i>(
            __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
                                       reinterpret_cast<__v16qi>(__b),
                                       reinterpret_cast<__v16qi>(__k)));
        }
        _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
                                                  __m256 __k) const noexcept
        {
          return __builtin_ia32_blendvps256(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
                                                   __m256d __k) const noexcept
        {
          return __builtin_ia32_blendvpd256(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
                                                   __m256i __k) const noexcept
        {
          if constexpr (__have_avx2)
            return reinterpret_cast<__m256i>(
              __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
                                         reinterpret_cast<__v32qi>(__b),
                                         reinterpret_cast<__v32qi>(__k)));
          else
            return reinterpret_cast<__m256i>(
              __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
                                         reinterpret_cast<__v8sf>(__b),
                                         reinterpret_cast<__v8sf>(__k)));
        }
      } __eval;
      return __eval(__a, __b, __k);
    }
  // _S_blend: returns, per element, __k[i] ? __at1[i] : __at0[i].
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
             _SimdWrapper<_Tp, _Np> __at1)
    {
      static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
      if (__k._M_is_constprop() && __at0._M_is_constprop()
          && __at1._M_is_constprop())
        return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
                 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   return __k[__i] ? __at1[__i] : __at0[__i];
                 });
      else if constexpr (sizeof(__at0) == 64
                         || (__have_avx512vl && sizeof(__at0) >= 16))
        return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
      else
        {
          static_assert((__have_avx512vl && sizeof(__at0) < 16)
                        || !__have_avx512vl);
          constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
          return __vector_bitcast<_Tp, _Np>(
            _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
                            __vector_bitcast<_Tp, __size>(__at1)));
        }
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
             _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
    {
      const auto __kk = __wrapper_bitcast<_Tp>(__k);
      if (__builtin_is_constant_evaluated()
          || (__kk._M_is_constprop() && __at0._M_is_constprop()
              && __at1._M_is_constprop()))
        {
          auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
          if (__r._M_is_constprop())
            return __r;
        }
      if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
                    && (sizeof(_Tp) >= 4 || __have_avx512bw))
        // convert the vector mask to a bitmask and use the overload above
        return _S_blend(
          _SimdWrapper<bool, _Np>(
            __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
              ._M_to_bits()),
          __at0, __at1);
      else
        {
          if constexpr (__have_sse4_1)
            return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
                                   __to_intrin(__at1));
          else
            return __or(__andnot(__kk, __at0), __and(__kk, __at1));
        }
    }
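
  // Illustrative sketch (not part of the original header): both _S_blend
  // overloads compute, per element, __k[__i] ? __at1[__i] : __at0[__i].
  // E.g. an AVX-512 bitmask 0b0011 over 4 ints takes elements 0 and 1 from
  // __at1 and elements 2 and 3 from __at0.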
};

// _SimdImplX86: ABI-specific simd implementation built on _SimdImplBuiltin.
template <typename _Abi>
  struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size
        = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw   ? 64
          : (is_floating_point_v<_Tp> && __have_avx) || __have_avx2 ? 32
                                                                    : 16;

    using _MaskImpl = typename _Abi::_MaskImpl;
898 template <
typename _Tp,
size_t _Np,
typename _Up>
899 static inline _SimdWrapper<_Tp, _Np>
900 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
901 const _Up* __mem) noexcept
903 static_assert(_Np == _S_size<_Tp>);
904 if constexpr (is_same_v<_Tp, _Up> ||
905 (
sizeof(_Tp) ==
sizeof(_Up)
907 _Tp> == is_integral_v<_Up>)
911 [[maybe_unused]]
const auto __intrin = __to_intrin(__merge);
912 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
915 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
916 if constexpr (
sizeof(__intrin) == 16)
917 __merge = __vector_bitcast<_Tp, _Np>(
918 _mm_mask_loadu_epi8(__intrin, __kk, __mem));
919 else if constexpr (sizeof(__merge) == 32)
920 __merge = __vector_bitcast<_Tp, _Np>(
921 _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
922 else if constexpr (sizeof(__merge) == 64)
923 __merge = __vector_bitcast<_Tp, _Np>(
924 _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
926 __assert_unreachable<_Tp>();
928 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
931 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
932 if constexpr (
sizeof(__intrin) == 16)
933 __merge = __vector_bitcast<_Tp, _Np>(
934 _mm_mask_loadu_epi16(__intrin, __kk, __mem));
935 else if constexpr (sizeof(__intrin) == 32)
936 __merge = __vector_bitcast<_Tp, _Np>(
937 _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
938 else if constexpr (sizeof(__intrin) == 64)
939 __merge = __vector_bitcast<_Tp, _Np>(
940 _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
942 __assert_unreachable<_Tp>();
944 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
945 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
947 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
948 if constexpr (
sizeof(__intrin) == 16)
949 __merge = __vector_bitcast<_Tp, _Np>(
950 _mm_mask_loadu_epi32(__intrin, __kk, __mem));
951 else if constexpr (sizeof(__intrin) == 32)
952 __merge = __vector_bitcast<_Tp, _Np>(
953 _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
954 else if constexpr (sizeof(__intrin) == 64)
955 __merge = __vector_bitcast<_Tp, _Np>(
956 _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
958 __assert_unreachable<_Tp>();
960 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
961 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
963 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
964 if constexpr (
sizeof(__intrin) == 16)
965 __merge = __vector_bitcast<_Tp, _Np>(
966 _mm_mask_loadu_ps(__intrin, __kk, __mem));
967 else if constexpr (sizeof(__intrin) == 32)
968 __merge = __vector_bitcast<_Tp, _Np>(
969 _mm256_mask_loadu_ps(__intrin, __kk, __mem));
970 else if constexpr (sizeof(__intrin) == 64)
971 __merge = __vector_bitcast<_Tp, _Np>(
972 _mm512_mask_loadu_ps(__intrin, __kk, __mem));
974 __assert_unreachable<_Tp>();
976 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
977 && is_integral_v<_Up>)
979 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
981 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
982 __vector_bitcast<_Tp, _Np>(
983 __maskload_epi32(reinterpret_cast<const int*>(__mem),
986 else if constexpr (__have_avx &&
sizeof(_Tp) == 4)
988 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
990 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
991 __vector_bitcast<_Tp, _Np>(
992 __maskload_ps(reinterpret_cast<const float*>(__mem),
995 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
996 &&
sizeof(_Tp) == 8 && is_integral_v<_Up>)
998 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
999 if constexpr (
sizeof(__intrin) == 16)
1000 __merge = __vector_bitcast<_Tp, _Np>(
1001 _mm_mask_loadu_epi64(__intrin, __kk, __mem));
1002 else if constexpr (sizeof(__intrin) == 32)
1003 __merge = __vector_bitcast<_Tp, _Np>(
1004 _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
1005 else if constexpr (sizeof(__intrin) == 64)
1006 __merge = __vector_bitcast<_Tp, _Np>(
1007 _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
1009 __assert_unreachable<_Tp>();
1011 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
1012 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
1014 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1015 if constexpr (
sizeof(__intrin) == 16)
1016 __merge = __vector_bitcast<_Tp, _Np>(
1017 _mm_mask_loadu_pd(__intrin, __kk, __mem));
1018 else if constexpr (sizeof(__intrin) == 32)
1019 __merge = __vector_bitcast<_Tp, _Np>(
1020 _mm256_mask_loadu_pd(__intrin, __kk, __mem));
1021 else if constexpr (sizeof(__intrin) == 64)
1022 __merge = __vector_bitcast<_Tp, _Np>(
1023 _mm512_mask_loadu_pd(__intrin, __kk, __mem));
1025 __assert_unreachable<_Tp>();
1027 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1028 && is_integral_v<_Up>)
1030 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
1032 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
1033 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
1034 reinterpret_cast<const _LLong*>(__mem),
1035 __to_intrin(__k))));
1037 else if constexpr (__have_avx &&
sizeof(_Tp) == 8)
1039 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
1041 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
1042 __vector_bitcast<_Tp, _Np>(
1043 __maskload_pd(reinterpret_cast<const double*>(__mem),
1044 __to_intrin(__k))));
1047 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1048 [&](
auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1049 __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1077 __merge = _Base::_S_masked_load(__merge, __k, __mem);
1083 template <
typename _Tp,
size_t _Np>
1084 _GLIBCXX_SIMD_INTRINSIC
static void 1085 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper<bool, _Np> __k)
1087 [[maybe_unused]]
const auto __vi = __to_intrin(__v);
1088 if constexpr (
sizeof(__vi) == 64)
1090 static_assert(
sizeof(__v) == 64 && __have_avx512f);
1091 if constexpr (__have_avx512bw &&
sizeof(_Tp) == 1)
1092 _mm512_mask_storeu_epi8(__mem, __k, __vi);
1093 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
1094 _mm512_mask_storeu_epi16(__mem, __k, __vi);
1095 else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
1097 if constexpr (is_integral_v<_Tp>)
1098 _mm512_mask_storeu_epi32(__mem, __k, __vi);
1100 _mm512_mask_storeu_ps(__mem, __k, __vi);
1102 else if constexpr (__have_avx512f &&
sizeof(_Tp) == 8)
1104 if constexpr (is_integral_v<_Tp>)
1105 _mm512_mask_storeu_epi64(__mem, __k, __vi);
1107 _mm512_mask_storeu_pd(__mem, __k, __vi);
1109 #if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32 1111 else if constexpr (__have_sse2)
1113 using _M = __vector_type_t<_Tp, _Np>;
1114 using _MVT = _VectorTraits<_M>;
1115 _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
1116 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
1117 reinterpret_cast<char*>(__mem));
1118 _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
1119 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1120 __k._M_data >> 1 * _MVT::_S_full_size)),
1121 reinterpret_cast<char*>(__mem) + 1 * 16);
1122 _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
1123 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1124 __k._M_data >> 2 * _MVT::_S_full_size)),
1125 reinterpret_cast<char*>(__mem) + 2 * 16);
1126 if constexpr (_Np > 48 /
sizeof(_Tp))
1127 _mm_maskmoveu_si128(
1128 __auto_bitcast(__extract<3, 4>(__v._M_data)),
1129 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1130 __k._M_data >> 3 * _MVT::_S_full_size)),
1131 reinterpret_cast<char*>(__mem) + 3 * 16);
1135 __assert_unreachable<_Tp>();
1137 else if constexpr (
sizeof(__vi) == 32)
1139 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1140 _mm256_mask_storeu_epi8(__mem, __k, __vi);
1141 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1142 _mm256_mask_storeu_epi16(__mem, __k, __vi);
1143 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1145 if constexpr (is_integral_v<_Tp>)
1146 _mm256_mask_storeu_epi32(__mem, __k, __vi);
1148 _mm256_mask_storeu_ps(__mem, __k, __vi);
1150 else if constexpr (__have_avx512vl &&
sizeof(_Tp) == 8)
1152 if constexpr (is_integral_v<_Tp>)
1153 _mm256_mask_storeu_epi64(__mem, __k, __vi);
1155 _mm256_mask_storeu_pd(__mem, __k, __vi);
1157 else if constexpr (__have_avx512f
1158 && (
sizeof(_Tp) >= 4 || __have_avx512bw))
1161 _S_masked_store_nocvt(
1162 _SimdWrapper64<_Tp>(
1163 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
1164 __mem, _SimdWrapper<
bool, 64 /
sizeof(_Tp)>(__k._M_data));
1167 _S_masked_store_nocvt(__v, __mem,
1168 _MaskImpl::template _S_to_maskvector<
1169 __int_for_sizeof_t<_Tp>, _Np>(__k));
1171 else if constexpr (
sizeof(__vi) == 16)
1173 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1174 _mm_mask_storeu_epi8(__mem, __k, __vi);
1175 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1176 _mm_mask_storeu_epi16(__mem, __k, __vi);
1177 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1179 if constexpr (is_integral_v<_Tp>)
1180 _mm_mask_storeu_epi32(__mem, __k, __vi);
1182 _mm_mask_storeu_ps(__mem, __k, __vi);
1184 else if constexpr (__have_avx512vl &&
sizeof(_Tp) == 8)
1186 if constexpr (is_integral_v<_Tp>)
1187 _mm_mask_storeu_epi64(__mem, __k, __vi);
1189 _mm_mask_storeu_pd(__mem, __k, __vi);
1191 else if constexpr (__have_avx512f
1192 && (
sizeof(_Tp) >= 4 || __have_avx512bw))
1195 _S_masked_store_nocvt(
1196 _SimdWrapper64<_Tp>(
1197 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
1198 __mem, _SimdWrapper<
bool, 64 /
sizeof(_Tp)>(__k._M_data));
1201 _S_masked_store_nocvt(__v, __mem,
1202 _MaskImpl::template _S_to_maskvector<
1203 __int_for_sizeof_t<_Tp>, _Np>(__k));
1206 __assert_unreachable<_Tp>();
1209 template <
typename _Tp,
size_t _Np>
1210 _GLIBCXX_SIMD_INTRINSIC
static void 1211 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1212 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
1214 if constexpr (
sizeof(__v) <= 16)
1216 [[maybe_unused]]
const auto __vi
1217 = __intrin_bitcast<__m128i>(__as_vector(__v));
1218 [[maybe_unused]]
const auto __ki
1219 = __intrin_bitcast<__m128i>(__as_vector(__k));
1220 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1221 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
1222 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1223 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
1224 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1225 && is_integral_v<_Tp>)
1226 _mm_maskstore_epi32(reinterpret_cast<
int*>(__mem), __ki, __vi);
1227 else if constexpr (__have_avx && sizeof(_Tp) == 4)
1228 _mm_maskstore_ps(reinterpret_cast<
float*>(__mem), __ki,
1229 __vector_bitcast<
float>(__vi));
1230 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1231 && is_integral_v<_Tp>)
1232 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
1233 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1234 _mm_maskstore_pd(reinterpret_cast<
double*>(__mem), __ki,
1235 __vector_bitcast<
double>(__vi));
1236 else if constexpr (__have_sse2)
1237 _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<
char*>(__mem));
1239 else if constexpr (sizeof(__v) == 32)
1241 [[maybe_unused]]
const auto __vi
1242 = __intrin_bitcast<__m256i>(__as_vector(__v));
1243 [[maybe_unused]]
const auto __ki
1244 = __intrin_bitcast<__m256i>(__as_vector(__k));
1245 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1246 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
1247 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1248 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
1249 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1250 && is_integral_v<_Tp>)
1251 _mm256_maskstore_epi32(reinterpret_cast<
int*>(__mem), __ki, __vi);
1252 else if constexpr (sizeof(_Tp) == 4)
1253 _mm256_maskstore_ps(reinterpret_cast<
float*>(__mem), __ki,
1254 __vector_bitcast<
float>(__v));
1255 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1256 && is_integral_v<_Tp>)
1257 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
1259 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1260 _mm256_maskstore_pd(reinterpret_cast<
double*>(__mem), __ki,
1261 __vector_bitcast<
double>(__v));
1262 else if constexpr (__have_sse2)
1264 _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
1265 reinterpret_cast<char*>(__mem));
1266 _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
1267 reinterpret_cast<char*>(__mem) + 16);
1271 __assert_unreachable<_Tp>();
1276 template <
typename _Tp,
size_t _Np,
typename _Up>
1277 _GLIBCXX_SIMD_INTRINSIC
static void 1278 _S_masked_store(
const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
1279 const _MaskMember<_Tp> __k) noexcept
1281 if constexpr (is_integral_v<
1282 _Tp> && is_integral_v<_Up> &&
sizeof(_Tp) >
sizeof(_Up)
1283 && __have_avx512f && (
sizeof(_Tp) >= 4 || __have_avx512bw)
1284 && (
sizeof(__v) == 64 || __have_avx512vl))
1286 const auto __vi = __to_intrin(__v);
1287 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1288 if constexpr (
sizeof(_Tp) == 8 &&
sizeof(_Up) == 4
1289 &&
sizeof(__vi) == 64)
1290 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1291 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1292 && sizeof(__vi) == 32)
1293 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1294 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1295 && sizeof(__vi) == 16)
1296 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1297 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1298 && sizeof(__vi) == 64)
1299 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1300 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1301 && sizeof(__vi) == 32)
1302 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1303 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1304 && sizeof(__vi) == 16)
1305 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1306 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1307 && sizeof(__vi) == 64)
1308 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1309 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1310 && sizeof(__vi) == 32)
1311 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1312 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1313 && sizeof(__vi) == 16)
1314 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1315 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1316 && sizeof(__vi) == 64)
1317 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1318 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1319 && sizeof(__vi) == 32)
1320 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1321 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1322 && sizeof(__vi) == 16)
1323 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1324 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1325 && sizeof(__vi) == 64)
1326 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1327 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1328 && sizeof(__vi) == 32)
1329 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1330 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1331 && sizeof(__vi) == 16)
1332 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1333 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1334 && sizeof(__vi) == 64)
1335 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1336 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1337 && sizeof(__vi) == 32)
1338 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1339 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1340 && sizeof(__vi) == 16)
1341 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1343 __assert_unreachable<_Tp>();
1346 _Base::_S_masked_store(__v, __mem, __k);
1351 template <typename _V, typename _VVT = _VectorTraits<_V>>
1352 _GLIBCXX_SIMD_INTRINSIC static constexpr _V
1353 _S_multiplies(_V __x, _V __y)
1355 using _Tp =
typename _VVT::value_type;
1356 if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
1357 || __y._M_is_constprop())
1358 return __as_vector(__x) * __as_vector(__y);
1359 else if constexpr (
sizeof(_Tp) == 1)
1361 if constexpr (
sizeof(_V) == 2)
1363 const auto __xs =
reinterpret_cast<short>(__x._M_data);
1364 const auto __ys =
reinterpret_cast<short>(__y._M_data);
1365 return reinterpret_cast<__vector_type_t<_Tp, 2>
>(short(
1366 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
1368 else if constexpr (
sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
1370 const auto __xi =
reinterpret_cast<int>(__x._M_data);
1371 const auto __yi =
reinterpret_cast<int>(__y._M_data);
1372 return reinterpret_cast<__vector_type_t<_Tp, 3>
>(
1373 ((__xi * __yi) & 0xff)
1374 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1375 | ((__xi >> 16) * (__yi & 0xff0000)));
1377 else if constexpr (
sizeof(_V) == 4)
1379 const auto __xi =
reinterpret_cast<int>(__x._M_data);
1380 const auto __yi =
reinterpret_cast<int>(__y._M_data);
1381 return reinterpret_cast<__vector_type_t<_Tp, 4>
>(
1382 ((__xi * __yi) & 0xff)
1383 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1384 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
1385 | ((__xi >> 24) * (__yi & 0xff000000u)));
1387 else if constexpr (
sizeof(_V) == 8 && __have_avx2
1388 && is_signed_v<_Tp>)
1389 return __convert<typename _VVT::type>(
1390 __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
1391 * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
1392 else if constexpr (
sizeof(_V) == 8 && __have_avx2
1393 && is_unsigned_v<_Tp>)
1394 return __convert<typename _VVT::type>(
1395 __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
1396 * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
1400 constexpr
size_t __full_size = _VVT::_S_full_size;
1401 constexpr
int _Np =
sizeof(_V) >= 16 ? __full_size / 2 : 8;
1402 using _ShortW = _SimdWrapper<short, _Np>;
1403 const _ShortW __even = __vector_bitcast<short, _Np>(__x)
1404 * __vector_bitcast<short, _Np>(__y);
1405 _ShortW __high_byte = _ShortW()._M_data - 256;
1408 = (__vector_bitcast<short, _Np>(__x) >> 8)
1409 * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
1410 if constexpr (__have_avx512bw &&
sizeof(_V) > 2)
1411 return _CommonImplX86::_S_blend_avx512(
1412 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
1413 __vector_bitcast<_Tp>(__odd));
1414 else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
1415 return _CommonImplX86::_S_blend_intrin(__to_intrin(
1417 __to_intrin(__even),
1418 __to_intrin(__odd));
1421 __or(__andnot(__high_byte, __even), __odd));
1425 return _Base::_S_multiplies(__x, __y);
1430 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993 1431 template <
typename _Tp,
size_t _Np>
1432 _GLIBCXX_SIMD_INTRINSIC
static constexpr _SimdWrapper<_Tp, _Np>
1433 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1435 if (!__builtin_is_constant_evaluated()
1436 && !__builtin_constant_p(__y._M_data))
1437 if constexpr (is_integral_v<_Tp> &&
sizeof(_Tp) <= 4)
1456 using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
1457 constexpr
size_t __n_intermediate
1458 =
std::min(_Np, (__have_avx512f ? 64
1462 using _FloatV = __vector_type_t<_Float, __n_intermediate>;
1463 constexpr
size_t __n_floatv
1464 = __div_roundup(_Np, __n_intermediate);
1465 using _R = __vector_type_t<_Tp, _Np>;
1466 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
1467 const auto __yf = __convert_all<_FloatV, __n_floatv>(
1468 _Abi::__make_padding_nonzero(__as_vector(__y)));
1469 return __call_with_n_evaluations<__n_floatv>(
1470 [](
auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1471 return __vector_convert<_R>(__quotients...);
1473 [&__xf, &__yf](
auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1474 -> _SimdWrapper<_Float, __n_intermediate>
1476 #if __RECIPROCAL_MATH__ 1482 if constexpr (__have_avx)
1486 if constexpr (
sizeof(_Tp) == 4)
1487 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}
" 1489 : "x
"(__xf[__i]), "x
"(__yf[__i])); 1491 asm("vdivps\t{%2, %1, %0|%0, %1, %2}
" 1493 : "x
"(__xf[__i]), "x
"(__yf[__i])); 1498 if constexpr (sizeof(_Tp) == 4) 1499 asm("divpd\t{%1, %0|%0, %1}
" 1503 asm("divps\t{%1, %0|%0, %1}
" 1509 return __xf[__i] / __yf[__i]; 1513 /* 64-bit int division is potentially optimizable via double division if 1514 * the value in __x is small enough and the conversion between 1515 * int<->double is efficient enough: 1516 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> && 1519 if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1521 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull, 1522 0xffe0'0000'0000'0000ull})) 1524 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data) 1529 return _Base::_S_divides(__x, __y); 1532 using _Base::_S_divides; 1533 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993 1537 template <typename _Tp, size_t _Np> 1538 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1539 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1541 if (__builtin_is_constant_evaluated() 1542 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8) 1543 return _Base::_S_modulus(__x, __y); 1545 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y))); 1549 // _S_bit_shift_left {{{ 1550 // Notes on UB. C++2a [expr.shift] says: 1551 // -1- [...] The operands shall be of integral or unscoped enumeration type 1552 // and integral promotions are performed. The type of the result is that 1553 // of the promoted left operand. The behavior is undefined if the right 1554 // operand is negative, or greater than or equal to the width of the 1555 // promoted left operand. 1556 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo 1557 // 2^N, where N is the width of the type of the result. 1559 // C++17 [expr.shift] says: 1560 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated 1561 // bits are zero-filled. If E1 has an unsigned type, the value of the 1562 // result is E1 × 2^E2 , reduced modulo one more than the maximum value 1563 // representable in the result type. Otherwise, if E1 has a signed type 1564 // and non-negative value, and E1 × 2^E2 is representable in the 1565 // corresponding unsigned type of the result type, then that value, 1566 // converted to the result type, is the resulting value; otherwise, the 1567 // behavior is undefined. 1570 // With C++2a signed and unsigned types have the same UB 1572 // - left shift is not UB for 0 <= RHS < max(32, #bits(T)) 1574 // With C++17 there's little room for optimizations because the standard 1575 // requires all shifts to happen on promoted integrals (i.e. int). Thus, 1576 // short and char shifts must assume shifts affect bits of neighboring 1578 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1579 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1580 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1581 _S_bit_shift_left(_Tp __xx, int __y) 1583 using _V = typename _TVT::type; 1584 using _Up = typename _TVT::value_type; 1586 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1587 if (__builtin_is_constant_evaluated()) 1589 #if __cplusplus > 201703 1590 // after C++17, signed shifts have no UB, and behave just like unsigned 1592 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) 1593 return __vector_bitcast<_Up>( 1594 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1597 else if constexpr (sizeof(_Up) == 1) 1599 // (cf. 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) 1600 if (__builtin_constant_p(__y)) 1611 else if (__y > 2 && __y < 8) 1613 if constexpr (sizeof(__x) > sizeof(unsigned)) 1615 const _UChar __mask = 0xff << __y; // precomputed vector 1616 return __vector_bitcast<_Up>( 1617 __vector_bitcast<_UChar>( 1618 __vector_bitcast<unsigned>(__x) << __y) 1623 const unsigned __mask 1624 = (0xff & (0xff << __y)) * 0x01010101u; 1625 return reinterpret_cast<_V>( 1626 static_cast<__int_for_sizeof_t<_V>>( 1628 reinterpret_cast<__int_for_sizeof_t<_V>>(__x) 1633 else if (__y >= 8 && __y < 32) 1636 __builtin_unreachable(); 1638 // general strategy in the following: use an sllv instead of sll 1639 // instruction, because it's 2 to 4 times faster: 1640 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) 1641 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( 1642 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), 1643 _mm256_set1_epi16(__y)))); 1644 else if constexpr (__have_avx512bw && sizeof(__x) == 32) 1645 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1646 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), 1647 _mm512_set1_epi16(__y)))); 1648 else if constexpr (__have_avx512bw && sizeof(__x) == 64) 1650 const auto __shift = _mm512_set1_epi16(__y); 1651 return __vector_bitcast<_Up>( 1652 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1653 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), 1654 _mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1655 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); 1657 else if constexpr (__have_avx2 && sizeof(__x) == 32) 1660 const auto __shift = _mm_cvtsi32_si128(__y); 1662 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift); 1663 __k |= _mm256_srli_epi16(__k, 8); 1664 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) 1667 const _Up __k = 0xff << __y; 1668 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y) 1674 const auto __shift = _mm_cvtsi32_si128(__y); 1676 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift); 1677 __k |= _mm_srli_epi16(__k, 8); 1678 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); 1684 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1685 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1686 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y) 1688 using _V = typename _TVT::type; 1689 using _Up = typename _TVT::value_type; 1691 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1692 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1693 if (__builtin_is_constant_evaluated()) 1695 #if __cplusplus > 201703 1696 // after C++17, signed shifts have no UB, and behave just like unsigned 1698 else if constexpr (is_signed_v<_Up>) 1699 return __vector_bitcast<_Up>( 1700 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1701 __vector_bitcast<make_unsigned_t<_Up>>(__y))); 1703 else if constexpr (sizeof(_Up) == 1) 1705 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1706 return __vector_bitcast<_Up>(__concat( 1707 _mm512_cvtepi16_epi8( 1708 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), 1709 _mm512_cvtepu8_epi16(__lo256(__iy)))), 1710 _mm512_cvtepi16_epi8( 1711 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), 1712 _mm512_cvtepu8_epi16(__hi256(__iy)))))); 1713 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1714 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1715 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), 1716 _mm512_cvtepu8_epi16(__iy)))); 1717 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) 1718 return __intrin_bitcast<_V>( 1719 
_mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), 1720 _mm_cvtepu8_epi16(__iy)))); 1721 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1722 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1723 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), 1724 _mm256_cvtepu8_epi16(__iy)))); 1725 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1726 return __intrin_bitcast<_V>( 1727 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1728 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), 1729 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); 1730 else if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1733 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5); 1735 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1737 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1738 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); 1741 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1743 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1744 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); 1746 auto __x1 = __x + __x; 1747 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1748 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); 1750 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1752 else if constexpr (sizeof(__x) == 16) 1755 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5); 1757 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1759 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; 1762 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1764 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; 1766 auto __x1 = __x + __x; 1767 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x; 1769 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1774 else if constexpr (sizeof(_Up) == 2) 1776 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1777 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); 1778 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) 1779 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); 1780 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1781 return __vector_bitcast<_Up>( 1782 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), 1783 _mm512_castsi256_si512(__iy)))); 1784 else if constexpr (sizeof __ix == 32 && __have_avx2) 1786 const auto __ux = __vector_bitcast<unsigned>(__x); 1787 const auto __uy = __vector_bitcast<unsigned>(__y); 1788 return __vector_bitcast<_Up>(_mm256_blend_epi16( 1789 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1790 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1792 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1793 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); 1794 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1795 return __intrin_bitcast<_V>( 1796 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), 1797 _mm512_castsi128_si512(__iy)))); 1798 else if constexpr (sizeof __ix == 16 && __have_avx2) 1800 const auto __ux = __vector_bitcast<unsigned>(__ix); 1801 const auto __uy = __vector_bitcast<unsigned>(__iy); 1802 return __intrin_bitcast<_V>(_mm_blend_epi16( 1803 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1804 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1806 else if constexpr (sizeof __ix == 16) 1808 using _Float4 = __vector_type_t<float, 4>; 1809 using _Int4 = __vector_type_t<int, 4>; 1810 using _UInt4 = __vector_type_t<unsigned, 4>; 1812 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 
3))); 1814 * __intrin_bitcast<_V>( 1815 __vector_convert<_Int4>(_SimdWrapper<float, 4>( 1816 reinterpret_cast<_Float4>(__yu << 23))) 1817 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>( 1818 reinterpret_cast<_Float4>((__yu >> 16) << 23))) 1822 __assert_unreachable<_Tp>(); 1824 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 1826 // latency is suboptimal, but throughput is at full speedup 1827 return __intrin_bitcast<_V>( 1828 __vector_bitcast<unsigned>(__ix) 1829 * __vector_convert<__vector_type16_t<int>>( 1830 _SimdWrapper<float, 4>(__vector_bitcast<float>( 1831 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000)))); 1832 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 1835 const auto __lo = _mm_sll_epi64(__ix, __iy); 1837 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); 1838 if constexpr (__have_sse4_1) 1839 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); 1841 return __vector_bitcast<_Up>( 1842 _mm_move_sd(__vector_bitcast<double>(__hi), 1843 __vector_bitcast<double>(__lo))); 1848 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 1851 // _S_bit_shift_right {{{ 1852 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1853 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1854 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1855 _S_bit_shift_right(_Tp __xx, int __y) 1857 using _V = typename _TVT::type; 1858 using _Up = typename _TVT::value_type; 1860 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1861 if (__builtin_is_constant_evaluated()) 1863 else if (__builtin_constant_p(__y) 1865 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__)) 1867 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ 1868 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) 1871 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ 1872 return __intrin_bitcast<_V>( 1873 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix) 1876 | (__vector_bitcast<_UShort>( 1877 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8) 1881 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected 1882 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ 1885 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32) 1886 & _Up(0xffff'ffff'0000'0000ull)) 1887 | __vector_bitcast<_Up>( 1888 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix) 1892 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) 1894 | __vector_bitcast<_Up>( 1895 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll) 1903 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1904 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1905 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) 1907 using _V = typename _TVT::type; 1908 using _Up = typename _TVT::value_type; 1910 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1911 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1912 if (__builtin_is_constant_evaluated() 1913 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 1915 else if constexpr (sizeof(_Up) == 1) //{{{ 1917 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) 1918 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( 1919 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), 1920 _mm_cvtepi8_epi16(__iy)) 1921 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), 1922 _mm_cvtepu8_epi16(__iy)))); 1923 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl) 1924 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1926 ? 
_mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), 1927 _mm256_cvtepi8_epi16(__iy)) 1928 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), 1929 _mm256_cvtepu8_epi16(__iy)))); 1930 else if constexpr (sizeof(__x) == 32 && __have_avx512bw) 1931 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1933 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), 1934 _mm512_cvtepi8_epi16(__iy)) 1935 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), 1936 _mm512_cvtepu8_epi16(__iy)))); 1937 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) 1938 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1939 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1940 0x5555'5555'5555'5555ull, 1942 _mm512_slli_epi16(__ix, 8), 1943 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, 1944 _mm512_set1_epi16(8))))); 1945 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) 1946 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1947 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1948 0x5555'5555'5555'5555ull, 1950 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), 1951 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); 1952 /* This has better throughput but higher latency than the impl below 1953 else if constexpr (__have_avx2 && sizeof(__x) == 16 && 1956 const auto __shorts = __to_intrin(_S_bit_shift_right( 1957 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), 1958 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); 1959 return __vector_bitcast<_Up>( 1960 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); 1963 else if constexpr (__have_avx2 && sizeof(__x) > 8) 1964 // the following uses vpsr[al]vd, which requires AVX2 1965 if constexpr (is_signed_v<_Up>) 1967 const auto r3 = __vector_bitcast<_UInt>( 1968 (__vector_bitcast<int>(__x) 1969 >> (__vector_bitcast<_UInt>(__y) >> 24))) 1972 = __vector_bitcast<_UInt>( 1973 ((__vector_bitcast<int>(__x) << 8) 1974 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) 1977 = __vector_bitcast<_UInt>( 1978 ((__vector_bitcast<int>(__x) << 16) 1979 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) 1981 const auto r0 = __vector_bitcast<_UInt>( 1982 (__vector_bitcast<int>(__x) << 24) 1983 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); 1984 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1989 const auto r3 = (__vector_bitcast<_UInt>(__x) 1990 >> (__vector_bitcast<_UInt>(__y) >> 24)) 1993 = ((__vector_bitcast<_UInt>(__x) << 8) 1994 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) 1997 = ((__vector_bitcast<_UInt>(__x) << 16) 1998 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) 2001 = (__vector_bitcast<_UInt>(__x) << 24) 2002 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); 2003 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 2006 else if constexpr (__have_sse4_1 2007 && is_unsigned_v<_Up> && sizeof(__x) > 2) 2009 auto __x128 = __vector_bitcast<_Up>(__ix); 2011 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); 2012 auto __x4 = __vector_bitcast<_Up>( 2013 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); 2014 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2015 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); 2017 auto __x2 = __vector_bitcast<_Up>( 2018 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); 2019 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2020 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); 2022 auto __x1 = __vector_bitcast<_Up>( 2023 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); 2024 __x128 = 
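      // Barrel shift by bit decomposition: the SSE4.1 byte blend selects on
      // the most significant bit of each mask byte, so with the count
      // pre-shifted left by 5 its bits 2, 1 and 0 successively reach bit 7.
      // Each round conditionally applies a shift by 4, 2 and 1 (a count of
      // 6 == 0b110 takes the 4- and 2-steps); counts of 8 and above are
      // zeroed at the end via the (__y & 0xf8) == 0 mask.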
__vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2025 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); 2026 return __intrin_bitcast<_V>( 2028 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2029 == 0)); // y > 7 nulls the result 2031 else if constexpr (__have_sse4_1 2032 && is_signed_v<_Up> && sizeof(__x) > 2) 2034 auto __mask = __vector_bitcast<_UChar>( 2035 __vector_bitcast<_UShort>(__iy) << 5); 2036 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2037 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); 2039 auto __xh = __vector_bitcast<short>(__ix); 2040 auto __xl = __vector_bitcast<short>(__ix) << 8; 2041 auto __xh4 = __xh >> 4; 2042 auto __xl4 = __xl >> 4; 2043 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2044 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); 2045 __xl = __vector_bitcast<short>( 2046 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2047 __to_intrin(__xl4))); 2049 auto __xh2 = __xh >> 2; 2050 auto __xl2 = __xl >> 2; 2051 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2052 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); 2053 __xl = __vector_bitcast<short>( 2054 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2055 __to_intrin(__xl2))); 2057 auto __xh1 = __xh >> 1; 2058 auto __xl1 = __xl >> 1; 2059 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2060 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); 2061 __xl = __vector_bitcast<short>( 2062 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2063 __to_intrin(__xl1))); 2064 return __intrin_bitcast<_V>( 2065 (__vector_bitcast<_Up>((__xh & short(0xff00))) 2066 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2068 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2069 == 0)); // y > 7 nulls the result 2071 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 2074 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); 2075 auto __x4 = __vector_bitcast<_Up>( 2076 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); 2077 __x = __mask > 0x7f ? __x4 : __x; 2079 auto __x2 = __vector_bitcast<_Up>( 2080 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); 2081 __x = __mask > 0x7f ? __x2 : __x; 2083 auto __x1 = __vector_bitcast<_Up>( 2084 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); 2085 __x = __mask > 0x7f ? __x1 : __x; 2087 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2089 else if constexpr (sizeof(__x) > 2) // signed SSE2 2091 static_assert(is_signed_v<_Up>); 2092 auto __maskh = __vector_bitcast<_UShort>(__y) << 5; 2093 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); 2094 auto __xh = __vector_bitcast<short>(__x); 2095 auto __xl = __vector_bitcast<short>(__x) << 8; 2096 auto __xh4 = __xh >> 4; 2097 auto __xl4 = __xl >> 4; 2098 __xh = __maskh > 0x7fff ? __xh4 : __xh; 2099 __xl = __maskl > 0x7fff ? __xl4 : __xl; 2102 auto __xh2 = __xh >> 2; 2103 auto __xl2 = __xl >> 2; 2104 __xh = __maskh > 0x7fff ? __xh2 : __xh; 2105 __xl = __maskl > 0x7fff ? __xl2 : __xl; 2108 auto __xh1 = __xh >> 1; 2109 auto __xl1 = __xl >> 1; 2110 __xh = __maskh > 0x7fff ? __xh1 : __xh; 2111 __xl = __maskl > 0x7fff ? 
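      // Signed bytes have no byte-granularity arithmetic shift, so even and
      // odd bytes are handled as the high halves of 16-bit lanes: __xh keeps
      // the original high bytes (sign bit already in place), __xl first moves
      // the low bytes up by 8.  The 16-bit arithmetic shifts then replicate
      // the correct sign, and the result is recombined from __xh's high bytes
      // and __xl's high bytes shifted back down.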
__xl1 : __xl; 2112 __x = __vector_bitcast<_Up>((__xh & short(0xff00))) 2113 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2116 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2121 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ 2123 [[maybe_unused]] auto __blend_0xaa 2124 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2125 if constexpr (sizeof(__a) == 16) 2126 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2128 else if constexpr (sizeof(__a) == 32) 2129 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2131 else if constexpr (sizeof(__a) == 64) 2132 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), 2135 __assert_unreachable<decltype(__a)>(); 2137 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) 2138 return __intrin_bitcast<_V>(is_signed_v<_Up> 2139 ? _mm_srav_epi16(__ix, __iy) 2140 : _mm_srlv_epi16(__ix, __iy)); 2141 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) 2142 return __vector_bitcast<_Up>(is_signed_v<_Up> 2143 ? _mm256_srav_epi16(__ix, __iy) 2144 : _mm256_srlv_epi16(__ix, __iy)); 2145 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) 2146 return __vector_bitcast<_Up>(is_signed_v<_Up> 2147 ? _mm512_srav_epi16(__ix, __iy) 2148 : _mm512_srlv_epi16(__ix, __iy)); 2149 else if constexpr (__have_avx2 && is_signed_v<_Up>) 2150 return __intrin_bitcast<_V>( 2151 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16) 2152 >> (__vector_bitcast<int>(__iy) & 0xffffu)) 2154 __vector_bitcast<int>(__ix) 2155 >> (__vector_bitcast<int>(__iy) >> 16))); 2156 else if constexpr (__have_avx2 && is_unsigned_v<_Up>) 2157 return __intrin_bitcast<_V>( 2158 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) 2159 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), 2160 __vector_bitcast<_UInt>(__ix) 2161 >> (__vector_bitcast<_UInt>(__iy) >> 16))); 2162 else if constexpr (__have_sse4_1) 2164 auto __mask = __vector_bitcast<_UShort>(__iy); 2165 auto __x128 = __vector_bitcast<_Up>(__ix); 2167 __mask = (__mask << 3) | (__mask << 11); 2168 // do __x128 = 0 where __y[4] is set 2169 __x128 = __vector_bitcast<_Up>( 2170 _mm_blendv_epi8(__to_intrin(__x128), __m128i(), 2171 __to_intrin(__mask))); 2172 // do __x128 =>> 8 where __y[3] is set 2173 __x128 = __vector_bitcast<_Up>( 2174 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), 2175 __to_intrin(__mask += __mask))); 2176 // do __x128 =>> 4 where __y[2] is set 2177 __x128 = __vector_bitcast<_Up>( 2178 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), 2179 __to_intrin(__mask += __mask))); 2180 // do __x128 =>> 2 where __y[1] is set 2181 __x128 = __vector_bitcast<_Up>( 2182 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), 2183 __to_intrin(__mask += __mask))); 2184 // do __x128 =>> 1 where __y[0] is set 2185 return __intrin_bitcast<_V>( 2186 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), 2187 __to_intrin(__mask + __mask))); 2191 auto __k = __vector_bitcast<_UShort>(__iy) << 11; 2192 auto __x128 = __vector_bitcast<_Up>(__ix); 2194 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2195 return __vector_bitcast<short>(__kk) < 0; 2197 // do __x128 = 0 where __y[4] is set 2198 __x128 = __mask(__k) ? decltype(__x128)() : __x128; 2199 // do __x128 =>> 8 where __y[3] is set 2200 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; 2201 // do __x128 =>> 4 where __y[2] is set 2202 __x128 = __mask(__k += __k) ? 
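      // Same decomposition at 16-bit granularity: __k starts as __y << 11, so
      // the sign bit of each 16-bit lane exposes bit 4 of the count first
      // (count >= 16 zeroes the lane); doubling __k each round then selects
      // the conditional shifts by 8, 4, 2 and 1 from bits 3..0.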
__x128 >> 4 : __x128; 2203 // do __x128 =>> 2 where __y[1] is set 2204 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; 2205 // do __x128 =>> 1 where __y[0] is set 2206 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 2210 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ 2212 if constexpr (is_unsigned_v<_Up>) 2214 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 2215 const __m128 __factor_f = reinterpret_cast<__m128>( 2216 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23)); 2217 const __m128i __factor 2218 = __builtin_constant_p(__factor_f) 2220 __make_vector<unsigned>(__factor_f[0], __factor_f[1], 2221 __factor_f[2], __factor_f[3])) 2222 : _mm_cvttps_epi32(__factor_f); 2224 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); 2225 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), 2226 _mm_srli_si128(__factor, 4)); 2227 if constexpr (__have_sse4_1) 2228 return __intrin_bitcast<_V>( 2229 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); 2231 return __intrin_bitcast<_V>( 2232 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); 2236 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2237 if constexpr (is_signed_v<_Up>) 2238 return _mm_sra_epi32(__a, __b); 2240 return _mm_srl_epi32(__a, __b); 2243 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); 2244 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); 2246 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); 2247 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); 2248 if constexpr (__have_sse4_1) 2249 return __intrin_bitcast<_V>( 2250 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), 2251 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); 2253 return __intrin_bitcast<_V>(_mm_unpacklo_epi64( 2254 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), 2255 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); 2261 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 2266 template <typename _Tp, size_t _Np> 2267 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2268 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2270 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2272 if (__builtin_is_constant_evaluated() 2273 || (__x._M_is_constprop() && __y._M_is_constprop())) 2274 return _MaskImpl::_S_to_bits( 2275 __as_wrapper<_Np>(__x._M_data == __y._M_data)); 2277 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2278 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2279 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2280 if constexpr (is_floating_point_v<_Tp>) 2282 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2283 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2284 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2285 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2286 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2287 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2288 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2289 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2290 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2291 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2292 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2293 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2295 __assert_unreachable<_Tp>(); 2297 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2298 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2299 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 
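      // _S_equal_to and the comparisons below share one AVX-512 pattern: the
      // compare writes straight into a mask register and the ABI's implicit
      // mask __k1 is passed as write-mask, so padding elements beyond _Np can
      // never set a bit.  For floating point, _CMP_EQ_OQ and _CMP_NEQ_UQ are
      // the quiet predicates matching operator== and operator!= (a NaN
      // operand compares unequal without raising FE_INVALID).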
2300 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2301 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2302 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2304 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2306 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2308 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2309 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2310 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2311 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2312 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2313 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2314 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2315 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2316 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2317 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2318 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2319 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2320 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2322 __assert_unreachable<_Tp>(); 2324 else if (__builtin_is_constant_evaluated()) 2325 return _Base::_S_equal_to(__x, __y); 2326 else if constexpr (sizeof(__x) == 8) 2328 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2329 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2330 _MaskMember<_Tp> __r64{}; 2331 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2335 return _Base::_S_equal_to(__x, __y); 2339 // _S_not_equal_to {{{ 2340 template <typename _Tp, size_t _Np> 2341 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2342 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2344 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2346 if (__builtin_is_constant_evaluated() 2347 || (__x._M_is_constprop() && __y._M_is_constprop())) 2348 return _MaskImpl::_S_to_bits( 2349 __as_wrapper<_Np>(__x._M_data != __y._M_data)); 2351 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2352 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2353 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2354 if constexpr (is_floating_point_v<_Tp>) 2356 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2357 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2358 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2359 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2360 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2361 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2362 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2363 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2364 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2365 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2366 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2367 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2369 __assert_unreachable<_Tp>(); 2371 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2372 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2373 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2374 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2375 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2376 return 
~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2377 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2378 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2379 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2380 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2381 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2382 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2383 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2384 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2385 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2386 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2387 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2388 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2389 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2390 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2391 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2392 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2393 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2394 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2396 __assert_unreachable<_Tp>(); 2398 else if (__builtin_is_constant_evaluated()) 2399 return _Base::_S_not_equal_to(__x, __y); 2400 else if constexpr (sizeof(__x) == 8) 2402 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2403 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2404 _MaskMember<_Tp> __r64{}; 2405 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2409 return _Base::_S_not_equal_to(__x, __y); 2414 template <typename _Tp, size_t _Np> 2415 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2416 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2418 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2420 if (__builtin_is_constant_evaluated() 2421 || (__x._M_is_constprop() && __y._M_is_constprop())) 2422 return _MaskImpl::_S_to_bits( 2423 __as_wrapper<_Np>(__x._M_data < __y._M_data)); 2425 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2426 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2427 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2428 if constexpr (sizeof(__xi) == 64) 2430 if constexpr (is_same_v<_Tp, float>) 2431 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2432 else if constexpr (is_same_v<_Tp, double>) 2433 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2435 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2436 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2437 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2438 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2439 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2440 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2441 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2443 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2444 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2445 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2446 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2447 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2448 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2449 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2451 __assert_unreachable<_Tp>(); 2453 else if constexpr (sizeof(__xi) == 32) 2455 if constexpr (is_same_v<_Tp, 
float>) 2456 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2457 else if constexpr (is_same_v<_Tp, double>) 2458 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2459 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2460 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2461 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2462 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2463 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2464 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2465 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2466 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2467 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2468 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2469 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2470 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2471 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2472 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2473 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2474 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2476 __assert_unreachable<_Tp>(); 2478 else if constexpr (sizeof(__xi) == 16) 2480 if constexpr (is_same_v<_Tp, float>) 2481 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2482 else if constexpr (is_same_v<_Tp, double>) 2483 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2484 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2485 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2486 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2487 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2488 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2489 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2490 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2491 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2492 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2493 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2494 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2495 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2496 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2497 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2498 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2499 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2501 __assert_unreachable<_Tp>(); 2504 __assert_unreachable<_Tp>(); 2506 else if (__builtin_is_constant_evaluated()) 2507 return _Base::_S_less(__x, __y); 2508 else if constexpr (sizeof(__x) == 8) 2510 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2511 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2512 _MaskMember<_Tp> __r64{}; 2513 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2517 return _Base::_S_less(__x, __y); 2521 // _S_less_equal {{{ 2522 template <typename _Tp, size_t _Np> 2523 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2524 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2526 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2528 if (__builtin_is_constant_evaluated() 2529 || (__x._M_is_constprop() && __y._M_is_constprop())) 2530 return _MaskImpl::_S_to_bits( 2531 __as_wrapper<_Np>(__x._M_data <= __y._M_data)); 2533 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2534 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2535 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2536 if constexpr (sizeof(__xi) == 64) 
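      // _S_less and _S_less_equal use the signaling predicates _CMP_LT_OS and
      // _CMP_LE_OS: like the built-in operators < and <=, they yield false
      // and raise the invalid exception when an operand is NaN.  The quiet
      // variants (_CMP_LT_OQ etc.) are reserved for std::isless and friends
      // further down.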
2538 if constexpr (is_same_v<_Tp, float>) 2539 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2540 else if constexpr (is_same_v<_Tp, double>) 2541 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2542 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2543 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); 2544 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2545 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); 2546 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2547 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); 2548 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2549 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); 2550 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2551 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); 2552 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2553 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); 2554 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2555 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); 2556 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2557 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); 2559 __assert_unreachable<_Tp>(); 2561 else if constexpr (sizeof(__xi) == 32) 2563 if constexpr (is_same_v<_Tp, float>) 2564 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2565 else if constexpr (is_same_v<_Tp, double>) 2566 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2567 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2568 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); 2569 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2570 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); 2571 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2572 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); 2573 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2574 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); 2575 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2576 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); 2577 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2578 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); 2579 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2580 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); 2581 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2582 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); 2584 __assert_unreachable<_Tp>(); 2586 else if constexpr (sizeof(__xi) == 16) 2588 if constexpr (is_same_v<_Tp, float>) 2589 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2590 else if constexpr (is_same_v<_Tp, double>) 2591 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2592 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2593 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi); 2594 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2595 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); 2596 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2597 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); 2598 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2599 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); 2600 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2601 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); 2602 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2603 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); 2604 else if constexpr (is_unsigned_v<_Tp> && 
                               sizeof(_Tp) == 4)
              return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
            else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
              return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
            else
              __assert_unreachable<_Tp>();
          }
        else
          __assert_unreachable<_Tp>();
      }
    else if (__builtin_is_constant_evaluated())
      return _Base::_S_less_equal(__x, __y);
    else if constexpr (sizeof(__x) == 8)
      {
        const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
                              <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
        _MaskMember<_Tp> __r64{};
        __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
        return __r64;
      }
    else
      return _Base::_S_less_equal(__x, __y);
  }

  // }}}
  // _S_negate {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
    _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
    {
      if constexpr (__is_avx512_abi<_Abi>())
        return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
      else
        return _Base::_S_negate(__x);
    }

  // }}}
  // _S_abs {{{
  using _Base::_S_abs;

  // }}}
  // _S_sqrt {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
    _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
    {
      if constexpr (__is_sse_ps<_Tp, _Np>())
        return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
      else if constexpr (__is_sse_pd<_Tp, _Np>())
        return _mm_sqrt_pd(__x);
      else if constexpr (__is_avx_ps<_Tp, _Np>())
        return _mm256_sqrt_ps(__x);
      else if constexpr (__is_avx_pd<_Tp, _Np>())
        return _mm256_sqrt_pd(__x);
      else if constexpr (__is_avx512_ps<_Tp, _Np>())
        return _mm512_sqrt_ps(__x);
      else if constexpr (__is_avx512_pd<_Tp, _Np>())
        return _mm512_sqrt_pd(__x);
      else
        __assert_unreachable<_Tp>();
    }

  // }}}
  // _S_ldexp {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
    _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
             __fixed_size_storage_t<int, _Np> __exp)
    {
      if constexpr (__is_avx512_abi<_Abi>())
        {
          const auto __xi = __to_intrin(__x);
          constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
            __cvt;
          const auto __expi = __to_intrin(__cvt(__exp));
          constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
          if constexpr (sizeof(__xi) == 16)
            {
              if constexpr (sizeof(_Tp) == 8)
                return _mm_maskz_scalef_pd(__k1, __xi, __expi);
              else
                return _mm_maskz_scalef_ps(__k1, __xi, __expi);
            }
          else if constexpr (sizeof(__xi) == 32)
            {
              if constexpr (sizeof(_Tp) == 8)
                return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
              else
                return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
            }
          else
            {
              static_assert(sizeof(__xi) == 64);
              if constexpr (sizeof(_Tp) == 8)
                return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
              else
                return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
            }
        }
      else
        return _Base::_S_ldexp(__x, __exp);
    }

  // }}}
  // _S_trunc {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
    _S_trunc(_SimdWrapper<_Tp, _Np> __x)
    {
      if constexpr (__is_avx512_ps<_Tp, _Np>())
        return _mm512_roundscale_ps(__x, 0x0b);
      else if constexpr (__is_avx512_pd<_Tp, _Np>())
        return _mm512_roundscale_pd(__x, 0x0b);
      else if constexpr (__is_avx_ps<_Tp, _Np>())
        return _mm256_round_ps(__x, 0x3);
      else if constexpr (__is_avx_pd<_Tp, _Np>())
        return _mm256_round_pd(__x, 0x3);
      else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
        return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x3));
      else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
        return _mm_round_pd(__x, 0x3);
      else if constexpr (__is_sse_ps<_Tp, _Np>())
        {
          auto __truncated
            = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
          const auto __no_fractional_values
            = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
                                      & 0x7f800000u)
                < 0x4b000000; // the exponent is so large that no mantissa
                              // bits signify fractional values
                              // (0x3f8 + 23*8 = 0x4b0)
          return __no_fractional_values ? __truncated : __to_intrin(__x);
        }
      else
        return _Base::_S_trunc(__x);
    }

  // }}}
  // _S_round {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
    _S_round(_SimdWrapper<_Tp, _Np> __x)
    {
      // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
      // from zero as required by std::round. Therefore this function is more
      // complicated.
      using _V = __vector_type_t<_Tp, _Np>;
      _V __truncated;
      if constexpr (__is_avx512_ps<_Tp, _Np>())
        __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
      else if constexpr (__is_avx512_pd<_Tp, _Np>())
        __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
      else if constexpr (__is_avx_ps<_Tp, _Np>())
        __truncated = _mm256_round_ps(__x._M_data,
                                      _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
      else if constexpr (__is_avx_pd<_Tp, _Np>())
        __truncated = _mm256_round_pd(__x._M_data,
                                      _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
      else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
        __truncated = __auto_bitcast(
          _mm_round_ps(__to_intrin(__x),
                       _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
      else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
        __truncated
          = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
      else if constexpr (__is_sse_ps<_Tp, _Np>())
        __truncated = __auto_bitcast(
          _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
      else
        return _Base::_S_round(__x);

      // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
      // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0

      const _V __rounded
        = __truncated
            + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
                 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
                 : _V());
      if constexpr (__have_sse4_1)
        return __rounded;
      else // adjust for missing range in cvttps_epi32
        return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ?
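      // Without SSE4.1 the truncation above went through _mm_cvttps_epi32,
      // which is exact only for |x| < 2^31 and returns the integer
      // indefinite value for larger inputs, infinities and NaN.  Every
      // |x| >= 2^23 (0x1p23f) is already integral, so those lanes simply keep
      // the original value here.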
                                                              __rounded
                                                            : __x._M_data;
    }

  // }}}
  // _S_nearbyint {{{
  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
    _GLIBCXX_SIMD_INTRINSIC static _Tp
    _S_nearbyint(_Tp __x) noexcept
    {
      if constexpr (_TVT::template _S_is<float, 16>)
        return _mm512_roundscale_ps(__x, 0x0c);
      else if constexpr (_TVT::template _S_is<double, 8>)
        return _mm512_roundscale_pd(__x, 0x0c);
      else if constexpr (_TVT::template _S_is<float, 8>)
        return _mm256_round_ps(__x,
                               _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
      else if constexpr (_TVT::template _S_is<double, 4>)
        return _mm256_round_pd(__x,
                               _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
      else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
        return _mm_round_ps(__x,
                            _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
      else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
        return _mm_round_pd(__x,
                            _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
      else
        return _Base::_S_nearbyint(__x);
    }

  // }}}
  // _S_rint {{{
  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
    _GLIBCXX_SIMD_INTRINSIC static _Tp
    _S_rint(_Tp __x) noexcept
    {
      if constexpr (_TVT::template _S_is<float, 16>)
        return _mm512_roundscale_ps(__x, 0x04);
      else if constexpr (_TVT::template _S_is<double, 8>)
        return _mm512_roundscale_pd(__x, 0x04);
      else if constexpr (_TVT::template _S_is<float, 8>)
        return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
      else if constexpr (_TVT::template _S_is<double, 4>)
        return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
      else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
        return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
      else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
        return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
      else
        return _Base::_S_rint(__x);
    }

  // }}}
  // _S_floor {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
    _S_floor(_SimdWrapper<_Tp, _Np> __x)
    {
      if constexpr (__is_avx512_ps<_Tp, _Np>())
        return _mm512_roundscale_ps(__x, 0x09);
      else if constexpr (__is_avx512_pd<_Tp, _Np>())
        return _mm512_roundscale_pd(__x, 0x09);
      else if constexpr (__is_avx_ps<_Tp, _Np>())
        return _mm256_round_ps(__x, 0x1);
      else if constexpr (__is_avx_pd<_Tp, _Np>())
        return _mm256_round_pd(__x, 0x1);
      else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
        return __auto_bitcast(_mm_floor_ps(__to_intrin(__x)));
      else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
        return _mm_floor_pd(__x);
      else
        return _Base::_S_floor(__x);
    }

  // }}}
  // _S_ceil {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
    _S_ceil(_SimdWrapper<_Tp, _Np> __x)
    {
      if constexpr (__is_avx512_ps<_Tp, _Np>())
        return _mm512_roundscale_ps(__x, 0x0a);
      else if constexpr (__is_avx512_pd<_Tp, _Np>())
        return _mm512_roundscale_pd(__x, 0x0a);
      else if constexpr (__is_avx_ps<_Tp, _Np>())
        return _mm256_round_ps(__x, 0x2);
      else if constexpr (__is_avx_pd<_Tp, _Np>())
        return _mm256_round_pd(__x, 0x2);
      else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
        return __auto_bitcast(_mm_ceil_ps(__to_intrin(__x)));
      else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
        return _mm_ceil_pd(__x);
      else
        return _Base::_S_ceil(__x);
    }

  // }}}
  // _S_signbit {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
    _S_signbit(_SimdWrapper<_Tp, _Np> __x)
    {
      if constexpr (__is_avx512_abi<_Abi>() &&
__have_avx512dq) 2889 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) 2890 return _mm512_movepi32_mask( 2891 __intrin_bitcast<__m512i>(__x._M_data)); 2892 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) 2893 return _mm512_movepi64_mask( 2894 __intrin_bitcast<__m512i>(__x._M_data)); 2895 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) 2896 return _mm256_movepi32_mask( 2897 __intrin_bitcast<__m256i>(__x._M_data)); 2898 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) 2899 return _mm256_movepi64_mask( 2900 __intrin_bitcast<__m256i>(__x._M_data)); 2901 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) 2902 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2903 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) 2904 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2906 else if constexpr (__is_avx512_abi<_Abi>()) 2908 const auto __xi = __to_intrin(__x); 2909 [[maybe_unused]] constexpr auto __k1 2910 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2911 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2912 return _mm_movemask_ps(__xi); 2913 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2914 return _mm_movemask_pd(__xi); 2915 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2916 return _mm256_movemask_ps(__xi); 2917 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2918 return _mm256_movemask_pd(__xi); 2919 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2920 return _mm512_mask_cmplt_epi32_mask( 2921 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2922 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2923 return _mm512_mask_cmplt_epi64_mask( 2924 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2926 __assert_unreachable<_Tp>(); 2929 return _Base::_S_signbit(__x); 2931 using _I = __int_for_sizeof_t<_Tp>; 2932 if constexpr (sizeof(__x) == 64) 2933 return _S_less(__vector_bitcast<_I>(__x), _I()); 2936 const auto __xx = __vector_bitcast<_I>(__x._M_data); 2937 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; 2938 if constexpr ((sizeof(_Tp) == 4 && 2939 (__have_avx2 || sizeof(__x) == 16)) || 2942 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); 2944 else if constexpr ((__have_avx2 || 2945 (__have_ssse3 && sizeof(__x) == 16))) 2947 return __vector_bitcast<_Tp>((__xx & __signmask) == 2951 { // SSE2/3 or AVX (w/o AVX2) 2952 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1); 2953 return __vector_bitcast<_Tp>( 2954 __vector_bitcast<_Tp>( 2955 (__xx & __signmask) | 2956 __vector_bitcast<_I>(__one)) // -1 or 1 2964 // _S_isnonzerovalue_mask {{{ 2965 // (isnormal | is subnormal == !isinf & !isnan & !is zero) 2966 template <typename _Tp> 2967 _GLIBCXX_SIMD_INTRINSIC static auto 2968 _S_isnonzerovalue_mask(_Tp __x) 2970 using _Traits = _VectorTraits<_Tp>; 2971 if constexpr (__have_avx512dq_vl) 2973 if constexpr (_Traits::template _S_is< 2974 float, 2> || _Traits::template _S_is<float, 4>) 2975 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); 2976 else if constexpr (_Traits::template _S_is<float, 8>) 2977 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); 2978 else if constexpr (_Traits::template _S_is<float, 16>) 2979 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); 2980 else if constexpr (_Traits::template _S_is<double, 2>) 2981 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); 2982 else if constexpr (_Traits::template _S_is<double, 4>) 2983 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); 2984 else if constexpr 
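      // Fallback below (no AVX512DQ): __x * inf is NaN exactly for __x == 0
      // or NaN, and __x * 0 is NaN exactly for __x == +/-inf or NaN, so an
      // ordered compare (_CMP_ORD_Q) of the two products is true precisely
      // for nonzero finite __x, i.e. normal or subnormal values.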
(_Traits::template _S_is<double, 8>) 2985 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); 2987 __assert_unreachable<_Tp>(); 2991 using _Up = typename _Traits::value_type; 2992 constexpr size_t _Np = _Traits::_S_full_size; 2993 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 2994 const auto __b = __x * _Up(); // NaN if __x == inf 2995 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) 2996 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), 2998 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) 3000 & _mm512_cmp_ps_mask(__auto_bitcast(__a), 3001 __auto_bitcast(__b), 3003 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) 3004 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3005 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) 3007 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 3008 __auto_bitcast(__b), 3010 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) 3011 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 3012 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) 3013 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), 3014 __auto_bitcast(__b), 3016 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) 3017 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3018 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) 3020 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 3021 __auto_bitcast(__b), 3023 else if constexpr (__is_avx512_ps<_Up, _Np>()) 3024 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 3025 else if constexpr (__is_avx512_pd<_Up, _Np>()) 3026 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3028 __assert_unreachable<_Tp>(); 3034 template <typename _Tp, size_t _Np> 3035 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3036 _S_isfinite(_SimdWrapper<_Tp, _Np> __x) 3038 static_assert(is_floating_point_v<_Tp>); 3039 #if !__FINITE_MATH_ONLY__ 3040 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3042 const auto __xi = __to_intrin(__x); 3043 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3044 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3045 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3046 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3047 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3048 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3049 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3050 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3051 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3052 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3053 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3054 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3055 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3057 else if constexpr (__is_avx512_abi<_Abi>()) 3059 // if all exponent bits are set, __x is either inf or NaN 3060 using _I = __int_for_sizeof_t<_Tp>; 3061 const auto __inf = __vector_bitcast<_I>( 3062 __vector_broadcast<_Np>(__infinity_v<_Tp>)); 3063 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); 3067 return _Base::_S_isfinite(__x); 3072 template <typename _Tp, size_t _Np> 3073 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3074 _S_isinf(_SimdWrapper<_Tp, _Np> __x) 3076 #if !__FINITE_MATH_ONLY__ 3077 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3079 const auto __xi = __to_intrin(__x); 3080 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3081 return _mm512_fpclass_ps_mask(__xi, 
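      // VFPCLASS immediate bits: 0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +inf,
      // 0x10 -inf, 0x20 subnormal, 0x40 negative finite, 0x80 SNaN.  Hence
      // 0x99 above selects NaN or infinity (so isfinite is __k1 ^ result),
      // 0x18 here selects +/-infinity, and 0xbf / 0x26 in _S_isnormal below
      // select everything that is not a normal number (with / without the
      // non-finite classes under -ffinite-math-only).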
0x18); 3082 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3083 return _mm512_fpclass_pd_mask(__xi, 0x18); 3084 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3085 return _mm256_fpclass_ps_mask(__xi, 0x18); 3086 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3087 return _mm256_fpclass_pd_mask(__xi, 0x18); 3088 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3089 return _mm_fpclass_ps_mask(__xi, 0x18); 3090 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3091 return _mm_fpclass_pd_mask(__xi, 0x18); 3093 __assert_unreachable<_Tp>(); 3095 else if constexpr (__have_avx512dq_vl) 3097 if constexpr (__is_sse_pd<_Tp, _Np>()) 3098 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); 3099 else if constexpr (__is_avx_pd<_Tp, _Np>()) 3100 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); 3101 else if constexpr (__is_sse_ps<_Tp, _Np>()) 3102 return _mm_movm_epi32( 3103 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); 3104 else if constexpr (__is_avx_ps<_Tp, _Np>()) 3105 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); 3107 __assert_unreachable<_Tp>(); 3111 return _Base::_S_isinf(__x); 3116 template <typename _Tp, size_t _Np> 3117 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3118 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 3120 #if __FINITE_MATH_ONLY__ 3121 [[maybe_unused]] constexpr int __mode = 0x26; 3123 [[maybe_unused]] constexpr int __mode = 0xbf; 3125 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3127 const auto __xi = __to_intrin(__x); 3128 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3129 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3130 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); 3131 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3132 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); 3133 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3134 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); 3135 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3136 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); 3137 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3138 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); 3139 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3140 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); 3142 __assert_unreachable<_Tp>(); 3144 else if constexpr (__have_avx512dq) 3146 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) 3147 return _mm_movm_epi32( 3148 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode))); 3149 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) 3150 return _mm256_movm_epi32( 3151 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode))); 3152 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 3153 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); 3154 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) 3155 return _mm_movm_epi64( 3156 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode))); 3157 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) 3158 return _mm256_movm_epi64( 3159 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode))); 3160 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 3161 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); 3163 __assert_unreachable<_Tp>(); 3165 else if constexpr (__is_avx512_abi<_Abi>()) 3167 using _I = __int_for_sizeof_t<_Tp>; 3168 const auto absn = __vector_bitcast<_I>(_S_abs(__x)); 3169 const auto minn = __vector_bitcast<_I>( 3170 
                      __vector_broadcast<_Np>(__norm_min_v<_Tp>));
#if __FINITE_MATH_ONLY__
          return _S_less_equal<_I, _Np>(minn, absn);
#else
          const auto infn
            = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
          return __and(_S_less_equal<_I, _Np>(minn, absn),
                       _S_less<_I, _Np>(absn, infn));
#endif
        }
      else
        return _Base::_S_isnormal(__x);
    }

  // }}}
  // _S_isnan {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
    _S_isnan(_SimdWrapper<_Tp, _Np> __x)
    { return _S_isunordered(__x, __x); }

  // }}}
  // _S_isunordered {{{
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
    _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
                   [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
    {
#if __FINITE_MATH_ONLY__
      return {};
#else
      const auto __xi = __to_intrin(__x);
      const auto __yi = __to_intrin(__y);
      if constexpr (__is_avx512_abi<_Abi>())
        {
          constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
          if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
            return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
          else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
            return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
          else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
            return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
          else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
            return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
          else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
            return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
          else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
            return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
        }
      else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
        return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
      else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
        return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
      else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
        return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
      else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
        return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
      else
        __assert_unreachable<_Tp>();
#endif
    }

  // }}}
  // _S_isgreater {{{
  template <typename _Tp, size_t _Np>
    static constexpr _MaskMember<_Tp>
    _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
    {
      const auto __xi = __to_intrin(__x);
      const auto __yi = __to_intrin(__y);
      if constexpr (__is_avx512_abi<_Abi>())
        {
          const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
          if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
            return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
          else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
            return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
          else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
            return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
          else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
            return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
          else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
            return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
          else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
            return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (__have_avx)
        {
          if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
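      // The pre-AVX SSE2 fallbacks below compare floats through their bit
      // patterns: mapping sign-magnitude to two's complement
      // (__xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn) makes the integer
      // order agree with the floating-point order for all ordered values
      // (including -0 == +0); ANDing with _mm_cmpord_ps then forces false
      // whenever a NaN is involved, as required for the quiet
      // std::isgreater/isless family.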
return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3262 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3263 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3264 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3265 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3266 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3267 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3269 __assert_unreachable<_Tp>(); 3271 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3272 && sizeof(_Tp) == 4) 3274 const auto __xn = __vector_bitcast<int>(__xi); 3275 const auto __yn = __vector_bitcast<int>(__yi); 3276 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3277 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3278 return __auto_bitcast( 3279 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp)); 3281 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3282 && sizeof(_Tp) == 8) 3283 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3284 -_mm_ucomigt_sd(__xi, __yi), 3285 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi), 3286 _mm_unpackhi_pd(__yi, __yi))}; 3288 return _Base::_S_isgreater(__x, __y); 3292 // _S_isgreaterequal {{{ 3293 template <typename _Tp, size_t _Np> 3294 static constexpr _MaskMember<_Tp> 3295 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3297 const auto __xi = __to_intrin(__x); 3298 const auto __yi = __to_intrin(__y); 3299 if constexpr (__is_avx512_abi<_Abi>()) 3301 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3302 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3303 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3304 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3305 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3306 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3307 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3308 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3309 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3310 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3311 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3312 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3313 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3315 __assert_unreachable<_Tp>(); 3317 else if constexpr (__have_avx) 3319 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3320 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3321 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3322 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3323 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3324 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3325 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3326 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3328 __assert_unreachable<_Tp>(); 3330 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3331 && sizeof(_Tp) == 4) 3333 const auto __xn = __vector_bitcast<int>(__xi); 3334 const auto __yn = __vector_bitcast<int>(__yi); 3335 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3336 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3337 return __auto_bitcast( 3338 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); 3340 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3341 && sizeof(_Tp) == 8) 3342 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3343 -_mm_ucomige_sd(__xi, __yi), 3344 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), 3345 _mm_unpackhi_pd(__yi, __yi))}; 3347 return _Base::_S_isgreaterequal(__x, __y); 3352 template <typename _Tp, size_t _Np> 3353 static constexpr _MaskMember<_Tp> 3354 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3356 const auto __xi = __to_intrin(__x); 3357 const auto __yi = __to_intrin(__y); 3358 if constexpr (__is_avx512_abi<_Abi>()) 3360 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3361 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3362 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3363 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3364 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3365 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3366 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3367 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3368 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3369 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3370 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3371 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3372 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3374 __assert_unreachable<_Tp>(); 3376 else if constexpr (__have_avx) 3378 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3379 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3380 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3381 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3382 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3383 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3384 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3385 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3387 __assert_unreachable<_Tp>(); 3389 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3390 && sizeof(_Tp) == 4) 3392 const auto __xn = __vector_bitcast<int>(__xi); 3393 const auto __yn = __vector_bitcast<int>(__yi); 3394 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3395 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3396 return __auto_bitcast( 3397 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); 3399 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3400 && sizeof(_Tp) == 8) 3401 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3402 -_mm_ucomigt_sd(__yi, __xi), 3403 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), 3404 _mm_unpackhi_pd(__xi, __xi))}; 3406 return _Base::_S_isless(__x, __y); 3410 // _S_islessequal {{{ 3411 template <typename _Tp, size_t _Np> 3412 static constexpr _MaskMember<_Tp> 3413 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3415 const auto __xi = __to_intrin(__x); 3416 const auto __yi = __to_intrin(__y); 3417 if constexpr (__is_avx512_abi<_Abi>()) 3419 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3420 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3421 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3422 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3423 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3424 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3425 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3426 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3427 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3428 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3429 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3430 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3431 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3433 __assert_unreachable<_Tp>(); 3435 else if constexpr (__have_avx) 3437 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3438 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3439 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3440 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3441 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3442 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3443 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3444 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3446 __assert_unreachable<_Tp>(); 3448 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3449 && sizeof(_Tp) == 4) 3451 const auto __xn = __vector_bitcast<int>(__xi); 3452 const auto __yn = __vector_bitcast<int>(__yi); 3453 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3454 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3455 return __auto_bitcast( 3456 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); 3458 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3459 && sizeof(_Tp) == 8) 3460 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3461 -_mm_ucomige_sd(__yi, __xi), 3462 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), 3463 _mm_unpackhi_pd(__xi, __xi))}; 3465 return _Base::_S_islessequal(__x, __y); 3469 // _S_islessgreater {{{ 3470 template <typename _Tp, size_t _Np> 3471 static constexpr _MaskMember<_Tp> 3472 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3474 const auto __xi = __to_intrin(__x); 3475 const auto __yi = __to_intrin(__y); 3476 if constexpr (__is_avx512_abi<_Abi>()) 3478 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3479 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3480 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3481 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3482 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3483 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3484 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3485 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3486 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3487 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3488 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3489 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3490 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3492 __assert_unreachable<_Tp>(); 3494 else if constexpr (__have_avx) 3496 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3497 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3498 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3499 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3500 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3501 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3502 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3503 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3505 __assert_unreachable<_Tp>(); 3507 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3508 return __auto_bitcast( 3509 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); 3510 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3511 return __to_masktype( 3512 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); 3514 __assert_unreachable<_Tp>(); 3518 template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np> 3519 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 3520 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v) 3522 if (__k._M_is_constprop_none_of()) 3524 else if (__k._M_is_constprop_all_of()) 3526 auto __vv = _Base::_M_make_simd(__v); 3527 _Op<decltype(__vv)> __op; 3528 return __data(__op(__vv)); 3530 else if constexpr (__is_bitmask_v<decltype(__k)> 3531 && (is_same_v<_Op<void>, __increment<void>> 3532 || is_same_v<_Op<void>, __decrement<void>>)) 3534 // optimize masked unary increment and decrement as masked sub +/-1 3535 constexpr int __pm_one 3536 = is_same_v<_Op<void>, __increment<void>> ? -1 : 1; 3538 return __movm<_Np, _Tp>(__k._M_data) ? 
__v._M_data - __pm_one : __v._M_data; 3540 if constexpr (is_integral_v<_Tp>) 3542 constexpr bool __lp64 = sizeof(long) == sizeof(long long); 3543 using _Ip = std::make_signed_t<_Tp>; 3544 using _Up = std::conditional_t< 3545 std::is_same_v<_Ip, long>, 3546 std::conditional_t<__lp64, long long, int>, 3548 std::is_same_v<_Ip, signed char>, char, _Ip>>; 3549 const auto __value = __vector_bitcast<_Up>(__v._M_data); 3550 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3551 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \ 3552 return __vector_bitcast<_Tp>(__builtin_ia32_##_Instr##_mask(__value, \ 3553 __vector_broadcast<_Np>(_Up(__pm_one)), __value, __k._M_data)) 3554 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512); 3555 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256); 3556 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128); 3557 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512); 3558 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256); 3559 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128); 3560 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512); 3561 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256); 3562 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128); 3563 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512); 3564 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256); 3565 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128); 3566 #undef _GLIBCXX_SIMD_MASK_SUB 3570 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3571 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \ 3572 return __builtin_ia32_##_Instr##_mask( \ 3573 __v._M_data, __vector_broadcast<_Np>(_Tp(__pm_one)), __v._M_data, \ 3574 __k._M_data, _MM_FROUND_CUR_DIRECTION) 3575 _GLIBCXX_SIMD_MASK_SUB(4, 64, subps512); 3576 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256); 3577 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128); 3578 _GLIBCXX_SIMD_MASK_SUB(8, 64, subpd512); 3579 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256); 3580 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128); 3581 #undef _GLIBCXX_SIMD_MASK_SUB 3586 return _Base::template _S_masked_unary<_Op>(__k, __v); 3591 // _MaskImplX86Mixin {{{ 3592 struct _MaskImplX86Mixin 3594 template <typename _Tp> 3595 using _TypeTag = _Tp*; 3597 using _Base = _MaskImplBuiltinMixin; 3599 // _S_to_maskvector(bool) {{{ 3600 template <typename _Up, size_t _ToN = 1, typename _Tp> 3601 _GLIBCXX_SIMD_INTRINSIC static constexpr 3602 enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>> 3603 _S_to_maskvector(_Tp __x) 3605 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3606 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 3607 : __vector_type_t<_Up, _ToN>(); 3611 // _S_to_maskvector(_SanitizedBitMask) {{{ 3612 template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? 
_Np : _UpN> 3613 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3614 _S_to_maskvector(_SanitizedBitMask<_Np> __x) 3616 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3617 using _UV = __vector_type_t<_Up, _ToN>; 3618 using _UI = __intrinsic_type_t<_Up, _ToN>; 3619 [[maybe_unused]] const auto __k = __x._M_to_bits(); 3620 if constexpr (_Np == 1) 3621 return _S_to_maskvector<_Up, _ToN>(__k); 3622 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3623 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>( 3624 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); 3625 else if constexpr (sizeof(_Up) == 1) 3627 if constexpr (sizeof(_UI) == 16) 3629 if constexpr (__have_avx512bw_vl) 3630 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); 3631 else if constexpr (__have_avx512bw) 3632 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); 3633 else if constexpr (__have_avx512f) 3635 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3637 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3638 __hi256(__as32bits))); 3639 return __intrin_bitcast<_UV>( 3640 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 3642 else if constexpr (__have_ssse3) 3644 const auto __bitmask = __to_intrin( 3645 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 3646 8, 16, 32, 64, 128)); 3647 return __intrin_bitcast<_UV>( 3648 __vector_bitcast<_Up>( 3649 _mm_shuffle_epi8(__to_intrin( 3650 __vector_type_t<_ULLong, 2>{__k}), 3651 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3652 1, 1, 1, 1, 1, 1, 1)) 3656 // else fall through 3658 else if constexpr (sizeof(_UI) == 32) 3660 if constexpr (__have_avx512bw_vl) 3661 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); 3662 else if constexpr (__have_avx512bw) 3663 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); 3664 else if constexpr (__have_avx512f) 3666 auto __as16bits = // 0 16 1 17 ... 15 31 3667 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), 3669 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, 3672 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( 3673 __lo256(__as16bits), 3674 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... 3677 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( 3678 __0_16_1_17, // 0 16 1 17 2 ... 
3679 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 3680 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3682 15)))); // 0-7 16-23 8-15 24-31 -> xzyw 3683 // 0-3 8-11 16-19 24-27 3684 // 4-7 12-15 20-23 28-31 3686 else if constexpr (__have_avx2) 3688 const auto __bitmask 3689 = _mm256_broadcastsi128_si256(__to_intrin( 3690 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 3691 4, 8, 16, 32, 64, 128))); 3692 return __vector_bitcast<_Up>( 3693 __vector_bitcast<_Up>( 3694 _mm256_shuffle_epi8( 3695 _mm256_broadcastsi128_si256( 3696 __to_intrin(__vector_type_t<_ULLong, 2>{__k})), 3697 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3698 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3703 // else fall through 3705 else if constexpr (sizeof(_UI) == 64) 3706 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); 3707 if constexpr (std::min(_ToN, _Np) <= 4) 3709 if constexpr (_Np > 7) // avoid overflow 3710 __x &= _SanitizedBitMask<_Np>(0x0f); 3711 const _UInt __char_mask 3712 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) 3715 __builtin_memcpy(&__r, &__char_mask, 3716 std::min(sizeof(__r), sizeof(__char_mask))); 3719 else if constexpr (std::min(_ToN, _Np) <= 7) 3721 if constexpr (_Np > 7) // avoid overflow 3722 __x &= _SanitizedBitMask<_Np>(0x7f); 3723 const _ULLong __char_mask 3724 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) 3727 __builtin_memcpy(&__r, &__char_mask, 3728 std::min(sizeof(__r), sizeof(__char_mask))); 3732 else if constexpr (sizeof(_Up) == 2) 3734 if constexpr (sizeof(_UI) == 16) 3736 if constexpr (__have_avx512bw_vl) 3737 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); 3738 else if constexpr (__have_avx512bw) 3739 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); 3740 else if constexpr (__have_avx512f) 3742 __m256i __as32bits = {}; 3743 if constexpr (__have_avx512vl) 3744 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); 3747 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); 3748 return __intrin_bitcast<_UV>( 3749 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits))); 3751 // else fall through 3753 else if constexpr (sizeof(_UI) == 32) 3755 if constexpr (__have_avx512bw_vl) 3756 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); 3757 else if constexpr (__have_avx512bw) 3758 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); 3759 else if constexpr (__have_avx512f) 3761 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3762 return __vector_bitcast<_Up>( 3763 __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3764 __hi256(__as32bits)))); 3766 // else fall through 3768 else if constexpr (sizeof(_UI) == 64) 3769 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); 3771 else if constexpr (sizeof(_Up) == 4) 3773 if constexpr (sizeof(_UI) == 16) 3775 if constexpr (__have_avx512dq_vl) 3776 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); 3777 else if constexpr (__have_avx512dq) 3778 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); 3779 else if constexpr (__have_avx512vl) 3780 return __intrin_bitcast<_UV>( 3781 _mm_maskz_mov_epi32(__k, ~__m128i())); 3782 else if constexpr (__have_avx512f) 3783 return __intrin_bitcast<_UV>( 3784 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3785 // else fall through 3787 else if constexpr (sizeof(_UI) == 32) 3789 if constexpr (__have_avx512dq_vl) 3790 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); 3791 else if constexpr (__have_avx512dq) 3792 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); 3793 else if constexpr 
(__have_avx512vl) 3794 return __vector_bitcast<_Up>( 3795 _mm256_maskz_mov_epi32(__k, ~__m256i())); 3796 else if constexpr (__have_avx512f) 3797 return __vector_bitcast<_Up>( 3798 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3799 // else fall through 3801 else if constexpr (sizeof(_UI) == 64) 3802 return __vector_bitcast<_Up>( 3803 __have_avx512dq ? _mm512_movm_epi32(__k) 3804 : _mm512_maskz_mov_epi32(__k, ~__m512i())); 3806 else if constexpr (sizeof(_Up) == 8) 3808 if constexpr (sizeof(_UI) == 16) 3810 if constexpr (__have_avx512dq_vl) 3811 return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); 3812 else if constexpr (__have_avx512dq) 3813 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); 3814 else if constexpr (__have_avx512vl) 3815 return __vector_bitcast<_Up>( 3816 _mm_maskz_mov_epi64(__k, ~__m128i())); 3817 else if constexpr (__have_avx512f) 3818 return __vector_bitcast<_Up>( 3819 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3820 // else fall through 3822 else if constexpr (sizeof(_UI) == 32) 3824 if constexpr (__have_avx512dq_vl) 3825 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); 3826 else if constexpr (__have_avx512dq) 3827 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); 3828 else if constexpr (__have_avx512vl) 3829 return __vector_bitcast<_Up>( 3830 _mm256_maskz_mov_epi64(__k, ~__m256i())); 3831 else if constexpr (__have_avx512f) 3832 return __vector_bitcast<_Up>( 3833 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3834 // else fall through 3836 else if constexpr (sizeof(_UI) == 64) 3837 return __vector_bitcast<_Up>( 3838 __have_avx512dq ? _mm512_movm_epi64(__k) 3839 : _mm512_maskz_mov_epi64(__k, ~__m512i())); 3842 using _UpUInt = make_unsigned_t<_Up>; 3843 using _V = __vector_type_t<_UpUInt, _ToN>; 3844 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; 3845 if constexpr (_ToN == 2) 3847 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); 3849 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) 3851 if constexpr (sizeof(_Up) == 4) 3852 return __vector_bitcast<_Up>(_mm256_cmp_ps( 3853 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), 3854 _mm256_castsi256_ps(_mm256_setr_epi32( 3855 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), 3856 _mm256_setzero_ps(), _CMP_NEQ_UQ)); 3857 else if constexpr (sizeof(_Up) == 8) 3858 return __vector_bitcast<_Up>(_mm256_cmp_pd( 3859 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), 3860 _mm256_castsi256_pd( 3861 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), 3862 _mm256_setzero_pd(), _CMP_NEQ_UQ)); 3864 __assert_unreachable<_Up>(); 3866 else if constexpr (__bits_per_element >= _ToN) 3868 constexpr auto __bitmask 3869 = __generate_vector<_V>([](auto __i) 3870 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt 3871 { return __i < _ToN ? 
1ull << __i : 0; }); 3873 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; 3874 if constexpr (__bits_per_element > _ToN) 3875 return __vector_bitcast<_Up>(__bits) > 0; 3877 return __vector_bitcast<_Up>(__bits != 0); 3882 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3883 return static_cast<_UpUInt>( 3884 __k >> (__bits_per_element * (__i / __bits_per_element))); 3886 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3887 return static_cast<_UpUInt>(1ull 3888 << (__i % __bits_per_element)); 3889 }); // mask bit index 3890 return __intrin_bitcast<_UV>(__tmp != _V()); 3895 // _S_to_maskvector(_SimdWrapper) {{{ 3896 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, 3897 size_t _ToN = _UpN == 0 ? _Np : _UpN> 3898 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3899 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 3901 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3902 using _TW = _SimdWrapper<_Tp, _Np>; 3903 using _UW = _SimdWrapper<_Up, _ToN>; 3904 using _UI = __intrinsic_type_t<_Up, _ToN>; 3905 if constexpr (is_same_v<_Tp, bool>) // bits -> vector 3906 return _S_to_maskvector<_Up, _ToN>( 3907 _BitMask<_Np>(__x._M_data)._M_sanitized()); 3908 // vector -> vector bitcast 3909 else if constexpr (sizeof(_Up) == sizeof(_Tp) 3910 && sizeof(_TW) == sizeof(_UW)) 3911 return __wrapper_bitcast<_Up, _ToN>( 3914 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x)); 3915 else // vector -> vector {{{ 3917 if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3919 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); 3920 return __generate_from_n_evaluations<std::min(_ToN, _Np), 3921 __vector_type_t<_Up, _ToN>>( 3922 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); 3924 using _To = __vector_type_t<_Up, _ToN>; 3925 [[maybe_unused]] constexpr size_t _FromN = _Np; 3926 constexpr int _FromBytes = sizeof(_Tp); 3927 constexpr int _ToBytes = sizeof(_Up); 3928 const auto __k = __x._M_data; 3930 if constexpr (_FromBytes == _ToBytes) 3931 return __intrin_bitcast<_To>(__k); 3932 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) 3934 if constexpr (_FromBytes == 4 && _ToBytes == 8) 3935 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3936 else if constexpr (_FromBytes == 2 && _ToBytes == 8) 3939 = __vector_bitcast<int>(__interleave128_lo(__k, __k)); 3940 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3942 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3945 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3947 = __vector_bitcast<int>(__interleave128_lo(__y, __y)); 3948 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); 3950 else if constexpr (_FromBytes == 8 && _ToBytes == 4 3952 return __intrin_bitcast<_To>( 3953 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3954 else if constexpr (_FromBytes == 8 && _ToBytes == 4) 3955 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), 3957 else if constexpr (_FromBytes == 2 && _ToBytes == 4) 3958 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3959 else if constexpr (_FromBytes == 1 && _ToBytes == 4) 3962 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3963 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3965 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 3967 if constexpr (__have_sse2 && !__have_ssse3) 3968 return __intrin_bitcast<_To>(_mm_packs_epi32( 3969 
_mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), 3972 return __intrin_bitcast<_To>( 3973 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( 3974 __vector_bitcast<_Up>(__k))); 3976 else if constexpr (_FromBytes == 4 && _ToBytes == 2) 3977 return __intrin_bitcast<_To>( 3978 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3979 else if constexpr (_FromBytes == 1 && _ToBytes == 2) 3980 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3981 else if constexpr (_FromBytes == 8 && _ToBytes == 1 3983 return __intrin_bitcast<_To>( 3984 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3985 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1, 3986 -1, -1, -1, -1, -1, -1, -1, 3988 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 3991 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3992 __y = _mm_packs_epi32(__y, __m128i()); 3993 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3995 else if constexpr (_FromBytes == 4 && _ToBytes == 1 3997 return __intrin_bitcast<_To>( 3998 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3999 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4000 -1, -1, -1, -1, -1, -1, -1, 4002 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4005 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 4006 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 4008 else if constexpr (_FromBytes == 2 && _ToBytes == 1) 4009 return __intrin_bitcast<_To>( 4010 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); 4012 __assert_unreachable<_Tp>(); 4014 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) 4016 if constexpr (_FromBytes == _ToBytes) 4017 __assert_unreachable<_Tp>(); 4018 else if constexpr (_FromBytes == _ToBytes * 2) 4020 const auto __y = __vector_bitcast<_LLong>(__k); 4021 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4022 _mm_packs_epi16(__lo128(__y), __hi128(__y)))); 4024 else if constexpr (_FromBytes == _ToBytes * 4) 4026 const auto __y = __vector_bitcast<_LLong>(__k); 4027 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4028 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4031 else if constexpr (_FromBytes == _ToBytes * 8) 4033 const auto __y = __vector_bitcast<_LLong>(__k); 4034 return __intrin_bitcast<_To>( 4035 _mm256_castsi128_si256(_mm_shuffle_epi8( 4036 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4037 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 4038 -1, -1, -1, -1, -1)))); 4040 else if constexpr (_FromBytes * 2 == _ToBytes) 4042 auto __y = __xzyw(__to_intrin(__k)); 4043 if constexpr (is_floating_point_v< 4044 _Tp> || (!__have_avx2 && _FromBytes == 4)) 4046 const auto __yy = __vector_bitcast<float>(__y); 4047 return __intrin_bitcast<_To>( 4048 _mm256_unpacklo_ps(__yy, __yy)); 4051 return __intrin_bitcast<_To>( 4052 _mm256_unpacklo_epi8(__y, __y)); 4054 else if constexpr (_FromBytes * 4 == _ToBytes) 4057 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4058 __lo128(__vector_bitcast<_LLong>( 4059 __k))); // drops 3/4 of input 4060 return __intrin_bitcast<_To>( 4061 __concat(_mm_unpacklo_epi16(__y, __y), 4062 _mm_unpackhi_epi16(__y, __y))); 4064 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 4067 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4068 __lo128(__vector_bitcast<_LLong>( 4069 __k))); // drops 3/4 of input 4071 = _mm_unpacklo_epi16(__y, 4072 __y); // drops another 1/2 => 7/8 total 4073 return __intrin_bitcast<_To>( 4074 __concat(_mm_unpacklo_epi32(__y, __y), 4075 _mm_unpackhi_epi32(__y, __y))); 4078 
__assert_unreachable<_Tp>(); 4080 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) 4082 if constexpr (_FromBytes == _ToBytes) 4083 return __intrin_bitcast<_To>( 4084 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( 4085 __zero_extend(__to_intrin(__k)))); 4086 else if constexpr (_FromBytes * 2 == _ToBytes) 4088 return __intrin_bitcast<_To>( 4089 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), 4090 __vector_bitcast<_LLong>(__k)), 4091 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), 4092 __vector_bitcast<_LLong>(__k)))); 4094 else if constexpr (_FromBytes * 4 == _ToBytes) 4096 if constexpr (__have_avx2) 4098 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4099 __concat(__vector_bitcast<_LLong>(__k), 4100 __vector_bitcast<_LLong>(__k)), 4101 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4102 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 4103 6, 6, 7, 7, 7, 7))); 4107 return __intrin_bitcast<_To>(__concat( 4108 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4109 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 4110 2, 2, 2, 2, 3, 3, 3, 3)), 4111 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4112 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5, 4113 6, 6, 6, 6, 7, 7, 7, 4117 else if constexpr (_FromBytes * 8 == _ToBytes) 4119 if constexpr (__have_avx2) 4121 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4122 __concat(__vector_bitcast<_LLong>(__k), 4123 __vector_bitcast<_LLong>(__k)), 4124 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 4125 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4126 3, 3, 3, 3, 3, 3))); 4130 return __intrin_bitcast<_To>(__concat( 4131 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4132 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 4133 1, 1, 1, 1, 1, 1, 1, 1)), 4134 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4135 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 4136 3, 3, 3, 3, 3, 3, 3, 4140 else if constexpr (_FromBytes == _ToBytes * 2) 4141 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4142 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); 4143 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 4145 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4146 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4147 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, 4148 -1, -1, -1, -1, -1, -1, -1, 4151 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4153 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4154 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4155 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4156 -1, -1, -1, -1, -1, -1, -1, 4159 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4161 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4162 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4163 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, 4164 -1, -1, -1, -1, -1, -1, -1, 4168 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable
"); 4170 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) 4172 if constexpr (_FromBytes == _ToBytes) 4174 return __intrin_bitcast<_To>(__lo128(__k)); 4176 else if constexpr (_FromBytes == _ToBytes * 2) 4178 auto __y = __vector_bitcast<_LLong>(__k); 4179 return __intrin_bitcast<_To>( 4180 _mm_packs_epi16(__lo128(__y), __hi128(__y))); 4182 else if constexpr (_FromBytes == _ToBytes * 4) 4184 auto __y = __vector_bitcast<_LLong>(__k); 4185 return __intrin_bitcast<_To>( 4186 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4189 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4191 auto __y = __vector_bitcast<_LLong>(__k); 4192 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 4193 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4194 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, 4197 else if constexpr (_FromBytes * 2 == _ToBytes) 4199 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4200 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4202 else if constexpr (_FromBytes * 4 == _ToBytes) 4204 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4205 __y = _mm_unpacklo_epi8(__y, __y); 4206 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4208 else if constexpr (_FromBytes * 8 == _ToBytes) 4210 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4211 __y = _mm_unpacklo_epi8(__y, __y); 4212 __y = _mm_unpacklo_epi8(__y, __y); 4213 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4216 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable
"); 4219 return _Base::template _S_to_maskvector<_Up, _ToN>(__x); 4221 if constexpr (_FromBytes > _ToBytes) { 4222 const _To __y = __vector_bitcast<_Up>(__k); 4223 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4224 constexpr int _Stride = _FromBytes / _ToBytes; 4225 return _To{__y[(_Is + 1) * _Stride - 1]...}; 4226 }(make_index_sequence<std::min(_ToN, _FromN)>()); 4228 // {0, 0, 1, 1} (_Dups = 2, _Is<4>) 4229 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) 4230 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) 4232 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4233 constexpr int __dup = _ToBytes / _FromBytes; 4234 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); 4235 }(make_index_sequence<_FromN>()); 4243 template <typename _Tp, size_t _Np> 4244 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 4245 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 4247 if constexpr (is_same_v<_Tp, bool>) 4248 return _BitMask<_Np>(__x._M_data)._M_sanitized(); 4251 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4252 if (__builtin_is_constant_evaluated() 4253 || __builtin_constant_p(__x._M_data)) 4255 const auto __bools = -__x._M_data; 4256 const _ULLong __k = __call_with_n_evaluations<_Np>( 4257 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4258 return (__bits | ...); 4259 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4260 return _ULLong(__bools[+__i]) << __i; 4262 if (__builtin_is_constant_evaluated() 4263 || __builtin_constant_p(__k)) 4266 const auto __xi = __to_intrin(__x); 4267 if constexpr (sizeof(_Tp) == 1) 4268 if constexpr (sizeof(__xi) == 16) 4269 if constexpr (__have_avx512bw_vl) 4270 return _BitMask<_Np>(_mm_movepi8_mask(__xi)); 4271 else // implies SSE2 4272 return _BitMask<_Np>(_mm_movemask_epi8(__xi)); 4273 else if constexpr (sizeof(__xi) == 32) 4274 if constexpr (__have_avx512bw_vl) 4275 return _BitMask<_Np>(_mm256_movepi8_mask(__xi)); 4276 else // implies AVX2 4277 return _BitMask<_Np>(_mm256_movemask_epi8(__xi)); 4278 else // implies AVX512BW 4279 return _BitMask<_Np>(_mm512_movepi8_mask(__xi)); 4281 else if constexpr (sizeof(_Tp) == 2) 4282 if constexpr (sizeof(__xi) == 16) 4283 if constexpr (__have_avx512bw_vl) 4284 return _BitMask<_Np>(_mm_movepi16_mask(__xi)); 4285 else if constexpr (__have_avx512bw) 4286 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4287 else // implies SSE2 4288 return _BitMask<_Np>( 4289 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i()))); 4290 else if constexpr (sizeof(__xi) == 32) 4291 if constexpr (__have_avx512bw_vl) 4292 return _BitMask<_Np>(_mm256_movepi16_mask(__xi)); 4293 else if constexpr (__have_avx512bw) 4294 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4295 else // implies SSE2 4296 return _BitMask<_Np>(_mm_movemask_epi8( 4297 _mm_packs_epi16(__lo128(__xi), __hi128(__xi)))); 4298 else // implies AVX512BW 4299 return _BitMask<_Np>(_mm512_movepi16_mask(__xi)); 4301 else if constexpr (sizeof(_Tp) == 4) 4302 if constexpr (sizeof(__xi) == 16) 4303 if constexpr (__have_avx512dq_vl) 4304 return _BitMask<_Np>(_mm_movepi32_mask(__xi)); 4305 else if constexpr (__have_avx512vl) 4306 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i())); 4307 else if constexpr (__have_avx512dq) 4308 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4309 else if constexpr (__have_avx512f) 4310 return _BitMask<_Np>( 4311 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4313 return _BitMask<_Np>( 4314 _mm_movemask_ps(reinterpret_cast<__m128>(__xi))); 4315 else if 
constexpr (sizeof(__xi) == 32) 4316 if constexpr (__have_avx512dq_vl) 4317 return _BitMask<_Np>(_mm256_movepi32_mask(__xi)); 4318 else if constexpr (__have_avx512dq) 4319 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4320 else if constexpr (__have_avx512vl) 4321 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i())); 4322 else if constexpr (__have_avx512f) 4323 return _BitMask<_Np>( 4324 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4326 return _BitMask<_Np>( 4327 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi))); 4328 else // implies AVX512?? 4329 if constexpr (__have_avx512dq) 4330 return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); 4331 else // implies AVX512F 4332 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); 4334 else if constexpr (sizeof(_Tp) == 8) 4335 if constexpr (sizeof(__xi) == 16) 4336 if constexpr (__have_avx512dq_vl) 4337 return _BitMask<_Np>(_mm_movepi64_mask(__xi)); 4338 else if constexpr (__have_avx512dq) 4339 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4340 else if constexpr (__have_avx512vl) 4341 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); 4342 else if constexpr (__have_avx512f) 4343 return _BitMask<_Np>( 4344 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4345 else // implies SSE2 4346 return _BitMask<_Np>( 4347 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi))); 4348 else if constexpr (sizeof(__xi) == 32) 4349 if constexpr (__have_avx512dq_vl) 4350 return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); 4351 else if constexpr (__have_avx512dq) 4352 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4353 else if constexpr (__have_avx512vl) 4354 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); 4355 else if constexpr (__have_avx512f) 4356 return _BitMask<_Np>( 4357 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4359 return _BitMask<_Np>( 4360 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi))); 4361 else // implies AVX512?? 4362 if constexpr (__have_avx512dq) 4363 return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); 4364 else // implies AVX512F 4365 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); 4368 __assert_unreachable<_Tp>(); 4376 template <typename _Abi> 4377 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi> 4379 using _MaskImplX86Mixin::_S_to_bits; 4380 using _MaskImplX86Mixin::_S_to_maskvector; 4381 using _MaskImplBuiltin<_Abi>::_S_convert; 4384 template <typename _Tp> 4385 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 4387 template <typename _Tp> 4388 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 4390 template <typename _Tp> 4391 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 4393 using _Base = _MaskImplBuiltin<_Abi>; 4397 template <typename _Tp> 4398 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4399 _S_broadcast(bool __x) 4401 if constexpr (__is_avx512_abi<_Abi>()) 4402 return __x ? 
_Abi::_S_masked(_MaskMember<_Tp>(-1)) 4403 : _MaskMember<_Tp>(); 4405 return _Base::template _S_broadcast<_Tp>(__x); 4410 template <typename _Tp> 4411 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4412 _S_load(const bool* __mem) 4414 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4415 if (__builtin_is_constant_evaluated()) 4417 if constexpr (__is_avx512_abi<_Abi>()) 4419 _MaskMember<_Tp> __r{}; 4420 for (size_t __i = 0; __i < _S_size<_Tp>; ++__i) 4421 __r._M_data |= _ULLong(__mem[__i]) << __i; 4425 return _Base::template _S_load<_Tp>(__mem); 4427 else if constexpr (__have_avx512bw) 4429 const auto __to_vec_or_bits 4430 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) { 4431 if constexpr (__is_avx512_abi<_Abi>()) 4434 return _S_to_maskvector<_Tp>( 4435 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); 4438 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) 4441 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4442 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a)); 4444 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl) 4447 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4448 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a)); 4450 else if constexpr (_S_size<_Tp> <= 64) 4453 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4454 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a)); 4457 else if constexpr (__is_avx512_abi<_Abi>()) 4459 if constexpr (_S_size<_Tp> <= 8) 4462 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4463 const auto __b = _mm512_cvtepi8_epi64(__a); 4464 return _mm512_test_epi64_mask(__b, __b); 4466 else if constexpr (_S_size<_Tp> <= 16) 4469 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4470 const auto __b = _mm512_cvtepi8_epi32(__a); 4471 return _mm512_test_epi32_mask(__b, __b); 4473 else if constexpr (_S_size<_Tp> <= 32) 4476 __builtin_memcpy(&__a, __mem, 16); 4477 const auto __b = _mm512_cvtepi8_epi32(__a); 4478 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16); 4479 const auto __c = _mm512_cvtepi8_epi32(__a); 4480 return _mm512_test_epi32_mask(__b, __b) 4481 | (_mm512_test_epi32_mask(__c, __c) << 16); 4483 else if constexpr (_S_size<_Tp> <= 64) 4486 __builtin_memcpy(&__a, __mem, 16); 4487 const auto __b = _mm512_cvtepi8_epi32(__a); 4488 __builtin_memcpy(&__a, __mem + 16, 16); 4489 const auto __c = _mm512_cvtepi8_epi32(__a); 4490 if constexpr (_S_size<_Tp> <= 48) 4492 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32); 4493 const auto __d = _mm512_cvtepi8_epi32(__a); 4494 return _mm512_test_epi32_mask(__b, __b) 4495 | (_mm512_test_epi32_mask(__c, __c) << 16) 4496 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32); 4500 __builtin_memcpy(&__a, __mem + 16, 16); 4501 const auto __d = _mm512_cvtepi8_epi32(__a); 4502 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 48); 4503 const auto __e = _mm512_cvtepi8_epi32(__a); 4504 return _mm512_test_epi32_mask(__b, __b) 4505 | (_mm512_test_epi32_mask(__c, __c) << 16) 4506 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32) 4507 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48); 4511 __assert_unreachable<_Tp>(); 4513 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2) 4514 return __vector_bitcast<_Tp>( 4515 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]), 4516 -int(__mem[1]), -int(__mem[1])}); 4517 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx) 4520 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>); 4521 const auto __k = __to_intrin( 4522 (__vector_broadcast<4>(__bool4) 4523 & __make_vector<int>(0x1, 0x100, 0x10000, 4524 _S_size<_Tp> == 4 ? 
0x1000000 : 0)) 4526 return __vector_bitcast<_Tp>( 4527 __concat(_mm_unpacklo_epi32(__k, __k), 4528 _mm_unpackhi_epi32(__k, __k))); 4530 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4) 4533 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>); 4534 if constexpr (__have_sse2) 4536 __m128i __k = _mm_cvtsi32_si128(__bools); 4537 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4538 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4539 _mm_unpacklo_epi16(__k, __k)); 4543 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools)); 4545 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4546 _mm_cmpgt_ps(__k, __m128())); 4549 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8) 4552 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4553 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4554 return __vector_bitcast<_Tp>( 4555 __concat(_mm_unpacklo_epi16(__k, __k), 4556 _mm_unpackhi_epi16(__k, __k))); 4558 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16) 4561 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4562 __k = _mm_cmpgt_epi8(__k, __m128i()); 4563 if constexpr (_S_size<_Tp> <= 8) 4564 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4565 _mm_unpacklo_epi8(__k, __k)); 4567 return __concat(_mm_unpacklo_epi8(__k, __k), 4568 _mm_unpackhi_epi8(__k, __k)); 4571 return _Base::template _S_load<_Tp>(__mem); 4575 // _S_from_bitmask{{{ 4576 template <size_t _Np, typename _Tp> 4577 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 4578 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 4580 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4581 if constexpr (__is_avx512_abi<_Abi>()) 4582 return __bits._M_to_bits(); 4584 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); 4588 // _S_masked_load {{{2 4589 template <typename _Tp, size_t _Np> 4590 static inline _SimdWrapper<_Tp, _Np> 4591 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 4592 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 4594 if constexpr (__is_avx512_abi<_Abi>()) 4596 if constexpr (__have_avx512bw_vl) 4598 if constexpr (_Np <= 16) 4601 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem); 4602 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a); 4604 else if constexpr (_Np <= 32) 4607 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem); 4608 return (__merge & ~__mask) 4609 | _mm256_test_epi8_mask(__a, __a); 4611 else if constexpr (_Np <= 64) 4614 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem); 4615 return (__merge & ~__mask) 4616 | _mm512_test_epi8_mask(__a, __a); 4619 __assert_unreachable<_Tp>(); 4623 _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4624 __merge._M_set(__i, __mem[__i]); 4629 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1) 4631 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4632 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(), 4633 _mm256_mask_loadu_epi8(__m256i(), 4636 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1) 4638 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4640 = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k, 4642 _mm_mask_loadu_epi8(__m128i(), __k, __mem)); 4644 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2) 4646 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4647 __merge = _mm256_mask_sub_epi16( 4648 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4649 _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); 4651 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2) 4653 const auto __k = 
_S_to_bits(__mask)._M_to_bits(); 4654 __merge = _mm_mask_sub_epi16( 4655 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4656 _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); 4658 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4) 4660 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4661 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32( 4662 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4663 _mm256_cvtepi8_epi32( 4664 _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4666 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4) 4668 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4669 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32( 4670 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4671 _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4673 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8) 4675 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4676 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64( 4677 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4678 _mm256_cvtepi8_epi64( 4679 _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4681 else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8) 4683 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4684 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64( 4685 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4686 _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4689 return _Base::_S_masked_load(__merge, __mask, __mem); 4694 template <typename _Tp, size_t _Np> 4695 _GLIBCXX_SIMD_INTRINSIC static constexpr void 4696 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept 4698 if (__builtin_is_constant_evaluated()) 4699 _Base::_S_store(__v, __mem); 4700 else if constexpr (__is_avx512_abi<_Abi>()) 4702 if constexpr (__have_avx512bw_vl) 4703 _CommonImplX86::_S_store<_Np>( 4704 __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4705 if constexpr (_Np <= 16) 4706 return _mm_maskz_set1_epi8(__data, 1); 4707 else if constexpr (_Np <= 32) 4708 return _mm256_maskz_set1_epi8(__data, 1); 4710 return _mm512_maskz_set1_epi8(__data, 1); 4713 else if constexpr (_Np <= 8) 4714 _CommonImplX86::_S_store<_Np>( 4715 __vector_bitcast<char>( 4716 #if defined __x86_64__ 4717 __make_wrapper<_ULLong>( 4718 _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull) 4720 __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U), 4721 _pdep_u32(__v._M_data >> 4, 4726 else if constexpr (_Np <= 16) 4727 _mm512_mask_cvtepi32_storeu_epi8( 4728 __mem, 0xffffu >> (16 - _Np), 4729 _mm512_maskz_set1_epi32(__v._M_data, 1)); 4731 __assert_unreachable<_Tp>(); 4733 else if constexpr (__is_sse_abi<_Abi>()) //{{{ 4735 if constexpr (_Np == 2 && sizeof(_Tp) == 8) 4737 const auto __k = __vector_bitcast<int>(__v); 4741 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) 4743 if constexpr (__have_sse2) 4745 const unsigned __bool4 4746 = __vector_bitcast<_UInt>(_mm_packs_epi16( 4747 _mm_packs_epi32(__intrin_bitcast<__m128i>( 4752 __builtin_memcpy(__mem, &__bool4, _Np); 4754 else if constexpr (__have_mmx) 4756 const __m64 __k = _mm_cvtps_pi8( 4757 __and(__to_intrin(__v), _mm_set1_ps(1.f))); 4758 __builtin_memcpy(__mem, &__k, _Np); 4762 return _Base::_S_store(__v, __mem); 4764 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) 4766 _CommonImplX86::_S_store<_Np>( 4767 __vector_bitcast<char>(_mm_packs_epi16( 4768 __to_intrin(__vector_bitcast<_UShort>(__v) >> 15), 4772 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) 4773 
_CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem); 4775 __assert_unreachable<_Tp>(); 4777 else if constexpr (__is_avx_abi<_Abi>()) // {{{ 4779 if constexpr (_Np <= 4 && sizeof(_Tp) == 8) 4781 auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); 4783 if constexpr (__have_avx2) 4784 __bool4 = _mm256_movemask_epi8(__k); 4786 __bool4 = (_mm_movemask_epi8(__lo128(__k)) 4787 | (_mm_movemask_epi8(__hi128(__k)) << 16)); 4788 __bool4 &= 0x01010101; 4789 __builtin_memcpy(__mem, &__bool4, _Np); 4791 else if constexpr (_Np <= 8 && sizeof(_Tp) == 4) 4793 const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); 4795 = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)), 4798 = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i())); 4799 _CommonImplX86::_S_store<_Np>(__k3, __mem); 4801 else if constexpr (_Np <= 16 && sizeof(_Tp) == 2) 4803 if constexpr (__have_avx2) 4805 const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15); 4806 const auto __bools = __vector_bitcast<char>( 4807 _mm_packs_epi16(__lo128(__x), __hi128(__x))); 4808 _CommonImplX86::_S_store<_Np>(__bools, __mem); 4814 & __vector_bitcast<_UChar>( 4815 _mm_packs_epi16(__lo128(__to_intrin(__v)), 4816 __hi128(__to_intrin(__v)))); 4817 _CommonImplX86::_S_store<_Np>(__bools, __mem); 4820 else if constexpr (_Np <= 32 && sizeof(_Tp) == 1) 4821 _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem); 4823 __assert_unreachable<_Tp>(); 4826 __assert_unreachable<_Tp>(); 4829 // _S_masked_store {{{2 4830 template <typename _Tp, size_t _Np> 4832 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, 4833 const _SimdWrapper<_Tp, _Np> __k) noexcept 4835 if constexpr (__is_avx512_abi<_Abi>()) 4837 static_assert(is_same_v<_Tp, bool>); 4838 if constexpr (_Np <= 16 && __have_avx512bw_vl) 4839 _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1)); 4840 else if constexpr (_Np <= 16) 4841 _mm512_mask_cvtepi32_storeu_epi8(__mem, __k, 4842 _mm512_maskz_set1_epi32(__v, 1)); 4843 else if constexpr (_Np <= 32 && __have_avx512bw_vl) 4844 _mm256_mask_storeu_epi8(__mem, __k, 4845 _mm256_maskz_set1_epi8(__v, 1)); 4846 else if constexpr (_Np <= 32 && __have_avx512bw) 4847 _mm256_mask_storeu_epi8(__mem, __k, 4848 __lo256(_mm512_maskz_set1_epi8(__v, 1))); 4849 else if constexpr (_Np <= 64 && __have_avx512bw) 4850 _mm512_mask_storeu_epi8(__mem, __k, 4851 _mm512_maskz_set1_epi8(__v, 1)); 4853 __assert_unreachable<_Tp>(); 4856 _Base::_S_masked_store(__v, __mem, __k); 4859 // logical and bitwise operators {{{2 4860 template <typename _Tp, size_t _Np> 4861 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4862 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4864 if constexpr (is_same_v<_Tp, bool>) 4866 if (__builtin_is_constant_evaluated()) 4867 return __x._M_data & __y._M_data; 4868 else if constexpr (__have_avx512dq && _Np <= 8) 4869 return _kand_mask8(__x._M_data, __y._M_data); 4870 else if constexpr (_Np <= 16) 4871 return _kand_mask16(__x._M_data, __y._M_data); 4872 else if constexpr (__have_avx512bw && _Np <= 32) 4873 return _kand_mask32(__x._M_data, __y._M_data); 4874 else if constexpr (__have_avx512bw && _Np <= 64) 4875 return _kand_mask64(__x._M_data, __y._M_data); 4877 __assert_unreachable<_Tp>(); 4880 return _Base::_S_logical_and(__x, __y); 4883 template <typename _Tp, size_t _Np> 4884 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4885 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4887 if constexpr (is_same_v<_Tp, bool>) 4889 if 
(__builtin_is_constant_evaluated()) 4890 return __x._M_data | __y._M_data; 4891 else if constexpr (__have_avx512dq && _Np <= 8) 4892 return _kor_mask8(__x._M_data, __y._M_data); 4893 else if constexpr (_Np <= 16) 4894 return _kor_mask16(__x._M_data, __y._M_data); 4895 else if constexpr (__have_avx512bw && _Np <= 32) 4896 return _kor_mask32(__x._M_data, __y._M_data); 4897 else if constexpr (__have_avx512bw && _Np <= 64) 4898 return _kor_mask64(__x._M_data, __y._M_data); 4900 __assert_unreachable<_Tp>(); 4903 return _Base::_S_logical_or(__x, __y); 4906 template <typename _Tp, size_t _Np> 4907 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4908 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x) 4910 if constexpr (is_same_v<_Tp, bool>) 4912 if (__builtin_is_constant_evaluated()) 4913 return __x._M_data ^ _Abi::template __implicit_mask_n<_Np>(); 4914 else if constexpr (__have_avx512dq && _Np <= 8) 4915 return _kandn_mask8(__x._M_data, 4916 _Abi::template __implicit_mask_n<_Np>()); 4917 else if constexpr (_Np <= 16) 4918 return _kandn_mask16(__x._M_data, 4919 _Abi::template __implicit_mask_n<_Np>()); 4920 else if constexpr (__have_avx512bw && _Np <= 32) 4921 return _kandn_mask32(__x._M_data, 4922 _Abi::template __implicit_mask_n<_Np>()); 4923 else if constexpr (__have_avx512bw && _Np <= 64) 4924 return _kandn_mask64(__x._M_data, 4925 _Abi::template __implicit_mask_n<_Np>()); 4927 __assert_unreachable<_Tp>(); 4930 return _Base::_S_bit_not(__x); 4933 template <typename _Tp, size_t _Np> 4934 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4935 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4937 if constexpr (is_same_v<_Tp, bool>) 4939 if (__builtin_is_constant_evaluated()) 4940 return __x._M_data & __y._M_data; 4941 else if constexpr (__have_avx512dq && _Np <= 8) 4942 return _kand_mask8(__x._M_data, __y._M_data); 4943 else if constexpr (_Np <= 16) 4944 return _kand_mask16(__x._M_data, __y._M_data); 4945 else if constexpr (__have_avx512bw && _Np <= 32) 4946 return _kand_mask32(__x._M_data, __y._M_data); 4947 else if constexpr (__have_avx512bw && _Np <= 64) 4948 return _kand_mask64(__x._M_data, __y._M_data); 4950 __assert_unreachable<_Tp>(); 4953 return _Base::_S_bit_and(__x, __y); 4956 template <typename _Tp, size_t _Np> 4957 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4958 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4960 if constexpr (is_same_v<_Tp, bool>) 4962 if (__builtin_is_constant_evaluated()) 4963 return __x._M_data | __y._M_data; 4964 else if constexpr (__have_avx512dq && _Np <= 8) 4965 return _kor_mask8(__x._M_data, __y._M_data); 4966 else if constexpr (_Np <= 16) 4967 return _kor_mask16(__x._M_data, __y._M_data); 4968 else if constexpr (__have_avx512bw && _Np <= 32) 4969 return _kor_mask32(__x._M_data, __y._M_data); 4970 else if constexpr (__have_avx512bw && _Np <= 64) 4971 return _kor_mask64(__x._M_data, __y._M_data); 4973 __assert_unreachable<_Tp>(); 4976 return _Base::_S_bit_or(__x, __y); 4979 template <typename _Tp, size_t _Np> 4980 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4981 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4983 if constexpr (is_same_v<_Tp, bool>) 4985 if (__builtin_is_constant_evaluated()) 4986 return __x._M_data ^ __y._M_data; 4987 else if constexpr (__have_avx512dq && _Np <= 8) 4988 return _kxor_mask8(__x._M_data, __y._M_data); 4989 else if constexpr (_Np <= 16) 4990 return 
_kxor_mask16(__x._M_data, __y._M_data); 4991 else if constexpr (__have_avx512bw && _Np <= 32) 4992 return _kxor_mask32(__x._M_data, __y._M_data); 4993 else if constexpr (__have_avx512bw && _Np <= 64) 4994 return _kxor_mask64(__x._M_data, __y._M_data); 4996 __assert_unreachable<_Tp>(); 4999 return _Base::_S_bit_xor(__x, __y); 5003 // _S_masked_assign{{{ 5004 template <size_t _Np> 5005 _GLIBCXX_SIMD_INTRINSIC static void 5006 _S_masked_assign(_SimdWrapper<bool, _Np> __k, 5007 _SimdWrapper<bool, _Np>& __lhs, _SimdWrapper<bool, _Np> __rhs) 5010 = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data); 5013 template <size_t _Np> 5014 _GLIBCXX_SIMD_INTRINSIC static void 5015 _S_masked_assign(_SimdWrapper<bool, _Np> __k, 5016 _SimdWrapper<bool, _Np>& __lhs, bool __rhs) 5019 __lhs._M_data = __k._M_data | __lhs._M_data; 5021 __lhs._M_data = ~__k._M_data & __lhs._M_data; 5024 using _MaskImplBuiltin<_Abi>::_S_masked_assign; 5028 template <typename _Tp> 5029 _GLIBCXX_SIMD_INTRINSIC static bool 5030 _S_all_of(simd_mask<_Tp, _Abi> __k) 5032 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5034 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5035 using _TI = __intrinsic_type_t<_Tp, _Np>; 5036 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5037 if constexpr (__have_sse4_1) 5039 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5040 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5041 return 0 != __testc(__a, __b); 5043 else if constexpr (is_same_v<_Tp, float>) 5044 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) 5046 else if constexpr (is_same_v<_Tp, double>) 5047 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) 5050 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1)) 5051 == (1 << (_Np * sizeof(_Tp))) - 1; 5053 else if constexpr (__is_avx512_abi<_Abi>()) 5055 constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>(); 5056 const auto __kk = __k._M_data._M_data; 5057 if constexpr (sizeof(__kk) == 1) 5059 if constexpr (__have_avx512dq) 5060 return _kortestc_mask8_u8(__kk, _Mask == 0xff 5062 : __mmask8(~_Mask)); 5064 return _kortestc_mask16_u8(__kk, __mmask16(~_Mask)); 5066 else if constexpr (sizeof(__kk) == 2) 5067 return _kortestc_mask16_u8(__kk, _Mask == 0xffff 5069 : __mmask16(~_Mask)); 5070 else if constexpr (sizeof(__kk) == 4 && __have_avx512bw) 5071 return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU 5073 : __mmask32(~_Mask)); 5074 else if constexpr (sizeof(__kk) == 8 && __have_avx512bw) 5075 return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL 5077 : __mmask64(~_Mask)); 5079 __assert_unreachable<_Tp>(); 5085 template <typename _Tp> 5086 _GLIBCXX_SIMD_INTRINSIC static bool 5087 _S_any_of(simd_mask<_Tp, _Abi> __k) 5089 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5091 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5092 using _TI = __intrinsic_type_t<_Tp, _Np>; 5093 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5094 if constexpr (__have_sse4_1) 5096 if constexpr (_Abi::template _S_is_partial< 5097 _Tp> || sizeof(__k) < 16) 5099 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5100 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5101 return 0 == __testz(__a, __b); 5104 return 0 == __testz(__a, __a); 5106 else if constexpr (is_same_v<_Tp, float>) 5107 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0; 5108 else if constexpr (is_same_v<_Tp, double>) 5109 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0; 5111 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1)) 5114 else if constexpr 
(__is_avx512_abi<_Abi>()) 5115 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>()) 5121 template <typename _Tp> 5122 _GLIBCXX_SIMD_INTRINSIC static bool 5123 _S_none_of(simd_mask<_Tp, _Abi> __k) 5125 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5127 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5128 using _TI = __intrinsic_type_t<_Tp, _Np>; 5129 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5130 if constexpr (__have_sse4_1) 5132 if constexpr (_Abi::template _S_is_partial< 5133 _Tp> || sizeof(__k) < 16) 5135 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5136 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5137 return 0 != __testz(__a, __b); 5140 return 0 != __testz(__a, __a); 5142 else if constexpr (is_same_v<_Tp, float>) 5143 return (__movemask(__a) & ((1 << _Np) - 1)) == 0; 5144 else if constexpr (is_same_v<_Tp, double>) 5145 return (__movemask(__a) & ((1 << _Np) - 1)) == 0; 5147 return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1)) 5150 else if constexpr (__is_avx512_abi<_Abi>()) 5151 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>()) 5157 template <typename _Tp> 5158 _GLIBCXX_SIMD_INTRINSIC static bool 5159 _S_some_of(simd_mask<_Tp, _Abi> __k) 5161 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5163 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5164 using _TI = __intrinsic_type_t<_Tp, _Np>; 5165 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5166 if constexpr (__have_sse4_1) 5168 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5169 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5170 return 0 != __testnzc(__a, __b); 5172 else if constexpr (is_same_v<_Tp, float>) 5174 constexpr int __allbits = (1 << _Np) - 1; 5175 const auto __tmp = _mm_movemask_ps(__a) & __allbits; 5176 return __tmp > 0 && __tmp < __allbits; 5178 else if constexpr (is_same_v<_Tp, double>) 5180 constexpr int __allbits = (1 << _Np) - 1; 5181 const auto __tmp = _mm_movemask_pd(__a) & __allbits; 5182 return __tmp > 0 && __tmp < __allbits; 5186 constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1; 5187 const auto __tmp = _mm_movemask_epi8(__a) & __allbits; 5188 return __tmp > 0 && __tmp < __allbits; 5191 else if constexpr (__is_avx512_abi<_Abi>()) 5192 return _S_any_of(__k) && !_S_all_of(__k); 5194 __assert_unreachable<_Tp>(); 5199 template <typename _Tp> 5200 _GLIBCXX_SIMD_INTRINSIC static int 5201 _S_popcount(simd_mask<_Tp, _Abi> __k) 5203 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5204 const auto __kk = _Abi::_S_masked(__k._M_data)._M_data; 5205 if constexpr (__is_avx512_abi<_Abi>()) 5207 if constexpr (_Np > 32) 5208 return __builtin_popcountll(__kk); 5210 return __builtin_popcount(__kk); 5214 if constexpr (__have_popcnt) 5217 = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk))); 5218 const int __count = __builtin_popcount(__bits); 5219 return is_integral_v<_Tp> ? 
__count / sizeof(_Tp) : __count; 5221 else if constexpr (_Np == 2 && sizeof(_Tp) == 8) 5223 const int mask = _mm_movemask_pd(__auto_bitcast(__kk)); 5224 return mask - (mask >> 1); 5226 else if constexpr (_Np <= 4 && sizeof(_Tp) == 8) 5228 auto __x = -(__lo128(__kk) + __hi128(__kk)); 5229 return __x[0] + __x[1]; 5231 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) 5233 if constexpr (__have_sse2) 5235 __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk)); 5236 __x = _mm_add_epi32( 5237 __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3))); 5238 __x = _mm_add_epi32( 5239 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2))); 5240 return -_mm_cvtsi128_si32(__x); 5243 return __builtin_popcount( 5244 _mm_movemask_ps(__auto_bitcast(__kk))); 5246 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) 5248 auto __x = __to_intrin(__kk); 5249 __x = _mm_add_epi16(__x, 5250 _mm_shuffle_epi32(__x, 5251 _MM_SHUFFLE(0, 1, 2, 3))); 5252 __x = _mm_add_epi16( 5253 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3))); 5254 __x = _mm_add_epi16( 5255 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1))); 5256 return -short(_mm_extract_epi16(__x, 0)); 5258 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) 5260 auto __x = __to_intrin(__kk); 5261 __x = _mm_add_epi8(__x, 5262 _mm_shuffle_epi32(__x, 5263 _MM_SHUFFLE(0, 1, 2, 3))); 5264 __x = _mm_add_epi8(__x, 5265 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 5267 __x = _mm_add_epi8(__x, 5268 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 5270 auto __y = -__vector_bitcast<_UChar>(__x); 5271 if constexpr (__have_sse4_1) 5272 return __y[0] + __y[1]; 5275 unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0); 5276 return (__z & 0xff) + (__z >> 8); 5279 else if constexpr (sizeof(__kk) == 32) 5281 // The following works only as long as the implementations above 5283 using _I = __int_for_sizeof_t<_Tp>; 5284 const auto __as_int = __vector_bitcast<_I>(__kk); 5285 _MaskImplX86<simd_abi::__sse>::_S_popcount( 5286 simd_mask<_I, simd_abi::__sse>(__private_init, 5288 + __hi128(__as_int))); 5291 __assert_unreachable<_Tp>(); 5296 // _S_find_first_set {{{ 5297 template <typename _Tp> 5298 _GLIBCXX_SIMD_INTRINSIC static int 5299 _S_find_first_set(simd_mask<_Tp, _Abi> __k) 5301 if constexpr (__is_avx512_abi<_Abi>()) 5302 return std::__countr_zero(__k._M_data._M_data); 5304 return _Base::_S_find_first_set(__k); 5308 // _S_find_last_set {{{ 5309 template <typename _Tp> 5310 _GLIBCXX_SIMD_INTRINSIC static int 5311 _S_find_last_set(simd_mask<_Tp, _Abi> __k) 5313 if constexpr (__is_avx512_abi<_Abi>()) 5314 return std::__bit_width(__k._M_data._M_data) - 1; 5316 return _Base::_S_find_last_set(__k); 5324 _GLIBCXX_SIMD_END_NAMESPACE 5325 #endif // __cplusplus >= 201703L 5326 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ 5328 // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
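A minimal scalar sketch (not part of the header; name and scalar form are illustrative only) of the SSE2 fallback used by _S_isgreater, _S_isgreaterequal, _S_isless and _S_islessequal above: each float lane is reinterpreted as a signed integer, negative lanes are remapped to -(bits & 0x7fff'ffff) so that signed integer comparison reproduces the floating-point order, and the separate _mm_cmpord_ps mask removes NaN lanes from the result.

#include <cstdint>
#include <cstring>

// Scalar analogue of the vector code above; 'sketch_isgreater' is an assumed
// helper name, not something defined by the header.
bool sketch_isgreater(float a, float b)
{
  std::int32_t an, bn;
  std::memcpy(&an, &a, sizeof an);            // like __vector_bitcast<int>
  std::memcpy(&bn, &b, sizeof bn);
  // Remap negative floats so that signed-integer order matches float order
  // (+0.0 and -0.0 both map to 0 and therefore compare equal).
  const std::int32_t ap = an < 0 ? -(an & 0x7fff'ffff) : an;
  const std::int32_t bp = bn < 0 ? -(bn & 0x7fff'ffff) : bn;
  const bool ordered = (a == a) && (b == b);  // plays the role of _mm_cmpord_ps
  return ordered && ap > bp;                  // plays the role of __xp > __yp
}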
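The small-mask branches of _S_to_maskvector above (std::min(_ToN, _Np) <= 4 and <= 7) widen mask bits into bytes with a multiply instead of a shuffle: multiplying by 0x00204081 places a copy of the mask at bit offsets 0, 7, 14 and 21, the AND with 0x01010101 keeps bit i of the mask in the low bit of byte i, and the final * 0xff stretches each kept bit into 0x00 or 0xff. A hedged standalone sketch of the 4-bit case:

#include <cstdint>

// Expand the low 4 mask bits into 4 bytes of 0x00/0xff, mirroring the
// ((__x.to_ulong() * 0x00204081U) & 0x01010101) * 0xff expression above.
std::uint32_t expand_mask4(std::uint32_t k)
{
  k &= 0xf; // clamp to 4 bits, as the header does for _Np > 7, to avoid overflow
  return ((k * 0x00204081u) & 0x01010101u) * 0xffu;
}

// Example: expand_mask4(0b0101) == 0x00ff00ff, i.e. bytes 0 and 2 are all-ones.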