// Simd NEON specific implementations -*- C++ -*-
// Copyright (C) 2020-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#if __cplusplus >= 201703L
#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif
_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _CommonImplNeon {{{
// NEON variant of the "common" implementation helpers. It derives from
// _CommonImplBuiltin and declares no members of its own.
struct _CommonImplNeon : _CommonImplBuiltin
{
// _S_store {{{
// Make the inherited _S_store overloads nameable through _CommonImplNeon.
using _CommonImplBuiltin::_S_store;
// }}}
};
// }}}
// _SimdImplNeon {{{
// NEON-specific overrides of the generic vector-builtin simd implementation.
// Everything not redefined here is inherited from _SimdImplBuiltin<_Abi>.
//
// NOTE(review): the template parameter lists and explicit template arguments
// in this class were missing from the text under review and have been
// restored; the second, unnamed class template parameter is not used in this
// file (its default is presumably supplied by a forward declaration) --
// confirm against the upstream header.
template <typename _Abi, typename>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    // Mask storage type matching a simd of _Tp, as defined by the base class.
    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    // Fixed at 16 for every element type.
    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // _S_masked_load {{{
    // Returns __merge with every element i for which __k[i] is non-zero
    // replaced by __mem[i] converted to _Tp. Elements whose mask entry is
    // zero keep their value and the corresponding __mem[i] is not read.
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    // Writes __v[i] to __mem[i] for every element i for which __k[i] is
    // non-zero; other memory locations are left untouched.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }

    // }}}
    // _S_reduce {{{
    // Folds all elements of __x with __binary_op and returns the scalar
    // result. During constant evaluation, and for element counts not handled
    // below, the work is forwarded to _Base::_S_reduce.
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        if (not __builtin_is_constant_evaluated())
          {
            constexpr size_t _Np = __x.size();
            if constexpr (sizeof(__x) == 16 && _Np >= 4
                            && !_Abi::template _S_is_partial<_Tp>)
              {
                // Full 16-byte vector: combine the two 8-byte halves, then
                // reduce the resulting 8-byte simd.
                const auto __halves
                  = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
                const auto __y = __binary_op(__halves[0], __halves[1]);
                return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
                         __y, static_cast<_BinaryOperation&&>(__binary_op));
              }
            else if constexpr (_Np == 8)
              {
                // Three permute-and-combine steps: neighbours, then within
                // groups of four, then across the two groups.
                __x = __binary_op(
                        __x, _Base::template _M_make_simd<_Tp, _Np>(
                               __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                 __x._M_data)));
                __x = __binary_op(
                        __x, _Base::template _M_make_simd<_Tp, _Np>(
                               __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                 __x._M_data)));
                __x = __binary_op(
                        __x, _Base::template _M_make_simd<_Tp, _Np>(
                               __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                 __x._M_data)));
                return __x[0];
              }
            else if constexpr (_Np == 4)
              {
                __x = __binary_op(
                        __x, _Base::template _M_make_simd<_Tp, _Np>(
                               __vector_permute<1, 0, 3, 2>(__x._M_data)));
                __x = __binary_op(
                        __x, _Base::template _M_make_simd<_Tp, _Np>(
                               __vector_permute<3, 2, 1, 0>(__x._M_data)));
                return __x[0];
              }
            else if constexpr (_Np == 2)
              {
                __x = __binary_op(
                        __x, _Base::template _M_make_simd<_Tp, _Np>(
                               __vector_permute<1, 0>(__x._M_data)));
                return __x[0];
              }
          }
        return _Base::_S_reduce(__x,
                                static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
    // Element-wise square root. Uses the vsqrt* intrinsics when
    // __have_neon_a64 is set, otherwise the base implementation.
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
    // Element-wise rounding toward zero. Uses the vrnd* intrinsics when
    // __have_neon_a32 is set. Without them, float vectors are converted to
    // s32 and back; all other cases use the base implementation.
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW
      _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            // Only elements with |x| < 2^23 take the converted value; a float
            // of larger magnitude has no fractional bits and is passed
            // through unchanged.
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
    // Element-wise rounding via the vrnda* intrinsics when __have_neon_a32 is
    // set, otherwise the base implementation. The intrinsic is chosen from
    // the element size and the vector size.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
    // Element-wise floor via the vrndm* intrinsics when __have_neon_a32 is
    // set, otherwise the base implementation.
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
    // Element-wise ceiling via the vrndp* intrinsics when __have_neon_a32 is
    // set, otherwise the base implementation.
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }

    //}}} }}}
  }; // }}}
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
using _Base = _MaskImplBuiltinMixin;
template
_GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
_S_to_bits(_SimdWrapper<_Tp, _Np> __x)
{
if (__builtin_is_constant_evaluated())
return _Base::_S_to_bits(__x);
using _I = __int_for_sizeof_t<_Tp>;
if constexpr (sizeof(__x) == 16)
{
auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
[[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
[[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
if constexpr (sizeof(_Tp) == 1)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(
__i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return __vector_bitcast<_UShort>(
vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
__zero))[0];
#else
return __vector_bitcast<_UShort>(
vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
__zero),
__zero))[0];
#endif
}
else if constexpr (sizeof(_Tp) == 2)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddvq_s16(__asint);
#else
return vpadd_s16(
vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
__zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 4)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddvq_s32(__asint);
#else
return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
__zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 8)
return (__asint[0] & 1) | (__asint[1] & 2);
else
__assert_unreachable<_Tp>();
}
else if constexpr (sizeof(__x) == 8)
{
auto __asint = __vector_bitcast<_I>(__x);
[[maybe_unused]] constexpr auto __zero = decltype(__asint)();
if constexpr (sizeof(_Tp) == 1)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddv_s8(__asint);
#else
return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
__zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 2)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddv_s16(__asint);
#else
return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 4)
{
__asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
return vaddv_s32(__asint);
#else
return vpadd_s32(__asint, __zero)[0];
#endif
}
else
__assert_unreachable<_Tp>();
}
else
return _Base::_S_to_bits(__x);
}
};
// }}}
// _MaskImplNeon {{{
// NEON-specific mask reductions on top of _MaskImplBuiltin<_Abi>, with
// _S_to_bits taken from _MaskImplNeonMixin.
//
// For partial ABIs a mask vector has lanes beyond the logical size
// ("padding"). Every reduction below neutralises those lanes first: either
// by forcing them to all-ones (OR with the inverted implicit mask) when
// testing for "all", or by clearing them (_Abi::_S_masked) when testing for
// "any"/"none" or counting.
//
// NOTE(review): the template parameter lists and explicit template arguments
// in this class were missing from the text under review and have been
// restored; the second, unnamed class template parameter is not used in this
// file -- confirm against the upstream header.
template <typename _Abi, typename>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
    // True iff every logical lane of __k is set.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        // Padding lanes are forced to all-ones so that they cannot make the
        // comparison against "all bits set" fail.
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    // True iff at least one logical lane of __k is set.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        // Padding lanes must be cleared here (as in _S_none_of). Setting
        // them, as _S_all_of does, would make this return true for every
        // partial mask regardless of its contents.
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    // True iff no logical lane of __k is set.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
    // True iff at least one, but not every, logical lane of __k is set.
    // Vectors larger than 8 bytes are handled by the base implementation.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            // __kk: padding lanes forced to all-ones ("all" test view).
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                                | ~__vector_bitcast<char>(
                                    _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            if constexpr (_Abi::template _S_is_partial<_Tp>)
              {
                // With padding present, "none set" is not the all-zero
                // pattern in __kk, so "any" has to be tested separately on a
                // copy with the padding cleared.
                const auto __active = _Abi::_S_masked(__k._M_data);
                const bool __any
                  = __bit_cast<__int_for_sizeof_t<decltype(__active)>>(
                      __active) != 0;
                const bool __all
                  = __bit_cast<_Up>(__kk) == static_cast<_Up>(~_Up());
                return __any && !__all;
              }
            else
              // No padding: all-zero (+1 == 1) and all-ones (+1 wraps to 0)
              // are the only two patterns that are not greater than 1.
              return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
    // Number of logical lanes of __k that are set. A set lane reads as -1
    // when viewed as a signed integer, so the negated horizontal sum of the
    // lanes is the count.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        // Clear padding lanes so that they are never counted.
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__kk);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__kk);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__kk);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long long>(__kk);
            return -(__s64[0] + __s64[1]);
          }
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_find_first_set {{{
    // Forwards to the base implementation.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    // Forwards to the base implementation.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}
_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80