[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 06/31] target/arm: Split out saturating/rounding shifts from neon
From: |
Richard Henderson |
Subject: |
[PATCH 06/31] target/arm: Split out saturating/rounding shifts from neon |
Date: |
Thu, 26 Mar 2020 16:08:13 -0700 |
Split these operations out into a header that can be shared
between neon and sve. The "sat" pointer acts both as a boolean
for control of saturating behavior and controls the difference
in behavior between neon and sve -- QC bit or no QC bit.
Implement right-shift rounding as
tmp = src >> (shift - 1);
dst = (tmp >> 1) + (tmp & 1);
This is the same number of instructions as the current
tmp = 1 << (shift - 1);
dst = (src + tmp) >> shift;
without any possibility of intermediate overflow.
Signed-off-by: Richard Henderson <address@hidden>
---
target/arm/vec_internal.h | 161 ++++++++++++
target/arm/neon_helper.c | 507 +++++++-------------------------------
2 files changed, 244 insertions(+), 424 deletions(-)
create mode 100644 target/arm/vec_internal.h
diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h
new file mode 100644
index 0000000000..0d1f9c86c8
--- /dev/null
+++ b/target/arm/vec_internal.h
@@ -0,0 +1,161 @@
+/*
+ * ARM AdvSIMD / SVE Vector Helpers
+ *
+ * Copyright (c) 2020 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TARGET_ARM_VEC_INTERNALS_H
+#define TARGET_ARM_VEC_INTERNALS_H
+
+static inline int32_t do_sqrshl_bhs(int32_t src, int8_t shift, int bits,
+ bool round, uint32_t *sat)
+{
+ if (shift <= -bits) {
+ /* Rounding the sign bit always produces 0. */
+ if (round) {
+ return 0;
+ }
+ return src >> 31;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ return (src >> 1) + (src & 1);
+ }
+ return src >> -shift;
+ } else if (shift < bits) {
+ int32_t val = src << shift;
+ if (bits == 32) {
+ if (!sat || val >> shift == src) {
+ return val;
+ }
+ } else {
+ int32_t extval = sextract32(val, 0, bits);
+ if (!sat || val == extval) {
+ return extval;
+ }
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return (1u << (bits - 1)) - (src >= 0);
+}
+
+static inline uint32_t do_uqrshl_bhs(uint32_t src, int8_t shift, int bits,
+ bool round, uint32_t *sat)
+{
+ if (shift <= -(bits + round)) {
+ return 0;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ return (src >> 1) + (src & 1);
+ }
+ return src >> -shift;
+ } else if (shift < bits) {
+ uint32_t val = src << shift;
+ if (bits == 32) {
+ if (!sat || val >> shift == src) {
+ return val;
+ }
+ } else {
+ uint32_t extval = extract32(val, 0, bits);
+ if (!sat || val == extval) {
+ return extval;
+ }
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return MAKE_64BIT_MASK(0, bits);
+}
+
+static inline int32_t do_suqrshl_bhs(int32_t src, int8_t shift, int bits,
+ bool round, uint32_t *sat)
+{
+ if (src < 0) {
+ *sat = 1;
+ return 0;
+ }
+ return do_uqrshl_bhs(src, shift, bits, round, sat);
+}
+
+static inline int64_t do_sqrshl_d(int64_t src, int8_t shift,
+ bool round, uint32_t *sat)
+{
+ if (shift <= -64) {
+ /* Rounding the sign bit always produces 0. */
+ if (round) {
+ return 0;
+ }
+ return src >> 63;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ return (src >> 1) + (src & 1);
+ }
+ return src >> -shift;
+ } else if (shift < 64) {
+ int64_t val = src << shift;
+ if (!sat || val >> shift == src) {
+ return val;
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return src < 0 ? INT64_MIN : INT64_MAX;
+}
+
+static inline uint64_t do_uqrshl_d(uint64_t src, int8_t shift,
+ bool round, uint32_t *sat)
+{
+ if (shift <= -(64 + round)) {
+ return 0;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ return (src >> 1) + (src & 1);
+ }
+ return src >> -shift;
+ } else if (shift < 64) {
+ uint64_t val = src << shift;
+ if (!sat || val >> shift == src) {
+ return val;
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return UINT64_MAX;
+}
+
+static inline int64_t do_suqrshl_d(int64_t src, int8_t shift,
+ bool round, uint32_t *sat)
+{
+ if (src < 0) {
+ *sat = 1;
+ return 0;
+ }
+ return do_uqrshl_d(src, shift, round, sat);
+}
+
+#endif /* TARGET_ARM_VEC_INTERNALS_H */
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
index c7a8438b42..e6481a5764 100644
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -11,6 +11,7 @@
#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
+#include "vec_internal.h"
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)
@@ -604,496 +605,154 @@ NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8 || \
- tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> (sizeof(src1) * 8 - 1); \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if ((tmp >= (ssize_t)sizeof(src1) * 8) \
- || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator. */
-uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
+uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
- int32_t dest;
- int32_t val = (int32_t)valop;
- int8_t shift = (int8_t)shiftop;
- if ((shift >= 32) || (shift <= -32)) {
- dest = 0;
- } else if (shift < 0) {
- int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
- dest = big_dest >> -shift;
- } else {
- dest = val << shift;
- }
- return dest;
+ return do_sqrshl_bhs(val, shift, 32, true, NULL);
}
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values. */
-uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
+uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
- int8_t shift = (int8_t)shiftop;
- int64_t val = valop;
- if ((shift >= 64) || (shift <= -64)) {
- val = 0;
- } else if (shift < 0) {
- val >>= (-shift - 1);
- if (val == INT64_MAX) {
- /* In this case, it means that the rounding constant is 1,
- * and the addition would overflow. Return the actual
- * result directly. */
- val = 0x4000000000000000LL;
- } else {
- val++;
- val >>= 1;
- }
- } else {
- val <<= shift;
- }
- return val;
+ return do_sqrshl_d(val, shift, true, NULL);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8 || \
- tmp < -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> (-tmp - 1); \
- } else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator. */
-uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
+uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
- uint32_t dest;
- int8_t shift = (int8_t)shiftop;
- if (shift >= 32 || shift < -32) {
- dest = 0;
- } else if (shift == -32) {
- dest = val >> 31;
- } else if (shift < 0) {
- uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
- dest = big_dest >> -shift;
- } else {
- dest = val << shift;
- }
- return dest;
+ return do_uqrshl_bhs(val, shift, 32, true, NULL);
}
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values. */
-uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
+uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
- int8_t shift = (uint8_t)shiftop;
- if (shift >= 64 || shift < -64) {
- val = 0;
- } else if (shift == -64) {
- /* Rounding a 1-bit result just preserves that bit. */
- val >>= 63;
- } else if (shift < 0) {
- val >>= (-shift - 1);
- if (val == UINT64_MAX) {
- /* In this case, it means that the rounding constant is 1,
- * and the addition would overflow. Return the actual
- * result directly. */
- val = 0x8000000000000000ULL;
- } else {
- val++;
- val >>= 1;
- }
- } else {
- val <<= shift;
- }
- return val;
+ return do_uqrshl_d(val, shift, true, NULL);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = ~0; \
- } else { \
- dest = 0; \
- } \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = ~0; \
- } \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
-NEON_VOP_ENV(qshl_u32, neon_u32, 1)
#undef NEON_FN
-uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t
shiftop)
+uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
- int8_t shift = (int8_t)shiftop;
- if (shift >= 64) {
- if (val) {
- val = ~(uint64_t)0;
- SET_QC();
- }
- } else if (shift <= -64) {
- val = 0;
- } else if (shift < 0) {
- val >>= -shift;
- } else {
- uint64_t tmp = val;
- val <<= shift;
- if ((val >> shift) != tmp) {
- SET_QC();
- val = ~(uint64_t)0;
- }
- }
- return val;
+ return do_uqrshl_bhs(val, shift, 32, false, env->vfp.qc);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
- if (src1 > 0) { \
- dest--; \
- } \
- } else { \
- dest = src1; \
- } \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> 31; \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
- if (src1 > 0) { \
- dest--; \
- } \
- } \
- }} while (0)
+uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_uqrshl_d(val, shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
-NEON_VOP_ENV(qshl_s32, neon_s32, 1)
#undef NEON_FN
-uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t
shiftop)
+uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
- int8_t shift = (uint8_t)shiftop;
- int64_t val = valop;
- if (shift >= 64) {
- if (val) {
- SET_QC();
- val = (val >> 63) ^ ~SIGNBIT64;
- }
- } else if (shift <= -64) {
- val >>= 63;
- } else if (shift < 0) {
- val >>= -shift;
- } else {
- int64_t tmp = val;
- val <<= shift;
- if ((val >> shift) != tmp) {
- SET_QC();
- val = (tmp >> 63) ^ ~SIGNBIT64;
- }
- }
- return val;
+ return do_sqrshl_bhs(val, shift, 32, false, env->vfp.qc);
}
-#define NEON_FN(dest, src1, src2) do { \
- if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
- SET_QC(); \
- dest = 0; \
- } else { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = ~0; \
- } else { \
- dest = 0; \
- } \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = ~0; \
- } \
- } \
- }} while (0)
-NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
-NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
+uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_sqrshl_d(val, shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_bhs(src1, src2, 8, false, env->vfp.qc))
+NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN
-uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t
shiftop)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_bhs(src1, src2, 16, false, env->vfp.qc))
+NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
- if ((int32_t)valop < 0) {
- SET_QC();
- return 0;
- }
- return helper_neon_qshl_u32(env, valop, shiftop);
+ return do_suqrshl_bhs(val, shift, 32, false, env->vfp.qc);
}
-uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t
shiftop)
+uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
- if ((int64_t)valop < 0) {
- SET_QC();
- return 0;
- }
- return helper_neon_qshl_u64(env, valop, shiftop);
+ return do_suqrshl_d(val, shift, false, env->vfp.qc);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = ~0; \
- } else { \
- dest = 0; \
- } \
- } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> (sizeof(src1) * 8 - 1); \
- } else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = ~0; \
- } \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator. */
-uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t
shiftop)
+uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
- uint32_t dest;
- int8_t shift = (int8_t)shiftop;
- if (shift >= 32) {
- if (val) {
- SET_QC();
- dest = ~0;
- } else {
- dest = 0;
- }
- } else if (shift < -32) {
- dest = 0;
- } else if (shift == -32) {
- dest = val >> 31;
- } else if (shift < 0) {
- uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
- dest = big_dest >> -shift;
- } else {
- dest = val << shift;
- if ((dest >> shift) != val) {
- SET_QC();
- dest = ~0;
- }
- }
- return dest;
+ return do_uqrshl_bhs(val, shift, 32, true, env->vfp.qc);
}
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values. */
-uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t
shiftop)
+uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
- int8_t shift = (int8_t)shiftop;
- if (shift >= 64) {
- if (val) {
- SET_QC();
- val = ~0;
- }
- } else if (shift < -64) {
- val = 0;
- } else if (shift == -64) {
- val >>= 63;
- } else if (shift < 0) {
- val >>= (-shift - 1);
- if (val == UINT64_MAX) {
- /* In this case, it means that the rounding constant is 1,
- * and the addition would overflow. Return the actual
- * result directly. */
- val = 0x8000000000000000ULL;
- } else {
- val++;
- val >>= 1;
- }
- } else { \
- uint64_t tmp = val;
- val <<= shift;
- if ((val >> shift) != tmp) {
- SET_QC();
- val = ~0;
- }
- }
- return val;
+ return do_uqrshl_d(val, shift, true, env->vfp.qc);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = (typeof(dest))(1 << (sizeof(src1) * 8 - 1)); \
- if (src1 > 0) { \
- dest--; \
- } \
- } else { \
- dest = 0; \
- } \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
- if (src1 > 0) { \
- dest--; \
- } \
- } \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator. */
-uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t
shiftop)
+uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
- int32_t dest;
- int32_t val = (int32_t)valop;
- int8_t shift = (int8_t)shiftop;
- if (shift >= 32) {
- if (val) {
- SET_QC();
- dest = (val >> 31) ^ ~SIGNBIT;
- } else {
- dest = 0;
- }
- } else if (shift <= -32) {
- dest = 0;
- } else if (shift < 0) {
- int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
- dest = big_dest >> -shift;
- } else {
- dest = val << shift;
- if ((dest >> shift) != val) {
- SET_QC();
- dest = (val >> 31) ^ ~SIGNBIT;
- }
- }
- return dest;
+ return do_sqrshl_bhs(val, shift, 32, true, env->vfp.qc);
}
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values. */
-uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t
shiftop)
+uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
- int8_t shift = (uint8_t)shiftop;
- int64_t val = valop;
-
- if (shift >= 64) {
- if (val) {
- SET_QC();
- val = (val >> 63) ^ ~SIGNBIT64;
- }
- } else if (shift <= -64) {
- val = 0;
- } else if (shift < 0) {
- val >>= (-shift - 1);
- if (val == INT64_MAX) {
- /* In this case, it means that the rounding constant is 1,
- * and the addition would overflow. Return the actual
- * result directly. */
- val = 0x4000000000000000ULL;
- } else {
- val++;
- val >>= 1;
- }
- } else {
- int64_t tmp = val;
- val <<= shift;
- if ((val >> shift) != tmp) {
- SET_QC();
- val = (tmp >> 63) ^ ~SIGNBIT64;
- }
- }
- return val;
+ return do_sqrshl_d(val, shift, true, env->vfp.qc);
}
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
--
2.20.1
- [PATCH for-5.1 00/31] target/arm: SVE2, part 1, Richard Henderson, 2020/03/26
- [PATCH 01/31] target/arm: Add ID_AA64ZFR0 fields and isar_feature_aa64_sve2, Richard Henderson, 2020/03/26
- [PATCH 02/31] target/arm: Implement SVE2 Integer Multiply - Unpredicated, Richard Henderson, 2020/03/26
- [PATCH 03/31] target/arm: Implement SVE2 integer pairwise add and accumulate long, Richard Henderson, 2020/03/26
- [PATCH 04/31] target/arm: Remove fp_status from helper_{recpe, rsqrte}_u32, Richard Henderson, 2020/03/26
- [PATCH 05/31] target/arm: Implement SVE2 integer unary operations (predicated), Richard Henderson, 2020/03/26
- [PATCH 08/31] target/arm: Implement SVE2 integer halving add/subtract (predicated), Richard Henderson, 2020/03/26
- [PATCH 07/31] target/arm: Implement SVE2 saturating/rounding bitwise shift left (predicated), Richard Henderson, 2020/03/26
- [PATCH 06/31] target/arm: Split out saturating/rounding shifts from neon,
Richard Henderson <=
- [PATCH 09/31] target/arm: Implement SVE2 integer pairwise arithmetic, Richard Henderson, 2020/03/26
- [PATCH 11/31] target/arm: Implement SVE2 integer add/subtract long, Richard Henderson, 2020/03/26
- [PATCH 12/31] target/arm: Implement SVE2 integer add/subtract interleaved long, Richard Henderson, 2020/03/26
- [PATCH 10/31] target/arm: Implement SVE2 saturating add/subtract (predicated), Richard Henderson, 2020/03/26
- [PATCH 13/31] target/arm: Implement SVE2 integer add/subtract wide, Richard Henderson, 2020/03/26
- [PATCH 14/31] target/arm: Implement SVE2 integer multiply long, Richard Henderson, 2020/03/26
- [PATCH 15/31] target/arm: Implement PMULLB and PMULLT, Richard Henderson, 2020/03/26
- [PATCH 16/31] target/arm: Tidy SVE tszimm shift formats, Richard Henderson, 2020/03/26
- [PATCH 17/31] target/arm: Implement SVE2 bitwise shift left long, Richard Henderson, 2020/03/26
- [PATCH 18/31] target/arm: Implement SVE2 bitwise exclusive-or interleaved, Richard Henderson, 2020/03/26