From: Peter Maydell
Subject: [PULL 27/44] target/arm: Implement MVE saturating doubling multiply accumulates
Date: Wed, 25 Aug 2021 11:35:17 +0100
Implement the MVE saturating doubling multiply accumulate insns
VQDMLAH, VQRDMLAH, VQDMLASH and VQRDMLASH. These perform a multiply,
double, add the accumulator shifted by the element size, possibly
round, saturate to twice the element size, then take the high half of
the result. The *MLAH insns do vector * scalar + vector, and the
*MLASH insns do vector * vector + scalar.
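(Illustration only, not part of the patch: a minimal standalone C model of one
16-bit element of VQDMLAH/VQRDMLAH, following the same arithmetic as the
do_vqdmlah_h helper added below. The function name qdmlah_h_model and the
main() driver are made up for this sketch.)

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of one 16-bit element: multiply, double, add the accumulator
 * shifted up by the element size, optionally round, saturate to 32 bits,
 * then take the high half.
 */
static int16_t qdmlah_h_model(int16_t a, int16_t b, int16_t c,
                              bool round, bool *sat)
{
    int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round ? 1 << 15 : 0);
    if (r > INT32_MAX) {
        *sat = true;
        r = INT32_MAX;
    } else if (r < INT32_MIN) {
        *sat = true;
        r = INT32_MIN;
    }
    return r >> 16;
}

int main(void)
{
    bool sat = false;
    /* 0x4000 * 0x4000 * 2 + (3 << 16), high half: 0x2000 + 3 = 8195 */
    printf("%d\n", qdmlah_h_model(0x4000, 0x4000, 3, false, &sat));
    /* INT16_MIN * INT16_MIN * 2 exceeds INT32_MAX, so it saturates */
    printf("%d sat=%d\n", qdmlah_h_model(INT16_MIN, INT16_MIN, 0, false, &sat), sat);
    return 0;
}

For the *MLASH forms the same per-element function is reused with the vector
element and the scalar swapped in the accumulator role, as the DO_VQDMLASH_*
macros in the patch show.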
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
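(Also an illustration, not part of the patch: the block comment in
do_vqdmlah_w in the mve_helper.c hunk below explains why half of the rounding
constant and half of "c << esize" are added before the doubling. The
standalone sketch that follows works through one concrete case; it assumes a
GCC/Clang host for __builtin_add_overflow, whose semantics match the
sadd64_overflow helper used in the patch.)

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* One concrete case: a = -1, b = 1 << 30, c = INT32_MIN, with rounding.
 * The exact value of 2*a*b + ((int64_t)c << 32) + (1 << 31) is INT64_MIN,
 * which is representable, so architecturally there is no saturation and
 * the result is INT32_MIN.
 */
int main(void)
{
    int64_t m1 = (int64_t)-1 * (1 << 30);   /* a * b = -2^30 */
    int64_t m2 = (int64_t)INT32_MIN << 31;  /* half of (c << 32) = -2^62 */
    int64_t r;
    bool ovf;

    /* Doubling first: the sum m1 + m2 doubled is already below INT64_MIN,
     * so a saturating add sequence would report a spurious overflow here,
     * even though adding the rounding constant brings it back in range.
     */
    ovf = __builtin_add_overflow(m1 + m2, m1 + m2, &r);
    printf("double first: overflow=%d\n", ovf);          /* overflow=1 */

    /* The patch's order: add half the rounding constant, then double. */
    ovf = __builtin_add_overflow(m1 + m2, (int64_t)1 << 30, &r) ||
          __builtin_add_overflow(r, r, &r);
    printf("round first:  overflow=%d result=%d\n",
           ovf, (int32_t)(r >> 32));        /* overflow=0 result=INT32_MIN */
    return 0;
}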
target/arm/helper-mve.h | 16 +++++++
target/arm/mve.decode | 5 ++
target/arm/mve_helper.c | 95 ++++++++++++++++++++++++++++++++++++++
target/arm/translate-mve.c | 4 ++
4 files changed, 120 insertions(+)
diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index 328e31e2665..2f54396b2df 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -375,6 +375,22 @@ DEF_HELPER_FLAGS_4(mve_vmlasb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vmlash, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vmlasw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlahb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlahh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlahw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmlahb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmlahh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmlahw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqdmlashb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlashh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlashw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmlashb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmlashh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmlashw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index cd9c806a11c..7a6de3991b6 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -416,6 +416,11 @@ VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
VMLA 111- 1110 0 . .. ... 1 ... 0 1110 . 100 .... @2scalar
VMLAS 111- 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar
+VQRDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 100 .... @2scalar
+VQRDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 100 .... @2scalar
+VQDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 110 .... @2scalar
+VQDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 110 .... @2scalar
+
# Vector add across vector
{
VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index 8004b9bb728..a69fcd2243c 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -964,6 +964,28 @@ DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
mve_advance_vpt(env); \
}
+#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ mergemask(&d[H##ESIZE(e)], \
+ FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \
+ mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
/* provide unsigned 2-op scalar helpers for all sizes */
#define DO_2OP_SCALAR_U(OP, FN) \
DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \
@@ -1008,6 +1030,79 @@ DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
+static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat)
+{
+ int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7);
+ return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
+}
+
+static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c,
+ int round, bool *sat)
+{
+ int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15);
+ return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
+}
+
+static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c,
+ int round, bool *sat)
+{
+ /*
+ * Architecturally we should do the entire add, double, round
+ * and then check for saturation. We do three saturating adds,
+ * but we need to be careful about the order. If the first
+ * m1 + m2 saturates then it's impossible for the *2+rc to
+ * bring it back into the non-saturated range. However, if
+ * m1 + m2 is negative then it's possible that doing the doubling
+ * would take the intermediate result below INT64_MIN and the
+ * addition of the rounding constant then brings it back in range.
+ * So we add half the rounding constant and half the "c << esize"
+ * before doubling rather than adding the rounding constant after
+ * the doubling.
+ */
+ int64_t m1 = (int64_t)a * b;
+ int64_t m2 = (int64_t)c << 31;
+ int64_t r;
+ if (sadd64_overflow(m1, m2, &r) ||
+ sadd64_overflow(r, (round << 30), &r) ||
+ sadd64_overflow(r, r, &r)) {
+ *sat = true;
+ return r < 0 ? INT32_MAX : INT32_MIN;
+ }
+ return r >> 32;
+}
+
+/*
+ * The *MLAH insns are vector * scalar + vector;
+ * the *MLASH insns are vector * vector + scalar
+ */
+#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S)
+#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S)
+#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S)
+#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S)
+#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S)
+#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S)
+
+#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S)
+#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S)
+#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S)
+#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S)
+#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S)
+#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S)
+
+DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B)
+DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H)
+DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W)
+
+DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B)
+DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H)
+DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W)
+
/* Vector by scalar plus vector */
#define DO_VMLA(D, N, M) ((N) * (M) + (D))
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index f8899af352d..e3e115c1aa9 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -622,6 +622,10 @@ DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar)
DO_2OP_SCALAR(VBRSR, vbrsr)
DO_2OP_SCALAR(VMLA, vmla)
DO_2OP_SCALAR(VMLAS, vmlas)
+DO_2OP_SCALAR(VQDMLAH, vqdmlah)
+DO_2OP_SCALAR(VQRDMLAH, vqrdmlah)
+DO_2OP_SCALAR(VQDMLASH, vqdmlash)
+DO_2OP_SCALAR(VQRDMLASH, vqrdmlash)
static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a)
{
--
2.20.1