[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PULL 30/47] target/arm: Implement fp16 for Neon VFMA, VMFS
From: |
Peter Maydell |
Subject: |
[PULL 30/47] target/arm: Implement fp16 for Neon VFMA, VMFS |
Date: |
Tue, 1 Sep 2020 16:18:06 +0100 |
Convert the neon floating-point vector operations VFMA and VFMS
to use a gvec helper, and use this to implement the fp16 case.
This is the last use of do_3same_fp() so we can now delete
that function.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-32-peter.maydell@linaro.org
---
target/arm/helper.h | 6 +++
target/arm/vec_helper.c | 33 +++++++++++-
target/arm/translate-neon.c.inc | 92 +--------------------------------
3 files changed, 40 insertions(+), 91 deletions(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 6f6c96711b7..e6f65c74614 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -665,6 +665,12 @@ DEF_HELPER_FLAGS_5(gvec_fmla_s, TCG_CALL_NO_RWG, void,
ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fmls_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fmls_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_vfma_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_vfma_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 5da5969c1c0..995f09fb71e 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -867,7 +867,32 @@ static float32 float32_mulsub_nf(float32 dest, float32
op1, float32 op2,
return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}
-#define DO_MULADD(NAME, FUNC, TYPE) \
+/* Fused versions; these have the semantics Neon VFMA/VFMS want */
+static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_muladd(op1, op2, dest, 0, stat);
+}
+
+static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_muladd(op1, op2, dest, 0, stat);
+}
+
+static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
+}
+
+static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
+}
+
+#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
intptr_t i, oprsz = simd_oprsz(desc); \
@@ -884,6 +909,12 @@ DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
+DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
+DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
+
+DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
+DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
+
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
* For AdvSIMD, there is of course only one such vector segment.
*/
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index ab39127026b..4f2378a19b3 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -1033,55 +1033,6 @@ DO_3SAME_PAIR(VPADD, padd_u)
DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
-static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
- bool reads_vd)
-{
- /*
- * FP operations handled elementwise 32 bits at a time.
- * If reads_vd is true then the old value of Vd will be
- * loaded before calling the callback function. This is
- * used for multiply-accumulate type operations.
- */
- TCGv_i32 tmp, tmp2;
- int pass;
-
- if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vn | a->vm) & 0x10)) {
- return false;
- }
-
- if ((a->vn | a->vm | a->vd) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);
- for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
- tmp = neon_load_reg(a->vn, pass);
- tmp2 = neon_load_reg(a->vm, pass);
- if (reads_vd) {
- TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
- fn(tmp_rd, tmp, tmp2, fpstatus);
- neon_store_reg(a->vd, pass, tmp_rd);
- tcg_temp_free_i32(tmp);
- } else {
- fn(tmp, tmp, tmp2, fpstatus);
- neon_store_reg(a->vd, pass, tmp);
- }
- tcg_temp_free_i32(tmp2);
- }
- tcg_temp_free_ptr(fpstatus);
- return true;
-}
-
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \
static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
uint32_t rn_ofs, uint32_t rm_ofs, \
@@ -1121,6 +1072,8 @@ DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s,
gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
+DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
+DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
@@ -1197,47 +1150,6 @@ static bool trans_VRSQRTS_fp_3s(DisasContext *s,
arg_3same *a)
return do_3same(s, a, gen_VRSQRTS_fp_3s);
}
-static void gen_VFMA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
- TCGv_ptr fpstatus)
-{
- gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
-}
-
-static bool trans_VFMA_fp_3s(DisasContext *s, arg_3same *a)
-{
- if (!dc_isar_feature(aa32_simdfmac, s)) {
- return false;
- }
-
- if (a->size != 0) {
- /* TODO fp16 support */
- return false;
- }
-
- return do_3same_fp(s, a, gen_VFMA_fp_3s, true);
-}
-
-static void gen_VFMS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
- TCGv_ptr fpstatus)
-{
- gen_helper_vfp_negs(vn, vn);
- gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
-}
-
-static bool trans_VFMS_fp_3s(DisasContext *s, arg_3same *a)
-{
- if (!dc_isar_feature(aa32_simdfmac, s)) {
- return false;
- }
-
- if (a->size != 0) {
- /* TODO fp16 support */
- return false;
- }
-
- return do_3same_fp(s, a, gen_VFMS_fp_3s, true);
-}
-
static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
{
/* FP operations handled pairwise 32 bits at a time */
--
2.20.1
- [PULL 16/47] target/arm: Implement VFP vp16 VCVT-with-specified-rounding-mode, (continued)
- [PULL 16/47] target/arm: Implement VFP vp16 VCVT-with-specified-rounding-mode, Peter Maydell, 2020/09/01
- [PULL 22/47] target/arm: Implement FP16 for Neon VADD, VSUB, VABD, VMUL, Peter Maydell, 2020/09/01
- [PULL 21/47] target/arm: Implement VFP fp16 VMOV between gp and halfprec registers, Peter Maydell, 2020/09/01
- [PULL 18/47] target/arm: Implement VFP fp16 VRINT*, Peter Maydell, 2020/09/01
- [PULL 23/47] target/arm: Implement fp16 for Neon VRECPE, VRSQRTE using gvec, Peter Maydell, 2020/09/01
- [PULL 24/47] target/arm: Implement fp16 for Neon VABS, VNEG of floats, Peter Maydell, 2020/09/01
- [PULL 25/47] target/arm: Implement fp16 for VCEQ, VCGE, VCGT comparisons, Peter Maydell, 2020/09/01
- [PULL 27/47] target/arm: Implement fp16 for Neon VMAX, VMIN, Peter Maydell, 2020/09/01
- [PULL 28/47] target/arm: Implement fp16 for Neon VMAXNM, VMINNM, Peter Maydell, 2020/09/01
- [PULL 29/47] target/arm: Implement fp16 for Neon VMLA, VMLS operations, Peter Maydell, 2020/09/01
- [PULL 30/47] target/arm: Implement fp16 for Neon VFMA, VMFS,
Peter Maydell <=
- [PULL 26/47] target/arm: Implement fp16 for VACGE, VACGT, Peter Maydell, 2020/09/01
- [PULL 31/47] target/arm: Implement fp16 for Neon fp compare-vs-0, Peter Maydell, 2020/09/01
- [PULL 33/47] target/arm: Implement fp16 for Neon VRSQRTS, Peter Maydell, 2020/09/01
- [PULL 34/47] target/arm: Implement fp16 for Neon pairwise fp ops, Peter Maydell, 2020/09/01
- [PULL 35/47] target/arm: Implement fp16 for Neon float-integer VCVT, Peter Maydell, 2020/09/01
- [PULL 32/47] target/arm: Implement fp16 for Neon VRECPS, Peter Maydell, 2020/09/01
- [PULL 36/47] target/arm: Convert Neon VCVT fixed-point to gvec, Peter Maydell, 2020/09/01
- [PULL 37/47] target/arm: Implement fp16 for Neon VCVT fixed-point, Peter Maydell, 2020/09/01
- [PULL 39/47] target/arm: Implement fp16 for Neon VRINT-with-specified-rounding-mode, Peter Maydell, 2020/09/01
- [PULL 40/47] target/arm: Implement fp16 for Neon VRINTX, Peter Maydell, 2020/09/01