[PATCH v7 78/92] target/arm: Implement SVE2 LD1RO

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v7 78/92] target/arm: Implement SVE2 LD1RO

From:	Richard Henderson
Subject:	[PATCH v7 78/92] target/arm: Implement SVE2 LD1RO
Date:	Mon, 24 May 2021 18:03:44 -0700

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
v7: Fix replication and tail clearing vs e2e7168a214.
---
 target/arm/sve.decode      |  4 ++
 target/arm/translate-sve.c | 93 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 5a1cceccb6..884c5358eb 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1126,11 +1126,15 @@ LD_zpri         1010010 .. nreg:2 0.... 111 ... ..... .....     @rpri_load_msz
 # SVE load and broadcast quadword (scalar plus scalar)
 LD1RQ_zprr      1010010 .. 00 ..... 000 ... ..... ..... \
                 @rprr_load_msz nreg=0
+LD1RO_zprr      1010010 .. 01 ..... 000 ... ..... ..... \
+                @rprr_load_msz nreg=0
 
 # SVE load and broadcast quadword (scalar plus immediate)
 # LD1RQB, LD1RQH, LD1RQS, LD1RQD
 LD1RQ_zpri      1010010 .. 00 0.... 001 ... ..... ..... \
                 @rpri_load_msz nreg=0
+LD1RO_zpri      1010010 .. 01 0.... 001 ... ..... ..... \
+                @rpri_load_msz nreg=0
 
 # SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets)
 PRF             1000010 00 -1 ----- 0-- --- ----- 0 ----
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index a213450583..1dcdbac0af 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -5643,6 +5643,99 @@ static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a)
     return true;
 }
 
+static void do_ldro(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype)
+{
+    unsigned vsz = vec_full_reg_size(s);
+    unsigned vsz_r32;
+    TCGv_ptr t_pg;
+    int poff, doff;
+
+    if (vsz < 32) {
+        /*
+         * Note that this UNDEFINED check comes after CheckSVEEnabled()
+         * in the ARM pseudocode, which is the sve_access_check() done
+         * in our caller.  We should not now return false from the caller.
+         */
+        unallocated_encoding(s);
+        return;
+    }
+
+    /* Load the first octaword using the normal predicated load helpers.  */
+
+    poff = pred_full_reg_offset(s, pg);
+    if (vsz > 32) {
+        /*
+         * Zero-extend the first 32 bits of the predicate into a temporary.
+         * This avoids triggering an assert making sure we don't have bits
+         * set within a predicate beyond VQ, but we have lowered VQ to 2
+         * for this load operation.
+         */
+        TCGv_i64 tmp = tcg_temp_new_i64();
+#ifdef HOST_WORDS_BIGENDIAN
+        poff += 4;
+#endif
+        tcg_gen_ld32u_i64(tmp, cpu_env, poff);
+
+        poff = offsetof(CPUARMState, vfp.preg_tmp);
+        tcg_gen_st_i64(tmp, cpu_env, poff);
+        tcg_temp_free_i64(tmp);
+    }
+
+    t_pg = tcg_temp_new_ptr();
+    tcg_gen_addi_ptr(t_pg, cpu_env, poff);
+
+    gen_helper_gvec_mem *fn
+        = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0];
+    fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(32, 32, zt)));
+
+    tcg_temp_free_ptr(t_pg);
+
+    /*
+     * Replicate that first octaword.
+     * The replication happens in units of 32; if the full vector size
+     * is not a multiple of 32, the final bits are zeroed.
+     */
+    doff = vec_full_reg_offset(s, zt);
+    vsz_r32 = QEMU_ALIGN_DOWN(vsz, 32);
+    if (vsz >= 64) {
+        tcg_gen_gvec_dup_mem(5, doff + 32, doff, vsz_r32 - 32, vsz_r32 - 32);
+    }
+    vsz -= vsz_r32;
+    if (vsz) {
+        tcg_gen_gvec_dup_imm(MO_64, doff + vsz_r32, vsz, vsz, 0);
+    }
+}
+
+static bool trans_LD1RO_zprr(DisasContext *s, arg_rprr_load *a)
+{
+    if (!dc_isar_feature(aa64_sve_f64mm, s)) {
+        return false;
+    }
+    if (a->rm == 31) {
+        return false;
+    }
+    if (sve_access_check(s)) {
+        TCGv_i64 addr = new_tmp_a64(s);
+        tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype));
+        tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+        do_ldro(s, a->rd, a->pg, addr, a->dtype);
+    }
+    return true;
+}
+
+static bool trans_LD1RO_zpri(DisasContext *s, arg_rpri_load *a)
+{
+    if (!dc_isar_feature(aa64_sve_f64mm, s)) {
+        return false;
+    }
+    if (sve_access_check(s)) {
+        TCGv_i64 addr = new_tmp_a64(s);
+        tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 32);
+        do_ldro(s, a->rd, a->pg, addr, a->dtype);
+    }
+    return true;
+}
+
 /* Load and broadcast element.  */
 static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a)
 {
-- 
2.25.1

[Prev in Thread]

Current Thread

[Next in Thread]

[PATCH v7 62/92] target/arm: Implement SVE2 complex integer multiply-add (indexed), (continued)
- [PATCH v7 62/92] target/arm: Implement SVE2 complex integer multiply-add (indexed), Richard Henderson, 2021/05/24
- [PATCH v7 71/92] target/arm: Implement SVE2 TBL, TBX, Richard Henderson, 2021/05/24
- [PATCH v7 46/92] target/arm: Implement SVE2 FMMLA, Richard Henderson, 2021/05/24
- [PATCH v7 72/92] target/arm: Implement SVE2 FCVTNT, Richard Henderson, 2021/05/24
- [PATCH v7 74/92] target/arm: Implement SVE2 FCVTXNT, FCVTX, Richard Henderson, 2021/05/24
- [PATCH v7 75/92] target/arm: Implement SVE2 FLOGB, Richard Henderson, 2021/05/24
- [PATCH v7 73/92] target/arm: Implement SVE2 FCVTLT, Richard Henderson, 2021/05/24
- [PATCH v7 76/92] target/arm: Share table of sve load functions, Richard Henderson, 2021/05/24
- [PATCH v7 77/92] target/arm: Tidy do_ldrq, Richard Henderson, 2021/05/24
- [PATCH v7 79/92] target/arm: Implement 128-bit ZIP, UZP, TRN, Richard Henderson, 2021/05/24
- [PATCH v7 78/92] target/arm: Implement SVE2 LD1RO, Richard Henderson <=
- [PATCH v7 82/92] target/arm: Implement SVE2 fp multiply-add long, Richard Henderson, 2021/05/24
- [PATCH v7 80/92] target/arm: Implement SVE2 bitwise shift immediate, Richard Henderson, 2021/05/24
- [PATCH v7 81/92] target/arm: Move endian adjustment macros to vec_internal.h, Richard Henderson, 2021/05/24
- [PATCH v7 83/92] target/arm: Implement aarch64 SUDOT, USDOT, Richard Henderson, 2021/05/24
- [PATCH v7 85/92] target/arm: Remove unused fpst from VDOT_scalar, Richard Henderson, 2021/05/24
- [PATCH v7 87/92] target/arm: Split out do_neon_ddda, Richard Henderson, 2021/05/24
- [PATCH v7 84/92] target/arm: Split out do_neon_ddda_fpst, Richard Henderson, 2021/05/24
- [PATCH v7 88/92] target/arm: Split decode of VSDOT and VUDOT, Richard Henderson, 2021/05/24
- [PATCH v7 89/92] target/arm: Implement aarch32 VSUDOT, VUSDOT, Richard Henderson, 2021/05/24
- [PATCH v7 86/92] target/arm: Fix decode for VDOT (indexed), Richard Henderson, 2021/05/24

Prev by Date: [PATCH v7 79/92] target/arm: Implement 128-bit ZIP, UZP, TRN
Next by Date: [PATCH v7 82/92] target/arm: Implement SVE2 fp multiply-add long
Previous by thread: [PATCH v7 79/92] target/arm: Implement 128-bit ZIP, UZP, TRN
Next by thread: [PATCH v7 82/92] target/arm: Implement SVE2 fp multiply-add long
Index(es):
- Date
- Thread