[PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{

From:	Richard Henderson
Subject:	[PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r
Date:	Wed, 15 Feb 2023 17:08:40 -1000

Round len_align to 16 instead of 8, handling an odd 8-byte as part
of the tail.  Use MO_ATOM_NONE to indicate that all of these memory
ops have only byte atomicity.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/translate-sve.c | 107 ++++++++++++++++++++++++++++---------
 1 file changed, 81 insertions(+), 26 deletions(-)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 621a2abb22..f3d5e79dd2 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -4312,11 +4312,12 @@ TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz,
 void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
                  int len, int rn, int imm)
 {
-    int len_align = QEMU_ALIGN_DOWN(len, 8);
-    int len_remain = len % 8;
-    int nparts = len / 8 + ctpop8(len_remain);
+    int len_align = QEMU_ALIGN_DOWN(len, 16);
+    int len_remain = len % 16;
+    int nparts = len / 16 + ctpop8(len_remain);
     int midx = get_mem_index(s);
     TCGv_i64 dirty_addr, clean_addr, t0, t1;
+    TCGv_i128 t16;
 
     dirty_addr = tcg_temp_new_i64();
     tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm);
@@ -4334,12 +4335,20 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int 
vofs,
         int i;
 
         t0 = tcg_temp_new_i64();
-        for (i = 0; i < len_align; i += 8) {
-            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ);
+        t1 = tcg_temp_new_i64();
+        t16 = tcg_temp_new_i128();
+
+        for (i = 0; i < len_align; i += 16) {
+            tcg_gen_qemu_ld_i128(t16, clean_addr, midx,
+                                 MO_LE | MO_128 | MO_ATOM_NONE);
+            tcg_gen_extr_i128_i64(t0, t1, t16);
             tcg_gen_st_i64(t0, base, vofs + i);
-            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+            tcg_gen_st_i64(t1, base, vofs + i + 8);
+            tcg_gen_addi_i64(clean_addr, clean_addr, 16);
         }
         tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+        tcg_temp_free_i128(t16);
     } else {
         TCGLabel *loop = gen_new_label();
         TCGv_ptr tp, i = tcg_const_local_ptr(0);
@@ -4357,16 +4366,25 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int 
vofs,
 
         gen_set_label(loop);
 
-        t0 = tcg_temp_new_i64();
-        tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ);
-        tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+        t16 = tcg_temp_new_i128();
+        tcg_gen_qemu_ld_i128(t16, clean_addr, midx,
+                             MO_LE | MO_128 | MO_ATOM_NONE);
+        tcg_gen_addi_i64(clean_addr, clean_addr, 16);
 
         tp = tcg_temp_new_ptr();
         tcg_gen_add_ptr(tp, base, i);
-        tcg_gen_addi_ptr(i, i, 8);
+        tcg_gen_addi_ptr(i, i, 16);
+
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        tcg_gen_extr_i128_i64(t0, t1, t16);
+        tcg_temp_free_i128(t16);
+
         tcg_gen_st_i64(t0, tp, vofs);
-        tcg_temp_free_ptr(tp);
+        tcg_gen_st_i64(t1, tp, vofs + 8);
         tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+        tcg_temp_free_ptr(tp);
 
         tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
         tcg_temp_free_ptr(i);
@@ -4381,6 +4399,17 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int 
vofs,
      * Predicate register loads can be any multiple of 2.
      * Note that we still store the entire 64-bit unit into cpu_env.
      */
+    if (len_remain >= 8) {
+        t0 = tcg_temp_new_i64();
+        tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
+        tcg_gen_st_i64(t0, base, vofs + len_align);
+        len_remain -= 8;
+        len_align += 8;
+        if (len_remain) {
+            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+        }
+        tcg_temp_free_i64(t0);
+    }
     if (len_remain) {
         t0 = tcg_temp_new_i64();
         switch (len_remain) {
@@ -4388,14 +4417,14 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int 
vofs,
         case 4:
         case 8:
             tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
-                                MO_LE | ctz32(len_remain));
+                                MO_LE | ctz32(len_remain) | MO_ATOM_NONE);
             break;
 
         case 6:
             t1 = tcg_temp_new_i64();
-            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL);
+            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE);
             tcg_gen_addi_i64(clean_addr, clean_addr, 4);
-            tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW);
+            tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW | MO_ATOM_NONE);
             tcg_gen_deposit_i64(t0, t0, t1, 32, 32);
             tcg_temp_free_i64(t1);
             break;
@@ -4412,11 +4441,12 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int 
vofs,
 void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
                  int len, int rn, int imm)
 {
-    int len_align = QEMU_ALIGN_DOWN(len, 8);
-    int len_remain = len % 8;
-    int nparts = len / 8 + ctpop8(len_remain);
+    int len_align = QEMU_ALIGN_DOWN(len, 16);
+    int len_remain = len % 16;
+    int nparts = len / 16 + ctpop8(len_remain);
     int midx = get_mem_index(s);
-    TCGv_i64 dirty_addr, clean_addr, t0;
+    TCGv_i64 dirty_addr, clean_addr, t0, t1;
+    TCGv_i128 t16;
 
     dirty_addr = tcg_temp_new_i64();
     tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm);
@@ -4435,12 +4465,19 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int 
vofs,
         int i;
 
         t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        t16 = tcg_temp_new_i128();
         for (i = 0; i < len_align; i += 8) {
             tcg_gen_ld_i64(t0, base, vofs + i);
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ);
-            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+            tcg_gen_ld_i64(t1, base, vofs + i + 8);
+            tcg_gen_concat_i64_i128(t16, t0, t1);
+            tcg_gen_qemu_st_i128(t16, clean_addr, midx,
+                                 MO_LE | MO_128 | MO_ATOM_NONE);
+            tcg_gen_addi_i64(clean_addr, clean_addr, 16);
         }
         tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+        tcg_temp_free_i128(t16);
     } else {
         TCGLabel *loop = gen_new_label();
         TCGv_ptr tp, i = tcg_const_local_ptr(0);
@@ -4459,15 +4496,22 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int 
vofs,
         gen_set_label(loop);
 
         t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
         tp = tcg_temp_new_ptr();
         tcg_gen_add_ptr(tp, base, i);
         tcg_gen_ld_i64(t0, tp, vofs);
-        tcg_gen_addi_ptr(i, i, 8);
+        tcg_gen_ld_i64(t1, tp, vofs + 8);
+        tcg_gen_addi_ptr(i, i, 16);
         tcg_temp_free_ptr(tp);
 
-        tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ);
-        tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+        t16 = tcg_temp_new_i128();
+        tcg_gen_concat_i64_i128(t16, t0, t1);
         tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+
+        tcg_gen_qemu_st_i128(t16, clean_addr, midx, MO_LEUQ);
+        tcg_temp_free_i128(t16);
+        tcg_gen_addi_i64(clean_addr, clean_addr, 16);
 
         tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
         tcg_temp_free_ptr(i);
@@ -4479,6 +4523,17 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int 
vofs,
     }
 
     /* Predicate register stores can be any multiple of 2.  */
+    if (len_remain >= 8) {
+        t0 = tcg_temp_new_i64();
+        tcg_gen_st_i64(t0, base, vofs + len_align);
+        tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
+        len_remain -= 8;
+        len_align += 8;
+        if (len_remain) {
+            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+        }
+        tcg_temp_free_i64(t0);
+    }
     if (len_remain) {
         t0 = tcg_temp_new_i64();
         tcg_gen_ld_i64(t0, base, vofs + len_align);
@@ -4488,14 +4543,14 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int 
vofs,
         case 4:
         case 8:
             tcg_gen_qemu_st_i64(t0, clean_addr, midx,
-                                MO_LE | ctz32(len_remain));
+                                MO_LE | ctz32(len_remain) | MO_ATOM_NONE);
             break;
 
         case 6:
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE);
             tcg_gen_addi_i64(clean_addr, clean_addr, 4);
             tcg_gen_shri_i64(t0, t0, 32);
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW | MO_ATOM_NONE);
             break;
 
         default:
-- 
2.34.1

[Prev in Thread]

Current Thread

[Next in Thread]

[PATCH v1 00/19] target/arm: Implement FEAT_LSE2, Richard Henderson, 2023/02/15
- [PATCH v1 01/19] target/arm: Make cpu_exclusive_high hold the high bits, Richard Henderson, 2023/02/15
  - Re: [PATCH v1 01/19] target/arm: Make cpu_exclusive_high hold the high bits, Peter Maydell, 2023/02/23
- [PATCH v1 02/19] target/arm: Use tcg_gen_qemu_ld_i128 for LDXP, Richard Henderson, 2023/02/15
- [PATCH v1 03/19] target/arm: Use tcg_gen_qemu_{st, ld}_i128 for do_fp_{st, ld}, Richard Henderson, 2023/02/15
  - Re: [PATCH v1 03/19] target/arm: Use tcg_gen_qemu_{st, ld}_i128 for do_fp_{st, ld}, Peter Maydell, 2023/02/23
- [PATCH v1 04/19] target/arm: Use tcg_gen_qemu_st_i128 for STZG, STZ2G, Richard Henderson, 2023/02/15
  - Re: [PATCH v1 04/19] target/arm: Use tcg_gen_qemu_st_i128 for STZG, STZ2G, Peter Maydell, 2023/02/23
- [PATCH v1 07/19] target/arm: Add feature test for FEAT_LSE2, Richard Henderson, 2023/02/15
  - Re: [PATCH v1 07/19] target/arm: Add feature test for FEAT_LSE2, Peter Maydell, 2023/02/23
- [PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r, Richard Henderson <=
  - Re: [PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r, Peter Maydell, 2023/02/23
- [PATCH v1 06/19] target/arm: Sink gen_mte_check1 into load/store_exclusive, Richard Henderson, 2023/02/15
  - Re: [PATCH v1 06/19] target/arm: Sink gen_mte_check1 into load/store_exclusive, Peter Maydell, 2023/02/23
- [PATCH v1 08/19] target/arm: Add atom_data to DisasContext, Richard Henderson, 2023/02/15
  - Re: [PATCH v1 08/19] target/arm: Add atom_data to DisasContext, Peter Maydell, 2023/02/23
- [PATCH v1 09/19] target/arm: Load/store integer pair with one tcg operation, Richard Henderson, 2023/02/15
  - Re: [PATCH v1 09/19] target/arm: Load/store integer pair with one tcg operation, Peter Maydell, 2023/02/23
- [PATCH v1 10/19] target/arm: Hoist finalize_memop out of do_gpr_{ld, st}, Richard Henderson, 2023/02/15
  - Re: [PATCH v1 10/19] target/arm: Hoist finalize_memop out of do_gpr_{ld, st}, Peter Maydell, 2023/02/23
- [PATCH v1 11/19] target/arm: Hoist finalize_memop out of do_fp_{ld, st}, Richard Henderson, 2023/02/15

Prev by Date: [PATCH v1 07/19] target/arm: Add feature test for FEAT_LSE2
Next by Date: [PATCH v1 06/19] target/arm: Sink gen_mte_check1 into load/store_exclusive
Previous by thread: Re: [PATCH v1 07/19] target/arm: Add feature test for FEAT_LSE2
Next by thread: Re: [PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r
Index(es):
- Date
- Thread