[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{
From: |
Richard Henderson |
Subject: |
[PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r |
Date: |
Wed, 15 Feb 2023 17:08:40 -1000 |
Round len_align to 16 instead of 8, handling an odd 8-byte as part
of the tail. Use MO_ATOM_NONE to indicate that all of these memory
ops have only byte atomicity.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/translate-sve.c | 107 ++++++++++++++++++++++++++++---------
1 file changed, 81 insertions(+), 26 deletions(-)
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 621a2abb22..f3d5e79dd2 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -4312,11 +4312,12 @@ TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz,
void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
int len, int rn, int imm)
{
- int len_align = QEMU_ALIGN_DOWN(len, 8);
- int len_remain = len % 8;
- int nparts = len / 8 + ctpop8(len_remain);
+ int len_align = QEMU_ALIGN_DOWN(len, 16);
+ int len_remain = len % 16;
+ int nparts = len / 16 + ctpop8(len_remain);
int midx = get_mem_index(s);
TCGv_i64 dirty_addr, clean_addr, t0, t1;
+ TCGv_i128 t16;
dirty_addr = tcg_temp_new_i64();
tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm);
@@ -4334,12 +4335,20 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int
vofs,
int i;
t0 = tcg_temp_new_i64();
- for (i = 0; i < len_align; i += 8) {
- tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ);
+ t1 = tcg_temp_new_i64();
+ t16 = tcg_temp_new_i128();
+
+ for (i = 0; i < len_align; i += 16) {
+ tcg_gen_qemu_ld_i128(t16, clean_addr, midx,
+ MO_LE | MO_128 | MO_ATOM_NONE);
+ tcg_gen_extr_i128_i64(t0, t1, t16);
tcg_gen_st_i64(t0, base, vofs + i);
- tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ tcg_gen_st_i64(t1, base, vofs + i + 8);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 16);
}
tcg_temp_free_i64(t0);
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i128(t16);
} else {
TCGLabel *loop = gen_new_label();
TCGv_ptr tp, i = tcg_const_local_ptr(0);
@@ -4357,16 +4366,25 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int
vofs,
gen_set_label(loop);
- t0 = tcg_temp_new_i64();
- tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ);
- tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ t16 = tcg_temp_new_i128();
+ tcg_gen_qemu_ld_i128(t16, clean_addr, midx,
+ MO_LE | MO_128 | MO_ATOM_NONE);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 16);
tp = tcg_temp_new_ptr();
tcg_gen_add_ptr(tp, base, i);
- tcg_gen_addi_ptr(i, i, 8);
+ tcg_gen_addi_ptr(i, i, 16);
+
+ t0 = tcg_temp_new_i64();
+ t1 = tcg_temp_new_i64();
+ tcg_gen_extr_i128_i64(t0, t1, t16);
+ tcg_temp_free_i128(t16);
+
tcg_gen_st_i64(t0, tp, vofs);
- tcg_temp_free_ptr(tp);
+ tcg_gen_st_i64(t1, tp, vofs + 8);
tcg_temp_free_i64(t0);
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_ptr(tp);
tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
tcg_temp_free_ptr(i);
@@ -4381,6 +4399,17 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int
vofs,
* Predicate register loads can be any multiple of 2.
* Note that we still store the entire 64-bit unit into cpu_env.
*/
+ if (len_remain >= 8) {
+ t0 = tcg_temp_new_i64();
+ tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
+ tcg_gen_st_i64(t0, base, vofs + len_align);
+ len_remain -= 8;
+ len_align += 8;
+ if (len_remain) {
+ tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ }
+ tcg_temp_free_i64(t0);
+ }
if (len_remain) {
t0 = tcg_temp_new_i64();
switch (len_remain) {
@@ -4388,14 +4417,14 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int
vofs,
case 4:
case 8:
tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
- MO_LE | ctz32(len_remain));
+ MO_LE | ctz32(len_remain) | MO_ATOM_NONE);
break;
case 6:
t1 = tcg_temp_new_i64();
- tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL);
+ tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE);
tcg_gen_addi_i64(clean_addr, clean_addr, 4);
- tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW);
+ tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW | MO_ATOM_NONE);
tcg_gen_deposit_i64(t0, t0, t1, 32, 32);
tcg_temp_free_i64(t1);
break;
@@ -4412,11 +4441,12 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int
vofs,
void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
int len, int rn, int imm)
{
- int len_align = QEMU_ALIGN_DOWN(len, 8);
- int len_remain = len % 8;
- int nparts = len / 8 + ctpop8(len_remain);
+ int len_align = QEMU_ALIGN_DOWN(len, 16);
+ int len_remain = len % 16;
+ int nparts = len / 16 + ctpop8(len_remain);
int midx = get_mem_index(s);
- TCGv_i64 dirty_addr, clean_addr, t0;
+ TCGv_i64 dirty_addr, clean_addr, t0, t1;
+ TCGv_i128 t16;
dirty_addr = tcg_temp_new_i64();
tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm);
@@ -4435,12 +4465,19 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int
vofs,
int i;
t0 = tcg_temp_new_i64();
+ t1 = tcg_temp_new_i64();
+ t16 = tcg_temp_new_i128();
for (i = 0; i < len_align; i += 8) {
tcg_gen_ld_i64(t0, base, vofs + i);
- tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ);
- tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ tcg_gen_ld_i64(t1, base, vofs + i + 8);
+ tcg_gen_concat_i64_i128(t16, t0, t1);
+ tcg_gen_qemu_st_i128(t16, clean_addr, midx,
+ MO_LE | MO_128 | MO_ATOM_NONE);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 16);
}
tcg_temp_free_i64(t0);
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i128(t16);
} else {
TCGLabel *loop = gen_new_label();
TCGv_ptr tp, i = tcg_const_local_ptr(0);
@@ -4459,15 +4496,22 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int
vofs,
gen_set_label(loop);
t0 = tcg_temp_new_i64();
+ t1 = tcg_temp_new_i64();
tp = tcg_temp_new_ptr();
tcg_gen_add_ptr(tp, base, i);
tcg_gen_ld_i64(t0, tp, vofs);
- tcg_gen_addi_ptr(i, i, 8);
+ tcg_gen_ld_i64(t1, tp, vofs + 8);
+ tcg_gen_addi_ptr(i, i, 16);
tcg_temp_free_ptr(tp);
- tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ);
- tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ t16 = tcg_temp_new_i128();
+ tcg_gen_concat_i64_i128(t16, t0, t1);
tcg_temp_free_i64(t0);
+ tcg_temp_free_i64(t1);
+
+ tcg_gen_qemu_st_i128(t16, clean_addr, midx, MO_LEUQ);
+ tcg_temp_free_i128(t16);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 16);
tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
tcg_temp_free_ptr(i);
@@ -4479,6 +4523,17 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int
vofs,
}
/* Predicate register stores can be any multiple of 2. */
+ if (len_remain >= 8) {
+ t0 = tcg_temp_new_i64();
+ tcg_gen_st_i64(t0, base, vofs + len_align);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
+ len_remain -= 8;
+ len_align += 8;
+ if (len_remain) {
+ tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+ }
+ tcg_temp_free_i64(t0);
+ }
if (len_remain) {
t0 = tcg_temp_new_i64();
tcg_gen_ld_i64(t0, base, vofs + len_align);
@@ -4488,14 +4543,14 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int
vofs,
case 4:
case 8:
tcg_gen_qemu_st_i64(t0, clean_addr, midx,
- MO_LE | ctz32(len_remain));
+ MO_LE | ctz32(len_remain) | MO_ATOM_NONE);
break;
case 6:
- tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE);
tcg_gen_addi_i64(clean_addr, clean_addr, 4);
tcg_gen_shri_i64(t0, t0, 32);
- tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW | MO_ATOM_NONE);
break;
default:
--
2.34.1
- [PATCH v1 00/19] target/arm: Implement FEAT_LSE2, Richard Henderson, 2023/02/15
- [PATCH v1 01/19] target/arm: Make cpu_exclusive_high hold the high bits, Richard Henderson, 2023/02/15
- [PATCH v1 02/19] target/arm: Use tcg_gen_qemu_ld_i128 for LDXP, Richard Henderson, 2023/02/15
- [PATCH v1 03/19] target/arm: Use tcg_gen_qemu_{st, ld}_i128 for do_fp_{st, ld}, Richard Henderson, 2023/02/15
- [PATCH v1 04/19] target/arm: Use tcg_gen_qemu_st_i128 for STZG, STZ2G, Richard Henderson, 2023/02/15
- [PATCH v1 07/19] target/arm: Add feature test for FEAT_LSE2, Richard Henderson, 2023/02/15
- [PATCH v1 05/19] target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r,
Richard Henderson <=
- [PATCH v1 06/19] target/arm: Sink gen_mte_check1 into load/store_exclusive, Richard Henderson, 2023/02/15
- [PATCH v1 08/19] target/arm: Add atom_data to DisasContext, Richard Henderson, 2023/02/15
- [PATCH v1 09/19] target/arm: Load/store integer pair with one tcg operation, Richard Henderson, 2023/02/15
- [PATCH v1 10/19] target/arm: Hoist finalize_memop out of do_gpr_{ld, st}, Richard Henderson, 2023/02/15
- [PATCH v1 11/19] target/arm: Hoist finalize_memop out of do_fp_{ld, st}, Richard Henderson, 2023/02/15