[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops
From: juha . riihimaki
Subject: [Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops
Date: Sat, 24 Oct 2009 15:19:08 +0300
From: Juha Riihimäki <address@hidden>
Reduce the number of TCG ops generated for NEON vld/vst instructions
by simplifying the code generation.
Signed-off-by: Juha Riihimäki <address@hidden>
---
target-arm/translate.c | 67 ++++++++++++++++++++++++-----------------------
1 files changed, 34 insertions(+), 33 deletions(-)
diff --git a/target-arm/translate.c b/target-arm/translate.c
index f262758..55d6377 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3708,6 +3708,7 @@ static int disas_neon_ls_insn(CPUState * env,
DisasContext *s, uint32_t insn)
TCGv tmp;
TCGv tmp2;
TCGv_i64 tmp64;
+ TCGv stride_var;
if (!vfp_enabled(env))
return 1;
@@ -3729,6 +3730,7 @@ static int disas_neon_ls_insn(CPUState * env,
DisasContext *s, uint32_t insn)
return 1;
load_reg_var(s, addr, rn);
stride = (1 << size) * interleave;
+ stride_var = tcg_const_i32(stride);
for (reg = 0; reg < nregs; reg++) {
if (interleave > 2 || (interleave == 2 && nregs == 2)) {
load_reg_var(s, addr, rn);
@@ -3747,7 +3749,7 @@ static int disas_neon_ls_insn(CPUState * env,
DisasContext *s, uint32_t insn)
neon_load_reg64(tmp64, rd);
gen_st64(tmp64, addr, IS_USER(s));
}
- tcg_gen_addi_i32(addr, addr, stride);
+ tcg_gen_add_i32(addr, addr, stride_var);
} else {
for (pass = 0; pass < 2; pass++) {
if (size == 2) {
@@ -3758,58 +3760,57 @@ static int disas_neon_ls_insn(CPUState * env,
DisasContext *s, uint32_t insn)
tmp = neon_load_reg(rd, pass);
gen_st32(tmp, addr, IS_USER(s));
}
- tcg_gen_addi_i32(addr, addr, stride);
+ tcg_gen_add_i32(addr, addr, stride_var);
} else if (size == 1) {
if (load) {
tmp = gen_ld16u(addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
+ tcg_gen_add_i32(addr, addr, stride_var);
tmp2 = gen_ld16u(addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- gen_bfi(tmp, tmp, tmp2, 16, 0xffff);
+ tcg_gen_add_i32(addr, addr, stride_var);
+ tcg_gen_shli_i32(tmp2, tmp2, 16);
+ tcg_gen_or_i32(tmp, tmp, tmp2);
dead_tmp(tmp2);
neon_store_reg(rd, pass, tmp);
} else {
tmp = neon_load_reg(rd, pass);
- tmp2 = new_tmp();
- tcg_gen_shri_i32(tmp2, tmp, 16);
- gen_st16(tmp, addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- gen_st16(tmp2, addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
+ tcg_gen_qemu_st16(tmp, addr, IS_USER(s));
+ tcg_gen_add_i32(addr, addr, stride_var);
+ tcg_gen_shri_i32(tmp, tmp, 16);
+ tcg_gen_qemu_st16(tmp, addr, IS_USER(s));
+ tcg_gen_add_i32(addr, addr, stride_var);
+ dead_tmp(tmp);
}
} else /* size == 0 */ {
if (load) {
- TCGV_UNUSED(tmp2);
- for (n = 0; n < 4; n++) {
- tmp = gen_ld8u(addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- if (n == 0) {
- tmp2 = tmp;
- } else {
- gen_bfi(tmp2, tmp2, tmp, n * 8, 0xff);
- dead_tmp(tmp);
- }
+ tmp = gen_ld8u(addr, IS_USER(s));
+ tcg_gen_add_i32(addr, addr, stride_var);
+ for (n = 1; n < 4; n++) {
+ tmp2 = gen_ld8u(addr, IS_USER(s));
+ tcg_gen_add_i32(addr, addr, stride_var);
+ tcg_gen_shli_i32(tmp2, tmp2, n * 8);
+ tcg_gen_or_i32(tmp, tmp, tmp2);
+ dead_tmp(tmp2);
}
- neon_store_reg(rd, pass, tmp2);
+ neon_store_reg(rd, pass, tmp);
} else {
- tmp2 = neon_load_reg(rd, pass);
- for (n = 0; n < 4; n++) {
- tmp = new_tmp();
- if (n == 0) {
- tcg_gen_mov_i32(tmp, tmp2);
- } else {
- tcg_gen_shri_i32(tmp, tmp2, n * 8);
- }
- gen_st8(tmp, addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
+ tmp2 = tcg_const_i32(8);
+ tmp = neon_load_reg(rd, pass);
+ for (n = 0; n < 3; n++) {
+ tcg_gen_qemu_st8(tmp, addr, IS_USER(s));
+ tcg_gen_add_i32(addr, addr, stride_var);
+ tcg_gen_shr_i32(tmp, tmp, tmp2);
}
- dead_tmp(tmp2);
+ tcg_gen_qemu_st8(tmp, addr, IS_USER(s));
+ tcg_gen_add_i32(addr, addr, stride_var);
+ dead_tmp(tmp);
+ tcg_temp_free_i32(tmp2);
}
}
}
}
rd += spacing;
}
+ tcg_temp_free_i32(stride_var);
stride = nregs * 8;
} else {
size = (insn >> 10) & 3;
--
1.6.5
- Re: [Qemu-devel] [PATCH v2 03/10] target-arm: allow modifying vfp fpexc en bit only, (continued)
[Qemu-devel] [PATCH v2 07/10] target-arm: optimize thumb2 load/store multiple ops, juha . riihimaki, 2009/10/24
[Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops, juha . riihimaki <=
- Re: [Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops, Laurent Desnogues, 2009/10/25
- Re: [Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops, Juha.Riihimaki, 2009/10/26
- Re: [Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops, Laurent Desnogues, 2009/10/26
- Re: [Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops, Aurelien Jarno, 2009/10/26
- Re: [Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops, Juha.Riihimaki, 2009/10/29
- Re: [Qemu-devel] [PATCH v2 09/10] target-arm: optimize neon vld/vst ops, Laurent Desnogues, 2009/10/29
[Qemu-devel] [PATCH v2 06/10] target-arm: fix neon vsri, vshl and vsli ops, juha . riihimaki, 2009/10/24
[Qemu-devel] [PATCH v2 10/10] target-arm: fix neon shift helper functions, juha . riihimaki, 2009/10/24