[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v5 16/19] tcg-arm: Improve scheduling of tcg_out
From: Aurelien Jarno
Subject: Re: [Qemu-devel] [PATCH v5 16/19] tcg-arm: Improve scheduling of tcg_out_tlb_read
Date: Mon, 22 Apr 2013 11:55:21 +0200
User-agent: Mutt/1.5.21 (2010-09-15)
On Sun, Mar 31, 2013 at 03:35:02PM -0700, Richard Henderson wrote:
> The schedule was fully serial, with no possibility for dual issue.
> The old schedule had a minimal issue of 7 cycles; the new schedule
> has a minimal issue of 5 cycles.
>
> Signed-off-by: Richard Henderson <address@hidden>
> ---
> tcg/arm/tcg-target.c | 110
> ++++++++++++++++++++++++++-------------------------
> 1 file changed, 57 insertions(+), 53 deletions(-)
>
> diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
> index 3e62b09..35598a8 100644
> --- a/tcg/arm/tcg-target.c
> +++ b/tcg/arm/tcg-target.c
> @@ -178,18 +178,12 @@ static int target_parse_constraint(TCGArgConstraint
> *ct, const char **pct_str)
> ct->ct |= TCG_CT_REG;
> tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
> #ifdef CONFIG_SOFTMMU
> - /* r0 and r1 will be overwritten when reading the tlb entry,
> + /* r0-r2 will be overwritten when reading the tlb entry,
> so don't use these. */
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
> -#if TARGET_LONG_BITS == 64
> - /* If we're passing env to the helper as r0 and need a regpair
> - * for the address then r2 will be overwritten as we're setting
> - * up the args to the helper.
> - */
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> #endif
> -#endif
> break;
> case 'L':
> ct->ct |= TCG_CT_REG;
> @@ -203,30 +197,16 @@ static int target_parse_constraint(TCGArgConstraint
> *ct, const char **pct_str)
>
> /* qemu_st address & data_reg */
> case 's':
> - ct->ct |= TCG_CT_REG;
> - tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
> - /* r0 and r1 will be overwritten when reading the tlb entry
> - (softmmu only) and doing the byte swapping, so don't
> - use these. */
> - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
> - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
> -#if defined(CONFIG_SOFTMMU) && (TARGET_LONG_BITS == 64)
> - /* Avoid clashes with registers being used for helper args */
> - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
> -#endif
> - break;
> /* qemu_st64 data_reg2 */
> case 'S':
> ct->ct |= TCG_CT_REG;
> tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
> - /* r0 and r1 will be overwritten when reading the tlb entry
> - (softmmu only) and doing the byte swapping, so don't
> - use these. */
> + /* r0-r2 will be overwritten when reading the tlb entry (softmmu
> only)
> + and r0-r1 doing the byte swapping, so don't use these. */
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
It means that the 's' and 'S' constraints are basically now the same. I
think we should therefore keep only one of the two.
> -#ifdef CONFIG_SOFTMMU
> - /* r2 is still needed to load data_reg, so don't use it. */
> +#if defined(CONFIG_SOFTMMU)
> + /* Avoid clashes with registers being used for helper args */
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> #if TARGET_LONG_BITS == 64
> /* Avoid clashes with registers being used for helper args */
> @@ -341,6 +321,8 @@ typedef enum {
> INSN_LDRSB_REG = 0x001000d0,
> INSN_STRB_IMM = 0x04400000,
> INSN_STRB_REG = 0x06400000,
> +
> + INSN_LDRD_IMM = 0x004000d0,
> } ARMInsn;
>
> #define SHIFT_IMM_LSL(im) (((im) << 7) | 0x00)
> @@ -806,15 +788,6 @@ static inline void tcg_out_ld32_12(TCGContext *s, int
> cond, TCGReg rt,
> tcg_out_memop_12(s, cond, INSN_LDR_IMM, rt, rn, imm12, 1, 0);
> }
>
> -/* Offset pre-increment with base writeback. */
> -static inline void tcg_out_ld32_12wb(TCGContext *s, int cond, TCGReg rt,
> - TCGReg rn, int imm12)
> -{
> - /* ldr with writeback and both register equals is UNPREDICTABLE */
> - assert(rd != rn);
> - tcg_out_memop_12(s, cond, INSN_LDR_IMM, rt, rn, imm12, 1, 1);
> -}
> -
> static inline void tcg_out_st32_12(TCGContext *s, int cond, TCGReg rt,
> TCGReg rn, int imm12)
> {
> @@ -1151,47 +1124,78 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg
> argreg,
>
> #define TLB_SHIFT (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
>
> -/* Load and compare a TLB entry, leaving the flags set. Leaves R0 pointing
> +/* Load and compare a TLB entry, leaving the flags set. Leaves R2 pointing
> to the tlb entry. Clobbers R1 and TMP. */
>
> static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
> int s_bits, int tlb_offset)
> {
> + TCGReg base = TCG_AREG0;
> +
> /* Should generate something like the following:
> - * shr r8, addr_reg, #TARGET_PAGE_BITS
> - * and r0, r8, #(CPU_TLB_SIZE - 1) @ Assumption: CPU_TLB_BITS <= 8
> - * add r0, env, r0 lsl #CPU_TLB_ENTRY_BITS
> + * pre-v7:
> + * shr tmp, addr_reg, #TARGET_PAGE_BITS (1)
> + * add r2, env, #off & 0xff00
> + * and r0, tmp, #(CPU_TLB_SIZE - 1) (2)
> + * add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3)
> + * ldr r0, [r2, #off & 0xff]! (4)
> + * tst addr_reg, #s_mask
> + * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS (5)
> + *
> + * v7 (not implemented yet):
> + * ubfx r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS (1)
> + * movw tmp, #~TARGET_PAGE_MASK & ~s_mask
> + * movw r0, #off
> + * add r2, env, r2, lsl #CPU_TLB_ENTRY_BITS (2)
> + * bic tmp, addr_reg, tmp
> + * ldr r0, [r2, r0]! (3)
> + * cmp r0, tmp (4)
> */
> # if CPU_TLB_BITS > 8
> # error
> # endif
> tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
> 0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
> +
> + /* We assume that the offset is contained within 16 bits. */
> + assert((tlb_offset & ~0xffff) == 0);
> + if (tlb_offset > 0xff) {
> + tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
> + (24 << 7) | (tlb_offset >> 8));
> + tlb_offset &= 0xff;
> + base = TCG_REG_R2;
> + }
> +
> tcg_out_dat_imm(s, COND_AL, ARITH_AND,
> TCG_REG_R0, TCG_REG_TMP, CPU_TLB_SIZE - 1);
> - tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_AREG0,
> + tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
> TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
>
> - /* We assume that the offset is contained within 20 bits. */
> - assert((tlb_offset & ~0xfffff) == 0);
> - if (tlb_offset > 0xfff) {
> - tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_REG_R0,
> - 0xa00 | (tlb_offset >> 12));
> - tlb_offset &= 0xfff;
> + /* Load the tlb comparator. Use ldrd if needed and available,
> + but due to how the pointer needs setting up, ldm isn't useful.
> + Base arm5 doesn't have ldrd, but armv5te does. */
> + if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> + tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0,
> + TCG_REG_R2, tlb_offset, 1, 1);
> + } else {
> + tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
> + TCG_REG_R2, tlb_offset, 1, 1);
> + if (TARGET_LONG_BITS == 64) {
> + tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
> + TCG_REG_R2, 4, 1, 0);
> + }
> }
> - tcg_out_ld32_12wb(s, COND_AL, TCG_REG_R1, TCG_REG_R0, tlb_offset);
> - tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R1,
> - TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
>
> /* Check alignment. */
> if (s_bits) {
> - tcg_out_dat_imm(s, COND_EQ, ARITH_TST,
> + tcg_out_dat_imm(s, COND_AL, ARITH_TST,
> 0, addrlo, (1 << s_bits) - 1);
> }
>
> + tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
> + TCG_REG_R0, TCG_REG_TMP,
> SHIFT_IMM_LSL(TARGET_PAGE_BITS));
> +
> if (TARGET_LONG_BITS == 64) {
> - /* XXX: possibly we could use a block data load in the first access.
> */
> - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, 4);
> tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0,
> TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
> }
> @@ -1224,7 +1228,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg
> *args, int opc)
> tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
> offsetof(CPUArchState,
> tlb_table[mem_index][0].addr_read));
>
> - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0,
> + tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
> offsetof(CPUTLBEntry, addend)
> - offsetof(CPUTLBEntry, addr_read));
>
> @@ -1396,7 +1400,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg
> *args, int opc)
> offsetof(CPUArchState,
> tlb_table[mem_index][0].addr_write));
>
> - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0,
> + tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
> offsetof(CPUTLBEntry, addend)
> - offsetof(CPUTLBEntry, addr_write));
>
Besides the comment above, it looks fine to me.
--
Aurelien Jarno GPG: 1024D/F1BCDB73
address@hidden http://www.aurel32.net
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: [Qemu-devel] [PATCH v5 16/19] tcg-arm: Improve scheduling of tcg_out_tlb_read,
Aurelien Jarno <=