Re: [Qemu-devel] [PATCH v3 38/39] tcg/arm: Use LDRD to load tlb mask+table


From: Alistair Francis
Subject: Re: [Qemu-devel] [PATCH v3 38/39] tcg/arm: Use LDRD to load tlb mask+table
Date: Fri, 10 May 2019 14:08:58 -0700

On Tue, May 7, 2019 at 5:32 PM Richard Henderson
<address@hidden> wrote:
>
> This changes the code generation for the tlb from e.g.
>
>         ldr      ip, [r6, #-0x10]
>         ldr      r2, [r6, #-0xc]
>         and      ip, ip, r4, lsr #8
>         ldrd     r0, r1, [r2, ip]!
>         ldr      r2, [r2, #0x18]
>
> to
>
>         ldrd     r0, r1, [r6, #-0x10]
>         and      r0, r0, r4, lsr #8
>         ldrd     r2, r3, [r1, r0]!
>         ldr      r1, [r1, #0x18]
>
> for armv7 hosts.  Rearranging the register allocation in
> order to avoid overlap between the two ldrd pairs causes
> the patch to be larger than it ordinarily would be.
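A note on the rearrangement, since it accounts for most of the diff: as I read
the A32 encoding (this is not stated in the patch itself), LDRD always writes
an even/odd register pair, Rt must be even and Rt2 is Rt + 1, so among the
scratch registers r0-r3 the only usable pairs are {r0,r1} and {r2,r3}.
Putting mask/table in {r0,r1} and the comparator in {r2,r3} keeps the two
pairs from overlapping, and it is also why the addend, and the register
returned by tcg_out_tlb_read, moves from r2 to r1.  A rough register-flow
sketch of the new sequence (cmp_off == 0, 64-bit guest shown):

    r0,r1 <- {mask, table}                        ldrd r0, r1, [r6, #-0x10]
    r0    <- r0 & (addrlo >> (PAGE_BITS - ENTRY_BITS))
    r2,r3 <- {cmp lo, cmp hi};  r1 <- &entry      ldrd r2, r3, [r1, r0]!
    r1    <- entry->addend                        ldr  r1, [r1, #0x18]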
>
> Signed-off-by: Richard Henderson <address@hidden>
> ---
> v3: Add QEMU_BUILD_BUG_ON for mask/table ordering; comment fixes.
> ---
>  tcg/arm/tcg-target.inc.c | 92 +++++++++++++++++++++++-----------------
>  1 file changed, 53 insertions(+), 39 deletions(-)
>
> diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
> index ad32b04e13..ac813abfb8 100644
> --- a/tcg/arm/tcg-target.inc.c
> +++ b/tcg/arm/tcg-target.inc.c
> @@ -267,6 +267,7 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> +        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
>  #endif
>          break;
> @@ -1224,6 +1225,10 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
>  QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
>  QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
>
> +/* These offsets are built into the LDRD below.  */
> +QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
> +QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
> +
>  /* Load and compare a TLB entry, leaving the flags set.  Returns the register
>     containing the addend of the tlb entry.  Clobbers R0, R1, R2, TMP.  */
>
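For context on why those two new build-time asserts are load-bearing: a
single LDRD fetches two consecutive 32-bit words, so mask and table can only
be picked up in one instruction if the structure keeps them adjacent, in that
order, at offsets 0 and 4.  A sketch of the assumed layout follows; the field
names come from the asserts above, but the exact types are my guess from the
surrounding code rather than a quote of the header:

    typedef struct CPUTLBDescFast {
        uintptr_t mask;      /* word 0: index mask, pre-shifted by CPU_TLB_ENTRY_BITS */
        CPUTLBEntry *table;  /* word 1 on a 32-bit host: base of the entry array */
    } CPUTLBDescFast;

    /* With that layout, ldrd r0, r1, [env, #fast_off] yields r0 = mask and
     * r1 = table.  If the fields are ever reordered or padded apart, the
     * QEMU_BUILD_BUG_ONs turn it into a compile error rather than a silently
     * wrong TLB lookup. */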
> @@ -1238,47 +1243,54 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>      unsigned s_bits = opc & MO_SIZE;
>      unsigned a_bits = get_alignment_bits(opc);
>
> -    /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
> -    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP, TCG_AREG0, mask_off);
> -    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R2, TCG_AREG0, table_off);
> -
> -    /* Extract the tlb index from the address into TMP.  */
> -    tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, addrlo,
> -                    SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
> -
>      /*
> -     * Add the tlb_table pointer, creating the CPUTLBEntry address in R2.
> -     * Load the tlb comparator into R0/R1 and the fast path addend into R2.
> +     * We don't support inline unaligned accesses, but we can easily
> +     * support overalignment checks.
>       */
> -    if (cmp_off == 0) {
> -       if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> -            tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
> -        } else {
> -            tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
> -        }
> -    } else {
> -        tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
> -                       TCG_REG_R2, TCG_REG_R2, TCG_REG_TMP, 0);
> -        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> -            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
> -        } else {
> -            tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
> -       }
> -    }
> -    if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
> -        tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4);
> -    }
> -
> -    /* Load the tlb addend.  */
> -    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
> -                    offsetof(CPUTLBEntry, addend));
> -
> -    /* Check alignment.  We don't support inline unaligned acceses,
> -       but we can easily support overalignment checks.  */
>      if (a_bits < s_bits) {
>          a_bits = s_bits;
>      }
>
> +    /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
> +    if (use_armv6_instructions) {
> +        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
> +    } else {
> +        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R0, TCG_AREG0, mask_off);
> +        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R1, TCG_AREG0, table_off);
> +    }
> +
> +    /* Extract the tlb index from the address into R0.  */
> +    tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
> +                    SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
> +
> +    /*
> +     * Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
> +     * Load the tlb comparator into R2/R3 and the fast path addend into R1.
> +     */
> +    if (cmp_off == 0) {
> +        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> +            tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
> +        } else {
> +            tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
> +        }
> +    } else {
> +        tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
> +                        TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
> +        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> +            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
> +        } else {
> +            tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
> +        }
> +    }
> +    if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
> +        tcg_out_ld32_12(s, COND_AL, TCG_REG_R3, TCG_REG_R1, cmp_off + 4);
> +    }
> +
> +    /* Load the tlb addend.  */
> +    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
> +                    offsetof(CPUTLBEntry, addend));
> +
> +    /* Check alignment, check comparators.  */
>      if (use_armv7_instructions) {
>          tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1));
>          int rot = encode_imm(mask);
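Before the final hunk, a plain C rendering of what the rewritten fast path
computes, which may help when matching the hunk above against the generated
code in the commit message.  The env_tlb(env)->f[mmu_idx] names come from the
comment in the hunk; the helper name and everything else here is illustrative
only, not code from the tree:

    /* Sketch: compute the CPUTLBEntry address the way the emitted code does,
     * assuming the {mask, table} layout checked by the QEMU_BUILD_BUG_ONs. */
    static inline CPUTLBEntry *sketch_tlb_entry(CPUArchState *env, int mmu_idx,
                                                uint32_t addrlo)
    {
        uintptr_t mask  = env_tlb(env)->f[mmu_idx].mask;              /* ldrd -> r0 */
        uintptr_t table = (uintptr_t)env_tlb(env)->f[mmu_idx].table;  /* ldrd -> r1 */
        uintptr_t ofs   = (addrlo >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS))
                          & mask;            /* mask is pre-scaled by the entry size */
        return (CPUTLBEntry *)(table + ofs); /* the ldrd writeback leaves this in r1 */
    }

The comparator (addr_read or addr_write) then lands in r2/r3 and the addend
is reloaded into r1 over the entry pointer, which nothing needs afterwards.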
> @@ -1291,22 +1303,24 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>              tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
>                              addrlo, TCG_REG_TMP, 0);
>          }
> -        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, 0);
> +        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
>      } else {
>          if (a_bits) {
>              tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo,
>                              (1 << a_bits) - 1);
>          }
> +        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo,
> +                        SHIFT_IMM_LSR(TARGET_PAGE_BITS));
>          tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP,
> -                        0, TCG_REG_R0, TCG_REG_TMP,
> +                        0, TCG_REG_R2, TCG_REG_TMP,
>                          SHIFT_IMM_LSL(TARGET_PAGE_BITS));
>      }
>
>      if (TARGET_LONG_BITS == 64) {
> -        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, 0);
> +        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0);

This is complex and I'm probably misunderstanding something, but isn't it
possible for TCG_REG_R3 to not be set if use_armv6_instructions is true and
TARGET_LONG_BITS is 64?

Alistair
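
For anyone reading the archive later, and hedged because it is only my
reading of the backend helpers rather than anything visible in the quoted
hunk: tcg_out_ldrd_rwb() and tcg_out_ldrd_8() emit a single A32 LDRD, and
LDRD always writes the register pair Rt and Rt+1, so passing TCG_REG_R2
fills both R2 and R3 with one instruction on the armv6+ path.  The separate
32-bit load into R3 is only needed for the pre-armv6 fallback.  In outline:

    if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
        /* one LDRD: r2 = comparator low word, r3 = comparator high word */
    } else if (TARGET_LONG_BITS == 64) {
        /* two plain loads: r2 = low word, then r3 = high word at cmp_off + 4 */
    } else {
        /* 32-bit guest: only r2 is needed */
    }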

>      }
>
> -    return TCG_REG_R2;
> +    return TCG_REG_R1;
>  }
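Finally, an illustrative C sketch of the condition the comparison tail above
leaves in the flags; the helper name is made up for the sketch and this is my
reading of the code, not something from the tree:

    /* Hit iff the page number matches the comparator loaded into r2 and the
     * low a_bits of the address are zero.  armv7 folds both into one BIC+CMP;
     * older cores TST the alignment bits and compare a shifted page number.
     * For 64-bit guests the high half is checked by the conditional CMP
     * against addrhi. */
    static inline bool sketch_tlb_hit(target_ulong comparator, target_ulong addr,
                                      unsigned a_bits)
    {
        target_ulong cmp_mask = TARGET_PAGE_MASK | ((1u << a_bits) - 1);
        return (addr & cmp_mask) == comparator;
    }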
>
>  /* Record the context of a call to the out of line helper code for the slow
> --
> 2.17.1
>
>


