From: Benjamin Herrenschmidt
Subject: Re: [Qemu-ppc] [PATCH 5/5] tcg/ppc: Improve unaligned load/store handling on 64-bit backend
Date: Mon, 17 Aug 2015 18:16:59 +1000
On Mon, 2015-08-17 at 17:34 +1000, Benjamin Herrenschmidt wrote:
> Currently, we get to the slow path for any unaligned access in the
> backend, because we effectively preserve the bottom address bits
> below the alignment requirement when comparing with the TLB entry,
> so any non-0 bit there will cause the compare to fail.
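
As an aside, here is roughly what that existing fast-path compare computes,
as a minimal hand-written C sketch; it assumes 4 KiB pages and a page-aligned
TLB tag, and is not the code the backend actually emits:

#include <stdbool.h>
#include <stdint.h>

#define TARGET_PAGE_BITS 12   /* assumed 4 KiB pages for this example */

/* Old behaviour: keep the page-number bits *and* the bits below the
 * access size (s_bits), clearing only what lies in between.  The TLB
 * tag is page-aligned, so any non-zero alignment bit in addr makes the
 * compare fail and forces the slow path. */
static bool old_tlb_compare_hits(uint64_t addr, uint64_t tlb_tag, unsigned s_bits)
{
    uint64_t page_mask  = (1ULL << TARGET_PAGE_BITS) - 1;
    uint64_t align_mask = (1ULL << s_bits) - 1;
    uint64_t cmp = addr & ~(page_mask & ~align_mask);
    return cmp == tlb_tag;
}

For instance, a 4-byte load (s_bits = 2) at 0x1006 gives cmp = 0x1002, which
can never equal the page-aligned tag 0x1000, so every unaligned access ends
up on the slow path even when the hardware could have handled it.
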
Forget about this one; it was already picked up by Richard, and I forgot
about that when I did git send-email. The other four, however, are
candidates for review/merge.
Cheers.
Ben.
> For the same number of instructions, we can instead add the access
> size - 1 to the address and stick to clearing all the bottom bits.
>
> That means that normal unaligned accesses will not fall back (the HW
> will handle them fine). Only when crossing a page boundary will we
> end up with a mismatch, because we'll end up pointing to the next
> page, which cannot possibly be in that same TLB entry.
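
For illustration, the new check under the same assumptions (4 KiB pages,
page-aligned TLB tag; again a sketch, not the emitted code):

#include <stdbool.h>
#include <stdint.h>

#define TARGET_PAGE_BITS 12   /* assumed 4 KiB pages for this example */

/* New behaviour: point at the last byte of the access, then clear all
 * bits below the page.  An access contained in one page still yields
 * its own page-aligned address and matches the tag; an access spilling
 * into the next page yields the next page's address, so only that case
 * fails the compare and takes the slow path. */
static bool new_tlb_compare_hits(uint64_t addr, uint64_t tlb_tag, unsigned s_bits)
{
    uint64_t last_byte = addr + (1ULL << s_bits) - 1;
    uint64_t cmp = last_byte & ~((1ULL << TARGET_PAGE_BITS) - 1);
    return cmp == tlb_tag;
}

With the same 4-byte load: at 0x1006 the last byte is 0x1009 and cmp is
still 0x1000, so the fast path is taken; at 0x1ffe the last byte is 0x2001,
cmp becomes 0x2000 != 0x1000, and only the page-crossing case falls back.
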
>
> Signed-off-by: Benjamin Herrenschmidt <address@hidden>
> ---
> tcg/ppc/tcg-target.c | 41 +++++++++++++++++++++++++++++++----------
> 1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
> index 2b6eafa..ce8d546 100644
> --- a/tcg/ppc/tcg-target.c
> +++ b/tcg/ppc/tcg-target.c
> @@ -1361,7 +1361,7 @@ static void * const qemu_st_helpers[16] = {
>     in CR7, loads the addend of the TLB into R3, and returns the register
>     containing the guest address (zero-extended into R4).  Clobbers R0 and R2. */
> 
> -static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
> +static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
>                                 TCGReg addrlo, TCGReg addrhi,
>                                 int mem_index, bool is_read)
>  {
> @@ -1371,6 +1371,7 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
>            : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
>      int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
>      TCGReg base = TCG_AREG0;
> +    TCGMemOp s_bits = opc & MO_SIZE;
> 
>      /* Extract the page index, shifted into place for tlb index. */
>      if (TCG_TARGET_REG_BITS == 64) {
> @@ -1422,17 +1423,37 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
>         to minimize any load use delay. */
>      tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3, add_off);
> 
> -    /* Clear the non-page, non-alignment bits from the address. */
> +    /* Clear the non-page, non-alignment bits from the address */
>      if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
> +        /* We don't support unaligned accesses on 32-bits, preserve
> +         * the bottom bits and thus trigger a comparison failure on
> +         * unaligned accesses
> +         */
>          tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0,
>                      (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
> -    } else if (!s_bits) {
> -        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo,
> -                    0, 63 - TARGET_PAGE_BITS);
> +    } else if (s_bits) {
> +        /* > byte access, we need to handle alignment */
> +        if ((opc & MO_AMASK) == MO_ALIGN) {
> +            /* Alignment required by the front-end, same as 32-bits */
> +            tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
> +                        64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
> +            tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
> +        } else {
> +            /* We support unaligned accesses, we need to make sure we fail
> +             * if we cross a page boundary. The trick is to add the
> +             * access_size-1 to the address before masking the low bits.
> +             * That will make the address overflow to the next page if we
> +             * cross a page boundary which will then force a mismatch of
> +             * the TLB compare since the next page cannot possibly be in
> +             * the same TLB index.
> +             */
> +            tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, (1 << s_bits) - 1));
> +            tcg_out_rld(s, RLDICR, TCG_REG_R0, TCG_REG_R0,
> +                        0, 63 - TARGET_PAGE_BITS);
> +        }
>      } else {
> -        tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
> -                    64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
> -        tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
> +        /* Byte access, just chop off the bits below the page index */
> +        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo, 0, 63 - TARGET_PAGE_BITS);
>      }
> 
>      if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
> @@ -1592,7 +1613,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
> 
>  #ifdef CONFIG_SOFTMMU
>      mem_index = get_mmuidx(oi);
> -    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, true);
> +    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, true);
> 
>      /* Load a pointer into the current opcode w/conditional branch-link. */
>      label_ptr = s->code_ptr;
> @@ -1667,7 +1688,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
> 
>  #ifdef CONFIG_SOFTMMU
>      mem_index = get_mmuidx(oi);
> -    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, false);
> +    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, false);
> 
>      /* Load a pointer into the current opcode w/conditional branch-link. */
>      label_ptr = s->code_ptr;