From: Alex Bennée
Subject: Re: [PATCH v2] cputlb: Make store_helper less fragile to compiler optimizations
Date: Tue, 28 Jul 2020 15:56:26 +0100
User-agent: mu4e 1.5.5; emacs 28.0.50

Richard Henderson <richard.henderson@linaro.org> writes:

> On 7/24/20 12:51 PM, Shu-Chun Weng wrote:
>> There is no functional change.
>> 
>> There is a potential link error with "undefined symbol:
>> qemu_build_not_reached" due to how `store_helper` is structured.
>> This does not reproduce at current QEMU head, but was reproducible at
>> v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.
>
> Thanks for the hint -- so far I had not been able to reproduce the
> problem with any of clang 10, 11, or head (12), with default options.
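
For context, qemu_build_not_reached() is declared but never defined in
optimized builds, so every call site has to be proven dead and deleted
by the optimizer; any call that survives shows up as exactly this kind
of undefined symbol at link time. A stand-alone sketch of the technique,
with illustrative names rather than QEMU's actual declarations:

    /* Declared, never defined: a surviving call fails at link time. */
    extern void build_not_reached_demo(void);

    static inline __attribute__((always_inline))
    unsigned store_size_demo(int op)
    {
        switch (op) {
        case 1:  return 1;
        case 2:  return 2;
        case 4:  return 4;
        default:
            /* Only legal if constant propagation of 'op' through the
             * always-inline chain proves this branch unreachable. */
            build_not_reached_demo();
            return 0;
        }
    }

    int main(void)
    {
        /* At -O2 the constant 4 folds through the inline chain and the
         * default branch is deleted; if the optimizer fails to do that
         * (as clang-10's new pass manager apparently did), the call to
         * build_not_reached_demo() survives and the link fails. */
        return (int)store_size_demo(4);
    }
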
>
>> The current function structure is:
>> 
>>     inline QEMU_ALWAYSINLINE
>>     store_memop() {
>>         switch () {
>>             ...
>>         default:
>>             qemu_build_not_reached();
>>         }
>>     }
>>     inline QEMU_ALWAYSINLINE
>>     store_helper() {
>>         ...
>>         if (span_two_pages_or_io) {
>>             ...
>>             helper_ret_stb_mmu();
>>         }
>>         store_memop();
>>     }
>>     helper_ret_stb_mmu() {
>>         store_helper();
>>     }
> ...
>> The new structure is:
>> 
>>     inline QEMU_ALWAYSINLINE
>>     store_memop() {
>>         switch () {
>>             ...
>>         default:
>>             qemu_build_not_reached();
>>         }
>>     }
>>     inline QEMU_ALWAYSINLINE
>>     store_helper_size_aligned() {
>>         ...
>>         if (span_two_pages_or_io) {
>>             return false;
>>         }
>>         store_memop();
>>         return true;
>>     }
>>     inline QEMU_ALWAYSINLINE
>>     store_helper() {
>>         if (store_helper_size_aligned()) {
>>             return;
>>         }
>>         helper_ret_stb_mmu();
>>     }
>>     helper_ret_stb_mmu() {
>>         store_helper_size_aligned();
>>     }
>
> Reasonable, I guess.
>
> I did some experimenting though, and if I pull out the unaligned
> portion into a noinline function, I can save about 6k code size.
>
> Thoughts?

I think on balance I prefer having the unaligned helper out of line.
AFAICT they both perform just as well, and it must surely help to have
the uncommon case both out of the hot path and shared with the other
implementations.
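
In generic terms that is the usual fast-path/slow-path split; roughly
the following shape, as an illustrative sketch only (the real change is
in the diff below):

    #include <stdint.h>
    #include <string.h>

    /* Cold path: one out-of-line copy, shared by every inlined caller. */
    static void __attribute__((noinline, cold))
    store_slow_demo(uint8_t *p, uint64_t val, unsigned size)
    {
        for (unsigned i = 0; i < size; i++) {
            p[i] = val >> (i * 8);              /* byte-wise fallback */
        }
    }

    /* Hot path: always inlined into each size-specialized helper, so
     * only the common aligned case costs code size per caller. */
    static inline __attribute__((always_inline)) void
    store_fast_demo(uint8_t *p, uint64_t val, unsigned size)
    {
        if (__builtin_expect(((uintptr_t)p & (size - 1)) != 0, 0)) {
            store_slow_demo(p, val, size);
            return;
        }
        memcpy(p, &val, size);   /* with a constant size, one store */
    }

The uncommon case gets emitted once and every caller just branches to
it, which is presumably where the ~6k code size saving comes from.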

For this version:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

>
>
> r~
>
>
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index 5698292749..7e603d6666 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -2009,6 +2009,80 @@ store_memop(void *haddr, uint64_t val, MemOp op)
>      }
>  }
>  
> +static void __attribute__((noinline))
> +store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
> +                       uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
> +                       bool big_endian)
> +{
> +    const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
> +    uintptr_t index, index2;
> +    CPUTLBEntry *entry, *entry2;
> +    target_ulong page2, tlb_addr, tlb_addr2;
> +    TCGMemOpIdx oi;
> +    size_t size2;
> +    int i;
> +
> +    /*
> +     * Ensure the second page is in the TLB.  Note that the first page
> +     * is already guaranteed to be filled, and that the second page
> +     * cannot evict the first.
> +     */
> +    page2 = (addr + size) & TARGET_PAGE_MASK;
> +    size2 = (addr + size) & ~TARGET_PAGE_MASK;
> +    index2 = tlb_index(env, mmu_idx, page2);
> +    entry2 = tlb_entry(env, mmu_idx, page2);
> +
> +    tlb_addr2 = tlb_addr_write(entry2);
> +    if (!tlb_hit_page(tlb_addr2, page2)) {
> +        if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
> +            tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
> +                     mmu_idx, retaddr);
> +            index2 = tlb_index(env, mmu_idx, page2);
> +            entry2 = tlb_entry(env, mmu_idx, page2);
> +        }
> +        tlb_addr2 = tlb_addr_write(entry2);
> +    }
> +
> +    index = tlb_index(env, mmu_idx, addr);
> +    entry = tlb_entry(env, mmu_idx, addr);
> +    tlb_addr = tlb_addr_write(entry);
> +
> +    /*
> +     * Handle watchpoints.  Since this may trap, all checks
> +     * must happen before any store.
> +     */
> +    if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
> +        cpu_check_watchpoint(env_cpu(env), addr, size - size2,
> +                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
> +                             BP_MEM_WRITE, retaddr);
> +    }
> +    if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
> +        cpu_check_watchpoint(env_cpu(env), page2, size2,
> +                             env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
> +                             BP_MEM_WRITE, retaddr);
> +    }
> +
> +    /*
> +     * XXX: not efficient, but simple.
> +     * This loop must go in the forward direction to avoid issues
> +     * with self-modifying code in Windows 64-bit.
> +     */
> +    oi = make_memop_idx(MO_UB, mmu_idx);
> +    if (big_endian) {
> +        for (i = 0; i < size; ++i) {
> +            /* Big-endian extract.  */
> +            uint8_t val8 = val >> (((size - 1) * 8) - (i * 8));
> +            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> +        }
> +    } else {
> +        for (i = 0; i < size; ++i) {
> +            /* Little-endian extract.  */
> +            uint8_t val8 = val >> (i * 8);
> +            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> +        }
> +    }
> +}
> +
>  static inline void QEMU_ALWAYS_INLINE
>  store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>               TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
> @@ -2097,64 +2171,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>      if (size > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -        uintptr_t index2;
> -        CPUTLBEntry *entry2;
> -        target_ulong page2, tlb_addr2;
> -        size_t size2;
> -
>      do_unaligned_access:
> -        /*
> -         * Ensure the second page is in the TLB.  Note that the first page
> -         * is already guaranteed to be filled, and that the second page
> -         * cannot evict the first.
> -         */
> -        page2 = (addr + size) & TARGET_PAGE_MASK;
> -        size2 = (addr + size) & ~TARGET_PAGE_MASK;
> -        index2 = tlb_index(env, mmu_idx, page2);
> -        entry2 = tlb_entry(env, mmu_idx, page2);
> -        tlb_addr2 = tlb_addr_write(entry2);
> -        if (!tlb_hit_page(tlb_addr2, page2)) {
> -            if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
> -                tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
> -                         mmu_idx, retaddr);
> -                index2 = tlb_index(env, mmu_idx, page2);
> -                entry2 = tlb_entry(env, mmu_idx, page2);
> -            }
> -            tlb_addr2 = tlb_addr_write(entry2);
> -        }
> -
> -        /*
> -         * Handle watchpoints.  Since this may trap, all checks
> -         * must happen before any store.
> -         */
> -        if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
> -            cpu_check_watchpoint(env_cpu(env), addr, size - size2,
> -                                 env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
> -                                 BP_MEM_WRITE, retaddr);
> -        }
> -        if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
> -            cpu_check_watchpoint(env_cpu(env), page2, size2,
> -                                 env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
> -                                 BP_MEM_WRITE, retaddr);
> -        }
> -
> -        /*
> -         * XXX: not efficient, but simple.
> -         * This loop must go in the forward direction to avoid issues
> -         * with self-modifying code in Windows 64-bit.
> -         */
> -        for (i = 0; i < size; ++i) {
> -            uint8_t val8;
> -            if (memop_big_endian(op)) {
> -                /* Big-endian extract.  */
> -                val8 = val >> (((size - 1) * 8) - (i * 8));
> -            } else {
> -                /* Little-endian extract.  */
> -                val8 = val >> (i * 8);
> -            }
> -            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> -        }
> +        store_helper_unaligned(env, addr, val, retaddr, size,
> +                               mmu_idx, memop_big_endian(op));
>          return;
>      }
>  
> @@ -2162,8 +2181,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>      store_memop(haddr, val, op);
>  }
>  
> -void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
> -                        TCGMemOpIdx oi, uintptr_t retaddr)
> +void __attribute__((noinline))
> +helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
> +                   TCGMemOpIdx oi, uintptr_t retaddr)
>  {
>      store_helper(env, addr, val, oi, retaddr, MO_UB);
>  }
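
As a quick sanity check of the two byte-extract loops in
store_helper_unaligned() above, this is what they produce for a 4-byte
value (stand-alone demo, not QEMU code):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t val = 0x11223344;
        unsigned size = 4;

        for (unsigned i = 0; i < size; ++i) {
            uint8_t be = val >> (((size - 1) * 8) - (i * 8)); /* big-endian    */
            uint8_t le = val >> (i * 8);                      /* little-endian */
            printf("addr+%u: BE 0x%02x  LE 0x%02x\n", i, (unsigned)be, (unsigned)le);
        }
        /* BE stores 11 22 33 44 at addr+0..3, LE stores 44 33 22 11,
         * i.e. the same guest memory layout a wide store would have
         * produced for each endianness. */
        return 0;
    }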


-- 
Alex Bennée


