[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [Qemu-ppc] [PATCH 02/17] ppc: avoid excessive TLB flushing
From: |
Alexander Graf |
Subject: |
Re: [Qemu-devel] [Qemu-ppc] [PATCH 02/17] ppc: avoid excessive TLB flushing |
Date: |
Fri, 05 Sep 2014 09:10:01 +0200 |
User-agent: |
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Thunderbird/31.0 |
On 28.08.14 19:14, Paolo Bonzini wrote:
> PowerPC TCG flushes the TLB on every IR/DR change, which basically
> means on every user<->kernel context switch. Use the 6-element
> TLB array as a cache, where each MMU index is mapped to a different
> state of the IR/DR/PR/HV bits.
>
> This brings the number of TLB flushes down from ~900000 to ~50000
> for starting up the Debian installer, which is in line with x86
> and gives a ~10% performance improvement.
>
> Signed-off-by: Paolo Bonzini <address@hidden>
> ---
> cputlb.c | 19 +++++++++++++++++
> hw/ppc/spapr_hcall.c | 6 +++++-
> include/exec/exec-all.h | 5 +++++
> target-ppc/cpu.h | 4 +++-
> target-ppc/excp_helper.c | 6 +-----
> target-ppc/helper_regs.h | 52
> +++++++++++++++++++++++++++++++--------------
> target-ppc/translate_init.c | 5 +++++
> 7 files changed, 74 insertions(+), 23 deletions(-)
>
> diff --git a/cputlb.c b/cputlb.c
> index afd3705..17e1b03 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -67,6 +67,25 @@ void tlb_flush(CPUState *cpu, int flush_global)
> tlb_flush_count++;
> }
>
> +void tlb_flush_idx(CPUState *cpu, int mmu_idx)
> +{
> + CPUArchState *env = cpu->env_ptr;
> +
> +#if defined(DEBUG_TLB)
> + printf("tlb_flush_idx %d:\n", mmu_idx);
> +#endif
> + /* must reset current TB so that interrupts cannot modify the
> + links while we are modifying them */
> + cpu->current_tb = NULL;
> +
> + memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[mmu_idx]));
> + memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
> +
> + env->tlb_flush_addr = -1;
> + env->tlb_flush_mask = 0;
> + tlb_flush_count++;
> +}
> +
> static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
> {
> if (addr == (tlb_entry->addr_read &
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 467858c..b95961c 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -556,13 +556,17 @@ static target_ulong h_cede(PowerPCCPU *cpu,
> sPAPREnvironment *spapr,
> {
> CPUPPCState *env = &cpu->env;
> CPUState *cs = CPU(cpu);
> + bool flush;
>
> env->msr |= (1ULL << MSR_EE);
> - hreg_compute_hflags(env);
> + flush = hreg_compute_hflags(env);
> if (!cpu_has_work(cs)) {
> cs->halted = 1;
> cs->exception_index = EXCP_HLT;
> cs->exit_request = 1;
> + } else if (flush) {
> + cs->interrupt_request |= CPU_INTERRUPT_EXITTB;
> + cs->exit_request = 1;
Can this ever happen?
> }
> return H_SUCCESS;
> }
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 5e5d86e..629a550 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -100,6 +100,7 @@ void tcg_cpu_address_space_init(CPUState *cpu,
> AddressSpace *as);
> /* cputlb.c */
> void tlb_flush_page(CPUState *cpu, target_ulong addr);
> void tlb_flush(CPUState *cpu, int flush_global);
> +void tlb_flush_idx(CPUState *cpu, int mmu_idx);
> void tlb_set_page(CPUState *cpu, target_ulong vaddr,
> hwaddr paddr, int prot,
> int mmu_idx, target_ulong size);
> @@ -112,6 +113,10 @@ static inline void tlb_flush_page(CPUState *cpu,
> target_ulong addr)
> static inline void tlb_flush(CPUState *cpu, int flush_global)
> {
> }
> +
> +static inline void tlb_flush_idx(CPUState *cpu, int mmu_idx)
> +{
> +}
> #endif
>
> #define CODE_GEN_ALIGN 16 /* must be >= of the size of a icache
> line */
> diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
> index b64c652..c1cb27f 100644
> --- a/target-ppc/cpu.h
> +++ b/target-ppc/cpu.h
> @@ -922,7 +922,7 @@ struct ppc_segment_page_sizes {
>
>
> /*****************************************************************************/
> /* The whole PowerPC CPU context */
> -#define NB_MMU_MODES 3
> +#define NB_MMU_MODES 6
>
> #define PPC_CPU_OPCODES_LEN 0x40
>
> @@ -1085,6 +1085,8 @@ struct CPUPPCState {
> target_ulong hflags; /* hflags is a MSR & HFLAGS_MASK */
> target_ulong hflags_nmsr; /* specific hflags, not coming from MSR */
> int mmu_idx; /* precomputed MMU index to speed up mem accesses */
> + uint32_t mmu_msr[NB_MMU_MODES]; /* ir/dr/hv/pr values for TLBs */
> + int mmu_fifo; /* for replacement in mmu_msr */
>
> /* Power management */
> int (*check_pow)(CPUPPCState *env);
> diff --git a/target-ppc/excp_helper.c b/target-ppc/excp_helper.c
> index be71590..bf25d44 100644
> --- a/target-ppc/excp_helper.c
> +++ b/target-ppc/excp_helper.c
> @@ -623,9 +623,6 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int
> excp_model, int excp)
>
> if (env->spr[SPR_LPCR] & LPCR_AIL) {
> new_msr |= (1 << MSR_IR) | (1 << MSR_DR);
> - } else if (msr & ((1 << MSR_IR) | (1 << MSR_DR))) {
> - /* If we disactivated any translation, flush TLBs */
> - tlb_flush(cs, 1);
> }
>
> #ifdef TARGET_PPC64
> @@ -678,8 +675,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int
> excp_model, int excp)
> if ((env->mmu_model == POWERPC_MMU_BOOKE) ||
> (env->mmu_model == POWERPC_MMU_BOOKE206)) {
> /* XXX: The BookE changes address space when switching modes,
> - we should probably implement that as different MMU indexes,
> - but for the moment we do it the slow way and flush all. */
> + TODO: still needed?!? */
> tlb_flush(cs, 1);
> }
> }
> diff --git a/target-ppc/helper_regs.h b/target-ppc/helper_regs.h
> index 271fddf..291f9c1 100644
> --- a/target-ppc/helper_regs.h
> +++ b/target-ppc/helper_regs.h
> @@ -39,17 +39,38 @@ static inline void hreg_swap_gpr_tgpr(CPUPPCState *env)
> env->tgpr[3] = tmp;
> }
>
> -static inline void hreg_compute_mem_idx(CPUPPCState *env)
> +static inline bool hreg_compute_mem_idx(CPUPPCState *env)
> {
> - /* Precompute MMU index */
> - if (msr_pr == 0 && msr_hv != 0) {
> - env->mmu_idx = 2;
> - } else {
> - env->mmu_idx = 1 - msr_pr;
> + CPUState *cs = CPU(ppc_env_get_cpu(env));
> + int msr = env->msr;
> + int i;
> +
> + if (!tcg_enabled()) {
> + return false;
> + }
> +
> + msr &= (1 << MSR_IR) | (1 << MSR_DR) | (1 << MSR_PR) | MSR_HVB;
> + if (msr_pr == 1) {
> + msr &= ~MSR_HVB;
> }
> +
> + for (i = 0; i < NB_MMU_MODES; i++) {
> + if (env->mmu_msr[i] == msr) {
> + env->mmu_idx = i;
> + return false;
> + }
> + }
> +
> + /* Use a new index with FIFO replacement. */
> + i = (env->mmu_fifo == NB_MMU_MODES - 1 ? 0 : env->mmu_fifo + 1);
> + env->mmu_fifo = i;
> + env->mmu_msr[i] = msr;
> + env->mmu_idx = i;
> + tlb_flush_idx(cs, i);
> + return true;
> }
Ok, so this basically changes the semantics of mmu_idx from a static
array with predefined meanings to a dynamic array with runtime changing
semantics.
The first thing that comes to mind here is why we're not just extending
the existing array? After all, we have 4 bits -> 16 states minus one for
PR+HV. Can our existing logic not deal with this?
Second thing I'm failing to grasp still is that in the previous patch
you're changing ctx.mem_idx to different static semantics. But that
mem_idx gets passed to our ld/st helpers which again boils down to the
mem_idx above, no? So aren't we accessing random unrelated mmu contexts now?
There's a good chance I'm not fully grasping something here :).
Alex
- Re: [Qemu-devel] [Qemu-ppc] [PATCH 02/17] ppc: avoid excessive TLB flushing,
Alexander Graf <=