From: Nicholas Piggin
Subject: Re: [PATCH v2 9/9] spapr: implement nested-hv capability for the virtual hypervisor
Date: Wed, 16 Feb 2022 22:30:05 +1000

Excerpts from Nicholas Piggin's message of February 16, 2022 9:38 pm:
> Excerpts from Cédric Le Goater's message of February 16, 2022 8:52 pm:
>> On 2/16/22 11:25, Nicholas Piggin wrote:
>>> This implements the Nested KVM HV hcall API for spapr under TCG.
>>> 
>>> The L2 is switched in when the H_ENTER_NESTED hcall is made, and the
>>> L1 is switched back in, returning from the hcall, when an HV exception
>>> is sent to the vhyp. Register state is copied in and out according to
>>> the nested KVM HV hcall API specification.
>>> 
>>> The hdecr timer is started when the L2 is switched in, and it provides
>>> the HDEC / 0x980 return to L1.
>>> 
>>> The MMU re-uses the bare metal radix 2-level page table walker by
>>> using the get_pate method to point the MMU to the nested partition
>>> table entry. MMU faults due to partition scope errors raise HV
>>> exceptions and accordingly are routed back to the L1.
>>> 
>>> The MMU does not tag translations for the L1 (direct) vs L2 (nested)
>>> guests, so the TLB is flushed on any L1<->L2 transition (hcall entry
>>> and exit).
>>>
>>> Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>
>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> 
>> Reviewed-by: Cédric Le Goater <clg@kaod.org>
>> 
>> Some last comments below,
> 
> [...]
> 
>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>> index edbf3eeed0..852fe61b36 100644
>>> --- a/include/hw/ppc/spapr.h
>>> +++ b/include/hw/ppc/spapr.h
>>> @@ -199,6 +199,9 @@ struct SpaprMachineState {
>>>       bool has_graphics;
>>>       uint32_t vsmt;       /* Virtual SMT mode (KVM's "core stride") */
>>>   
>>> +    /* Nested HV support (TCG only) */
>>> +    uint64_t nested_ptcr;
>>> +
>> 
>> this is new state to migrate.
>> 
> 
> [...]
> 
>>> +/* Linux 64-bit powerpc pt_regs struct, used by nested HV */
>>> +struct kvmppc_pt_regs {
>>> +    uint64_t gpr[32];
>>> +    uint64_t nip;
>>> +    uint64_t msr;
>>> +    uint64_t orig_gpr3;    /* Used for restarting system calls */
>>> +    uint64_t ctr;
>>> +    uint64_t link;
>>> +    uint64_t xer;
>>> +    uint64_t ccr;
>>> +    uint64_t softe;        /* Soft enabled/disabled */
>>> +    uint64_t trap;         /* Reason for being here */
>>> +    uint64_t dar;          /* Fault registers */
>>> +    uint64_t dsisr;        /* on 4xx/Book-E used for ESR */
>>> +    uint64_t result;       /* Result of a system call */
>>> +};
>> 
>> I think we need to start moving all the spapr hcall definitions under
>> spapr_hcall.h. It can come later.
> 
> Sure.
> 
> [...]
> 
>>> diff --git a/include/hw/ppc/spapr_cpu_core.h b/include/hw/ppc/spapr_cpu_core.h
>>> index dab3dfc76c..b560514560 100644
>>> --- a/include/hw/ppc/spapr_cpu_core.h
>>> +++ b/include/hw/ppc/spapr_cpu_core.h
>>> @@ -48,6 +48,11 @@ typedef struct SpaprCpuState {
>>>       bool prod; /* not migrated, only used to improve dispatch latencies */
>>>       struct ICPState *icp;
>>>       struct XiveTCTX *tctx;
>>> +
>>> +    /* Fields for nested-HV support */
>>> +    bool in_nested; /* true while the L2 is executing */
>>> +    CPUPPCState *nested_host_state; /* holds the L1 state while L2 executes */
>>> +    int64_t nested_tb_offset; /* L1->L2 TB offset */
>> 
>> This needs a new vmstate.
> 
> How about, instead of the vmstate (we would need all the L1 state in
> nested_host_state as well), we just add a migration blocker in the
> L2 entry path? We could limit the max hdecr to, say, 1 second to
> ensure it unblocks before long.
> 
> I know migration blockers are not preferred, but in this case it gives
> us some iterations to debug and optimise first, which might change
> the data to migrate.
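
To put a number on that 1 second cap: the diff below simply clamps hdec to
tb_env->tb_freq ticks, so with the usual 512 MHz timebase on these machines
that is at most 512000000 ticks, i.e. the L2 can hold off a pending migration
for roughly a second at most before the capped hdecr fires, spapr_exit_nested()
runs and the blocker is dropped again.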

This should be roughly the incremental patch to do this.
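
For anyone not familiar with migration blockers, the lifecycle the diff relies
on is just QEMU's generic blocker API; the diff wires it into h_enter_nested()
and spapr_exit_nested(). A minimal standalone sketch (assuming the current
migrate_add_blocker()/migrate_del_blocker() signatures, with made-up function
names for the caller) would be:

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "migration/blocker.h"

    static Error *l2_blocker;

    static void l2_blocker_init(void)
    {
        /* The Error carries the human-readable reason reported to the user */
        error_setg(&l2_blocker, "an L2 guest is running");
    }

    static int l2_enter(Error **errp)
    {
        /* Fails (returns non-zero) if a migration is already in flight */
        if (migrate_add_blocker(l2_blocker, errp)) {
            return -EBUSY;  /* hypothetical caller would retry the entry later */
        }
        /* ... switch in and run the L2 until it exits back to the L1 ... */
        return 0;
    }

    static void l2_exit(void)
    {
        /* Migration may proceed again once we are back in the L1 */
        migrate_del_blocker(l2_blocker);
    }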

Thanks,
Nick

--
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 87e68da77f..14e41b7d31 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2882,6 +2882,13 @@ static void spapr_machine_init(MachineState *machine)
             "may run and log hardware error on the destination");
     }
 
+    if (spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV) == SPAPR_CAP_ON) {
+        /* Create the error string for live migration blocker */
+        error_setg(&spapr->nested_hv_migration_blocker,
+            "A nested-hv L2 guest is running. Migration is blocked until it "
+            "exits to the L1.");
+    }
+
     if (mc->nvdimm_supported) {
         spapr_create_nvdimm_dr_connectors(spapr);
     }
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index e183892287..89295bc723 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -21,6 +21,7 @@
 #include "hw/ppc/spapr_numa.h"
 #include "mmu-book3s-v3.h"
 #include "hw/mem/memory-device.h"
+#include "migration/blocker.h"
 
 bool is_ram_address(SpaprMachineState *spapr, hwaddr addr)
 {
@@ -1565,7 +1566,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
     target_ulong hv_ptr = args[0];
     target_ulong regs_ptr = args[1];
-    target_ulong hdec, now = cpu_ppc_load_tbl(env);
+    target_ulong hdec, now;
     target_ulong lpcr, lpcr_mask;
     struct kvmppc_hv_guest_state *hvstate;
     struct kvmppc_hv_guest_state hv_state;
@@ -1578,11 +1579,16 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
         return H_NOT_AVAILABLE;
     }
 
+    if (migrate_add_blocker(spapr->nested_hv_migration_blocker, NULL)) {
+        return 0; /* This returns nothing to the L1, essentially an EAGAIN */
+    }
+
     len = sizeof(*hvstate);
     hvstate = address_space_map(CPU(cpu)->as, hv_ptr, &len, false,
                                 MEMTXATTRS_UNSPECIFIED);
     if (len != sizeof(*hvstate)) {
         address_space_unmap(CPU(cpu)->as, hvstate, len, 0, false);
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
         return H_PARAMETER;
     }
 
@@ -1590,16 +1596,36 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
 
     address_space_unmap(CPU(cpu)->as, hvstate, len, len, false);
 
+    spapr_cpu->nested_tb_offset = hv_state.tb_offset;
+    spapr_cpu->nested_hdec_expiry = hv_state.hdec_expiry;
+
+    now = cpu_ppc_load_tbl(env);
+    if (now >= hv_state.hdec_expiry) {
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
+        return env->excp_vectors[POWERPC_EXCP_HDECR];
+    }
+
+    hdec = hv_state.hdec_expiry - now;
+    if (hdec > env->tb_env->tb_freq) {
+        /*
+         * Limit hdecr to 1 second to prevent the L1 blocking migration for
+         * too long with a large hdecr value.
+         */
+        hdec = env->tb_env->tb_freq;
+    }
+
     /*
      * We accept versions 1 and 2. Version 2 fields are unused because TCG
      * does not implement DAWR*.
      */
     if (hv_state.version > HV_GUEST_STATE_VERSION) {
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
         return H_PARAMETER;
     }
 
     spapr_cpu->nested_host_state = g_try_malloc(sizeof(CPUPPCState));
     if (!spapr_cpu->nested_host_state) {
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
         return H_NO_MEM;
     }
 
@@ -1611,6 +1637,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
     if (!regs || len != sizeof(*regs)) {
         address_space_unmap(CPU(cpu)->as, regs, len, 0, false);
         g_free(spapr_cpu->nested_host_state);
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
         return H_P2;
     }
 
@@ -1648,8 +1675,6 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
     /* hv_state.amor is not used */
     env->spr[SPR_DPDES] = hv_state.dpdes;
     env->spr[SPR_HFSCR] = hv_state.hfscr;
-    hdec = hv_state.hdec_expiry - now;
-    spapr_cpu->nested_tb_offset = hv_state.tb_offset;
     /* TCG does not implement DAWR*, CIABR, PURR, SPURR, IC, VTB, HEIR SPRs*/
     env->spr[SPR_SRR0] = hv_state.srr0;
     env->spr[SPR_SRR1] = hv_state.srr1;
@@ -1693,6 +1718,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
 
 void spapr_exit_nested(PowerPCCPU *cpu, int excp)
 {
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
     CPUState *cs = CPU(cpu);
     CPUPPCState *env = &cpu->env;
     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
@@ -1781,6 +1807,19 @@ void spapr_exit_nested(PowerPCCPU *cpu, int excp)
     /* Is it okay to specify write length larger than actual data written? */
     address_space_unmap(CPU(cpu)->as, regs, len, len, true);
 
+    /*
+     * hdecr is capped at entry, so we may exit here with a HDECR exception
+     * without having exceeded the guest's limit. Clear the HDECR interrupt
+     * return in this case.
+     */
+    if (excp == POWERPC_EXCP_HDECR) {
+        target_ulong now;
+        now = cpu_ppc_load_tbl(env) - spapr_cpu->nested_tb_offset;
+        if (now < spapr_cpu->nested_hdec_expiry) {
+            r3_return = 0;
+        }
+    }
+
 out_restore_l1:
     memcpy(env->gpr, spapr_cpu->nested_host_state->gpr, sizeof(env->gpr));
     env->lr = spapr_cpu->nested_host_state->lr;
@@ -1825,6 +1864,8 @@ out_restore_l1:
 
     g_free(spapr_cpu->nested_host_state);
     spapr_cpu->nested_host_state = NULL;
+
+    migrate_del_blocker(spapr->nested_hv_migration_blocker);
 }
 
 static void hypercall_register_types(void)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 852fe61b36..70b330ef9a 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -266,6 +266,7 @@ struct SpaprMachineState {
     uint32_t FORM2_assoc_array[NUMA_NODES_MAX_NUM][FORM2_NUMA_ASSOC_SIZE];
 
     Error *fwnmi_migration_blocker;
+    Error *nested_hv_migration_blocker;
 };
 
 #define H_SUCCESS         0
diff --git a/include/hw/ppc/spapr_cpu_core.h b/include/hw/ppc/spapr_cpu_core.h
index b560514560..09da577ca1 100644
--- a/include/hw/ppc/spapr_cpu_core.h
+++ b/include/hw/ppc/spapr_cpu_core.h
@@ -53,6 +53,7 @@ typedef struct SpaprCpuState {
     bool in_nested; /* true while the L2 is executing */
     CPUPPCState *nested_host_state; /* holds the L1 state while L2 executes */
     int64_t nested_tb_offset; /* L1->L2 TB offset */
+    uint64_t nested_hdec_expiry; /* L1 hdec expiry in absolute L1 TB */
 } SpaprCpuState;
 
 static inline SpaprCpuState *spapr_cpu_state(PowerPCCPU *cpu)


