[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 11/17] pseries: savevm support for pseries machine
From: Anthony Liguori
Subject: Re: [Qemu-devel] [PATCH 11/17] pseries: savevm support for pseries machine
Date: Mon, 08 Jul 2013 13:45:05 -0500
User-agent: Notmuch/0.15.2+202~g0c4b8aa (http://notmuchmail.org) Emacs/23.3.1 (x86_64-pc-linux-gnu)
Alexey Kardashevskiy <address@hidden> writes:
> From: David Gibson <address@hidden>
>
> This adds the necessary pieces to implement savevm / migration for the
> pseries machine. The most complex part here is migrating the hash
> table - for the paravirtualized pseries machine the guest's hash page
> table is not stored within guest memory, but externally and the guest
> accesses it via hypercalls.
>
> This patch uses a hypervisor reserved bit of the HPTE as a dirty bit
> (tracking changes to the HPTE itself, not the page it references).
> This is used to implement a live migration style incremental save and
> restore of the hash table contents.
>
> In addition it adds VMStateDescription information to save and restore
> the (few) remaining pieces of state information needed by the pseries
> machine.
>
> Signed-off-by: David Gibson <address@hidden>
> Signed-off-by: Alexey Kardashevskiy <address@hidden>
I vaguely recall making the suggestion to use a live section like this.
How large is the HTAB typically?
Regards,
Anthony Liguori
> ---
> hw/ppc/spapr.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++-
> hw/ppc/spapr_hcall.c | 8 +-
> include/hw/ppc/spapr.h | 12 ++-
> 3 files changed, 281 insertions(+), 8 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index def3505..f989a22 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -32,6 +32,7 @@
> #include "sysemu/cpus.h"
> #include "sysemu/kvm.h"
> #include "kvm_ppc.h"
> +#include "mmu-hash64.h"
>
> #include "hw/boards.h"
> #include "hw/ppc/ppc.h"
> @@ -667,7 +668,7 @@ static void spapr_cpu_reset(void *opaque)
>
> env->spr[SPR_HIOR] = 0;
>
> - env->external_htab = spapr->htab;
> + env->external_htab = (uint8_t *)spapr->htab;
> env->htab_base = -1;
> env->htab_mask = HTAB_SIZE(spapr) - 1;
> env->spr[SPR_SDR1] = (target_ulong)spapr->htab |
> @@ -719,6 +720,268 @@ static int spapr_vga_init(PCIBus *pci_bus)
> }
> }
>
> +static const VMStateDescription vmstate_spapr = {
> + .name = "spapr",
> + .version_id = 1,
> + .minimum_version_id = 1,
> + .minimum_version_id_old = 1,
> + .fields = (VMStateField []) {
> + VMSTATE_UINT32(next_irq, sPAPREnvironment),
> +
> + /* RTC offset */
> + VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
> +
> + VMSTATE_END_OF_LIST()
> + },
> +};
> +
> +#define HPTE(_table, _i) (void *)(((uint64_t *)(_table)) + ((_i) * 2))
> +#define HPTE_VALID(_hpte) (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
> +#define HPTE_DIRTY(_hpte) (tswap64(*((uint64_t *)(_hpte))) &
> HPTE64_V_HPTE_DIRTY)
> +#define CLEAN_HPTE(_hpte) ((*(uint64_t *)(_hpte)) &=
> tswap64(~HPTE64_V_HPTE_DIRTY))
> +
> +static int htab_save_setup(QEMUFile *f, void *opaque)
> +{
> + sPAPREnvironment *spapr = opaque;
> +
> + spapr->htab_save_index = 0;
> + spapr->htab_first_pass = true;
> +
> + /* "Iteration" header */
> + qemu_put_be32(f, spapr->htab_shift);
> +
> + return 0;
> +}
> +
> +#define MAX_ITERATION_NS 5000000 /* 5 ms */
> +
> +static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
> + int64_t max_ns)
> +{
> + int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
> + int index = spapr->htab_save_index;
> + int64_t starttime = qemu_get_clock_ns(rt_clock);
> +
> + assert(spapr->htab_first_pass);
> +
> + do {
> + int chunkstart;
> +
> + /* Consume invalid HPTEs */
> + while ((index < htabslots)
> + && !HPTE_VALID(HPTE(spapr->htab, index))) {
> + index++;
> + CLEAN_HPTE(HPTE(spapr->htab, index));
> + }
> +
> + /* Consume valid HPTEs */
> + chunkstart = index;
> + while ((index < htabslots)
> + && HPTE_VALID(HPTE(spapr->htab, index))) {
> + index++;
> + CLEAN_HPTE(HPTE(spapr->htab, index));
> + }
> +
> + if (index > chunkstart) {
> + int n_valid = index - chunkstart;
> +
> + qemu_put_be32(f, chunkstart);
> + qemu_put_be16(f, n_valid);
> + qemu_put_be16(f, 0);
> + qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
> + HASH_PTE_SIZE_64 * n_valid);
> +
> + if ((qemu_get_clock_ns(rt_clock) - starttime) > max_ns) {
> + break;
> + }
> + }
> + } while ((index < htabslots) && !qemu_file_rate_limit(f));
> +
> + if (index >= htabslots) {
> + assert(index == htabslots);
> + index = 0;
> + spapr->htab_first_pass = false;
> + }
> + spapr->htab_save_index = index;
> +}
> +
> +static bool htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
> + int64_t max_ns)
> +{
> + bool final = max_ns < 0;
> + int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
> + int examined = 0, sent = 0;
> + int index = spapr->htab_save_index;
> + int64_t starttime = qemu_get_clock_ns(rt_clock);
> +
> + assert(!spapr->htab_first_pass);
> +
> + do {
> + int chunkstart, invalidstart;
> +
> + /* Consume non-dirty HPTEs */
> + while ((index < htabslots)
> + && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
> + index++;
> + examined++;
> + }
> +
> + chunkstart = index;
> + /* Consume valid dirty HPTEs */
> + while ((index < htabslots)
> + && HPTE_DIRTY(HPTE(spapr->htab, index))
> + && HPTE_VALID(HPTE(spapr->htab, index))) {
> + CLEAN_HPTE(HPTE(spapr->htab, index));
> + index++;
> + examined++;
> + }
> +
> + invalidstart = index;
> + /* Consume invalid dirty HPTEs */
> + while ((index < htabslots)
> + && HPTE_DIRTY(HPTE(spapr->htab, index))
> + && !HPTE_VALID(HPTE(spapr->htab, index))) {
> + CLEAN_HPTE(HPTE(spapr->htab, index));
> + index++;
> + examined++;
> + }
> +
> + if (index > chunkstart) {
> + int n_valid = invalidstart - chunkstart;
> + int n_invalid = index - invalidstart;
> +
> + qemu_put_be32(f, chunkstart);
> + qemu_put_be16(f, n_valid);
> + qemu_put_be16(f, n_invalid);
> + qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
> + HASH_PTE_SIZE_64 * n_valid);
> + sent += index - chunkstart;
> +
> + if (!final && (qemu_get_clock_ns(rt_clock) - starttime) >
> max_ns) {
> + break;
> + }
> + }
> +
> + if (examined >= htabslots) {
> + break;
> + }
> +
> + if (index >= htabslots) {
> + assert(index == htabslots);
> + index = 0;
> + }
> + } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
> +
> + if (index >= htabslots) {
> + assert(index == htabslots);
> + index = 0;
> + }
> +
> + spapr->htab_save_index = index;
> +
> + return (examined >= htabslots) && (sent == 0);
> +}
> +
> +static int htab_save_iterate(QEMUFile *f, void *opaque)
> +{
> + sPAPREnvironment *spapr = opaque;
> + bool nothingleft = false;;
> +
> + /* Iteration header */
> + qemu_put_be32(f, 0);
> +
> + if (spapr->htab_first_pass) {
> + htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
> + } else {
> + nothingleft = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
> + }
> +
> + /* End marker */
> + qemu_put_be32(f, 0);
> + qemu_put_be16(f, 0);
> + qemu_put_be16(f, 0);
> +
> + return nothingleft ? 1 : 0;
> +}
> +
> +static int htab_save_complete(QEMUFile *f, void *opaque)
> +{
> + sPAPREnvironment *spapr = opaque;
> +
> + /* Iteration header */
> + qemu_put_be32(f, 0);
> +
> + htab_save_later_pass(f, spapr, -1);
> +
> + /* End marker */
> + qemu_put_be32(f, 0);
> + qemu_put_be16(f, 0);
> + qemu_put_be16(f, 0);
> +
> + return 0;
> +}
> +
> +static int htab_load(QEMUFile *f, void *opaque, int version_id)
> +{
> + sPAPREnvironment *spapr = opaque;
> + uint32_t section_hdr;
> +
> + if (version_id < 1 || version_id > 1) {
> + fprintf(stderr, "htab_load() bad version\n");
> + return -EINVAL;
> + }
> +
> + section_hdr = qemu_get_be32(f);
> +
> + if (section_hdr) {
> + /* First section, just the hash shift */
> + if (spapr->htab_shift != section_hdr) {
> + return -EINVAL;
> + }
> + return 0;
> + }
> +
> + while (true) {
> + uint32_t index;
> + uint16_t n_valid, n_invalid;
> +
> + index = qemu_get_be32(f);
> + n_valid = qemu_get_be16(f);
> + n_invalid = qemu_get_be16(f);
> +
> + if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
> + /* End of Stream */
> + break;
> + }
> +
> + if ((index + n_valid + n_invalid) >=
> + (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
> + /* Bad index in stream */
> + fprintf(stderr, "htab_load() bad index %d (%hd+%hd entries) "
> + "in htab stream\n", index, n_valid, n_invalid);
> + return -EINVAL;
> + }
> +
> + if (n_valid) {
> + qemu_get_buffer(f, HPTE(spapr->htab, index),
> + HASH_PTE_SIZE_64 * n_valid);
> + }
> + if (n_invalid) {
> + memset(HPTE(spapr->htab, index + n_valid), 0,
> + HASH_PTE_SIZE_64 * n_invalid);
> + }
> + }
> +
> + return 0;
> +}
> +
> +static SaveVMHandlers savevm_htab_handlers = {
> + .save_live_setup = htab_save_setup,
> + .save_live_iterate = htab_save_iterate,
> + .save_live_complete = htab_save_complete,
> + .load_state = htab_load,
> +};
> +
> static struct icp_state *try_create_xics(const char *type, int nr_servers,
> int nr_irqs)
> {
> @@ -987,6 +1250,10 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
>
> spapr->entry_point = 0x100;
>
> + vmstate_register(NULL, 0, &vmstate_spapr, spapr);
> + register_savevm_live(NULL, "spapr/htab", -1, 1,
> + &savevm_htab_handlers, spapr);
> +
> /* Prepare the device tree */
> spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
> initrd_base, initrd_size,
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index e6f321d..7ca984e 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -115,7 +115,7 @@ static target_ulong h_enter(PowerPCCPU *cpu,
> sPAPREnvironment *spapr,
> }
> ppc_hash64_store_hpte1(env, hpte, ptel);
> /* eieio(); FIXME: need some sort of barrier for smp? */
> - ppc_hash64_store_hpte0(env, hpte, pteh);
> + ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
>
> args[0] = pte_index + i;
> return H_SUCCESS;
> @@ -152,7 +152,7 @@ static target_ulong remove_hpte(CPUPPCState *env,
> target_ulong ptex,
> }
> *vp = v;
> *rp = r;
> - ppc_hash64_store_hpte0(env, hpte, 0);
> + ppc_hash64_store_hpte0(env, hpte, HPTE64_V_HPTE_DIRTY);
> rb = compute_tlbie_rb(v, r, ptex);
> ppc_tlb_invalidate_one(env, rb);
> return REMOVE_SUCCESS;
> @@ -282,11 +282,11 @@ static target_ulong h_protect(PowerPCCPU *cpu,
> sPAPREnvironment *spapr,
> r |= (flags << 48) & HPTE64_R_KEY_HI;
> r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO);
> rb = compute_tlbie_rb(v, r, pte_index);
> - ppc_hash64_store_hpte0(env, hpte, v & ~HPTE64_V_VALID);
> + ppc_hash64_store_hpte0(env, hpte, (v & ~HPTE64_V_VALID) |
> HPTE64_V_HPTE_DIRTY);
> ppc_tlb_invalidate_one(env, rb);
> ppc_hash64_store_hpte1(env, hpte, r);
> /* Don't need a memory barrier, due to qemu's global lock */
> - ppc_hash64_store_hpte0(env, hpte, v);
> + ppc_hash64_store_hpte0(env, hpte, v | HPTE64_V_HPTE_DIRTY);
> return H_SUCCESS;
> }
>
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 09c4570..4cfe449 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -9,6 +9,8 @@ struct sPAPRPHBState;
> struct sPAPRNVRAM;
> struct icp_state;
>
> +#define HPTE64_V_HPTE_DIRTY 0x0000000000000040ULL
> +
> typedef struct sPAPREnvironment {
> struct VIOsPAPRBus *vio_bus;
> QLIST_HEAD(, sPAPRPHBState) phbs;
> @@ -17,20 +19,24 @@ typedef struct sPAPREnvironment {
>
> hwaddr ram_limit;
> void *htab;
> - long htab_shift;
> + uint32_t htab_shift;
> hwaddr rma_size;
> int vrma_adjust;
> hwaddr fdt_addr, rtas_addr;
> long rtas_size;
> void *fdt_skel;
> target_ulong entry_point;
> - int next_irq;
> - int rtc_offset;
> + uint32_t next_irq;
> + uint64_t rtc_offset;
> char *cpu_model;
> bool has_graphics;
>
> uint32_t epow_irq;
> Notifier epow_notifier;
> +
> + /* Migration state */
> + int htab_save_index;
> + bool htab_first_pass;
> } sPAPREnvironment;
>
> #define H_SUCCESS 0
> --
> 1.7.10.4
- Re: [Qemu-devel] [PATCH 11/17] pseries: savevm support for pseries machine,
Anthony Liguori <=