Re: [PATCH v5 10/10] KVM: Dirty ring support
From: Keqian Zhu
Subject: Re: [PATCH v5 10/10] KVM: Dirty ring support
Date: Mon, 22 Mar 2021 21:37:19 +0800
User-agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Thunderbird/45.7.1
On 2021/3/11 4:33, Peter Xu wrote:
> KVM dirty ring is a new interface to pass over dirty bits from kernel to the
> userspace. Instead of using a bitmap for each memory region, the dirty ring
> contains an array of dirtied GPAs to fetch (in the form of offset in slots).
> For each vcpu there will be one dirty ring that binds to it.
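(For reference, each ring entry is a "struct kvm_dirty_gfn" as defined in the
kernel UAPI of this era -- from linux/kvm.h:

    struct kvm_dirty_gfn {
            __u32 flags;   /* KVM_DIRTY_GFN_F_DIRTY: produced by KVM;
                              KVM_DIRTY_GFN_F_RESET: marked collected by us */
            __u32 slot;    /* as_id (high 16 bits) | slot_id (low 16 bits) */
            __u64 offset;  /* page offset within the memslot */
    };

so "offset in slots" means the page offset inside the memslot identified by
the (as_id, slot_id) pair packed into "slot".)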
>
> kvm_dirty_ring_reap() is the major function to collect dirty rings. It can
> be called either by a standalone reaper thread that runs in the background,
> collecting dirty pages for the whole VM, or directly by any thread that has
> the BQL taken.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> accel/kvm/kvm-all.c | 332 ++++++++++++++++++++++++++++++++++++++++-
> accel/kvm/trace-events | 7 +
> include/hw/core/cpu.h | 8 +
> 3 files changed, 344 insertions(+), 3 deletions(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index ae9393266b2..bf2b21f038b 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -15,6 +15,7 @@
>
> #include "qemu/osdep.h"
> #include <sys/ioctl.h>
> +#include <poll.h>
>
> #include <linux/kvm.h>
>
> @@ -80,6 +81,25 @@ struct KVMParkedVcpu {
> QLIST_ENTRY(KVMParkedVcpu) node;
> };
>
> +enum KVMDirtyRingReaperState {
> + KVM_DIRTY_RING_REAPER_NONE = 0,
> + /* The reaper is sleeping */
> + KVM_DIRTY_RING_REAPER_WAIT,
> + /* The reaper is reaping for dirty pages */
> + KVM_DIRTY_RING_REAPER_REAPING,
> +};
> +
> +/*
> + * KVM reaper instance, responsible for collecting the KVM dirty bits
> + * via the dirty ring.
> + */
> +struct KVMDirtyRingReaper {
> + /* The reaper thread */
> + QemuThread reaper_thr;
> + volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
> + volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
> +};
> +
> struct KVMState
> {
> AccelState parent_obj;
> @@ -131,6 +151,7 @@ struct KVMState
> bool kvm_dirty_ring_enabled; /* Whether KVM dirty ring is enabled */
> uint64_t kvm_dirty_ring_size; /* Size of the per-vcpu dirty ring */
> uint32_t kvm_dirty_gfn_count; /* Number of dirty GFNs per ring */
> + struct KVMDirtyRingReaper reaper;
> };
>
> KVMState *kvm_state;
> @@ -392,6 +413,13 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
> goto err;
> }
>
> + if (cpu->kvm_dirty_gfns) {
> + ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
> + if (ret < 0) {
> + goto err;
> + }
> + }
> +
> vcpu = g_malloc0(sizeof(*vcpu));
> vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
> vcpu->kvm_fd = cpu->kvm_fd;
> @@ -468,6 +496,19 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
> (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
> }
>
> + if (s->kvm_dirty_ring_enabled) {
> + /* Use MAP_SHARED to share pages with the kernel */
> + cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
> + PROT_READ | PROT_WRITE, MAP_SHARED,
> + cpu->kvm_fd,
> + PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
> + if (cpu->kvm_dirty_gfns == MAP_FAILED) {
> + ret = -errno;
> + DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
> + goto err;
> + }
> + }
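(Side note: nothing here checks that the mapped size and the per-ring entry
count agree.  Assuming the intended invariant is that the ring holds exactly
kvm_dirty_gfn_count entries, a cheap sanity check could be, sketch only:

    /* the ring mapping should hold exactly kvm_dirty_gfn_count entries */
    assert(s->kvm_dirty_ring_size ==
           s->kvm_dirty_gfn_count * sizeof(struct kvm_dirty_gfn));
)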
> +
> ret = kvm_arch_init_vcpu(cpu);
> if (ret < 0) {
> error_setg_errno(errp, -ret,
> @@ -586,6 +627,11 @@ static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
> cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
> }
>
> +static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
> +{
> + memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
> +}
> +
> #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
>
> /* Allocate the dirty bitmap for a slot */
> @@ -642,6 +688,170 @@ static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
> return ret == 0;
> }
>
> +/* Should be called with all slots_lock held for the address spaces. */
> +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
> + uint32_t slot_id, uint64_t offset)
> +{
> + KVMMemoryListener *kml;
> + KVMSlot *mem;
> +
> + if (as_id >= s->nr_as) {
> + return;
> + }
> +
> + kml = s->as[as_id].ml;
> + mem = &kml->slots[slot_id];
> +
> +    if (!mem->memory_size || offset >= (mem->memory_size / TARGET_PAGE_SIZE)) {
It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.
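i.e., something like (untested):

    if (!mem->memory_size ||
        offset >= (mem->memory_size / qemu_real_host_page_size)) {
        return;
    }

since memslots (and hence the dirty ring offsets) are in host page
granularity, not target page granularity.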
> + return;
> + }
> +
> + set_bit(offset, mem->dirty_bmap);
> +}
> +
> +static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
> +{
> + return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
> +}
> +
> +static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
> +{
> + gfn->flags = KVM_DIRTY_GFN_F_RESET;
> +}
> +
> +/*
> + * Should be called with all slots_lock held for the address spaces.  It
> + * returns the number of dirty pages collected from this dirty ring.
> + */
> +static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
> +{
> + struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
> + uint32_t gfn_count = s->kvm_dirty_gfn_count;
> + uint32_t count = 0, fetch = cpu->kvm_fetch_index;
> +
> + assert(dirty_gfns && gfn_count);
> + trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
> +
> + while (true) {
> + cur = &dirty_gfns[fetch % gfn_count];
> + if (!dirty_gfn_is_dirtied(cur)) {
> + break;
> + }
> + kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
> + cur->offset);
> + dirty_gfn_set_collected(cur);
> + trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
> + fetch++;
> + count++;
> + }
> + cpu->kvm_fetch_index = fetch;
> +
> + return count;
> +}
> +
> +/* Must be with slots_lock held */
> +static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
> +{
> + int ret;
> + CPUState *cpu;
> + uint64_t total = 0;
> + int64_t stamp;
> +
> + stamp = get_clock();
> +
> + CPU_FOREACH(cpu) {
> + total += kvm_dirty_ring_reap_one(s, cpu);
> + }
> +
> + if (total) {
> + ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
> + assert(ret == total);
> + }
> +
> + stamp = get_clock() - stamp;
> +
> + if (total) {
> + trace_kvm_dirty_ring_reap(total, stamp / 1000);
> + }
> +
> + return total;
> +}
> +
> +/*
> + * Currently for simplicity, we must hold BQL before calling this.  We can
> + * consider dropping the BQL if we're clear about all the race conditions.
> + */
> +static uint64_t kvm_dirty_ring_reap(KVMState *s)
> +{
> + uint64_t total;
> +
> + /*
> + * We need to lock all kvm slots for all address spaces here,
> + * because:
> + *
> + * (1) We need to mark dirty for dirty bitmaps in multiple slots
> + * and for tons of pages, so it's better to take the lock here
> + * once rather than once per page. And more importantly,
> + *
> + * (2) We must _NOT_ publish dirty bits to the other threads
> + * (e.g., the migration thread) via the kvm memory slot dirty
> + * bitmaps before correctly re-protect those dirtied pages.
> + * Otherwise we can have potential risk of data corruption if
> + * the page data is read in the other thread before we do
> + * reset below.
> + */
> + kvm_slots_lock();
> + total = kvm_dirty_ring_reap_locked(s);
> + kvm_slots_unlock();
> +
> + return total;
> +}
> +
> +static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
> +{
> + /* No need to do anything */
> +}
> +
> +/*
> + * Kick all vcpus out in a synchronized way. When returned, we
> + * guarantee that every vcpu has been kicked and at least returned to
> + * userspace once.
> + */
> +static void kvm_cpu_synchronize_kick_all(void)
> +{
> + CPUState *cpu;
> +
> + CPU_FOREACH(cpu) {
> + run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
> + }
> +}
> +
> +/*
> + * Flush all the existing dirty pages to the KVM slot buffers. When
> + * this call returns, we guarantee that all the touched dirty pages
> + * before calling this function have been put into the per-kvmslot
> + * dirty bitmap.
> + *
> + * This function must be called with BQL held.
> + */
> +static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
The argument is not used.
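Maybe just drop it and use kvm_state directly, e.g. (sketch):

    static void kvm_dirty_ring_flush(void)
    {
        ...
        kvm_dirty_ring_reap(kvm_state);
        ...
    }

with the caller in kvm_log_sync_global() updated accordingly.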
> +{
> + trace_kvm_dirty_ring_flush(0);
> + /*
> + * The function needs to be serialized. Since this function
> + * should always be with BQL held, serialization is guaranteed.
> + * However, let's be sure of it.
> + */
> + assert(qemu_mutex_iothread_locked());
> + /*
> + * First make sure to flush the hardware buffers by kicking all
> + * vcpus out in a synchronous way.
> + */
> + kvm_cpu_synchronize_kick_all();
Can we make this function architecture-specific?
It seems that kicking vCPUs out is an architecture-specific way to flush
hardware buffers into the dirty ring (e.g. for x86 PML).
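E.g. (hypothetical name, sketch only -- not an existing QEMU hook):

    /* default: kick every vcpu so hardware buffers (e.g. PML) get flushed */
    static void kvm_arch_flush_hw_dirty_ring(void)
    {
        kvm_cpu_synchronize_kick_all();
    }

so that architectures without such hardware buffers could make it a no-op.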
> + kvm_dirty_ring_reap(kvm_state);
> + trace_kvm_dirty_ring_flush(1);
> +}
> +
> /**
> * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
> *
> @@ -1174,7 +1384,24 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
> goto out;
> }
> if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
> - kvm_slot_get_dirty_log(kvm_state, mem);
> +            /*
> +             * NOTE: We should be aware of the fact that here we're only
> +             * doing a best effort to sync dirty bits.  No matter whether
> +             * we're using dirty log or dirty ring, we ignore two facts:
> +             *
> +             * (1) dirty bits can reside in hardware buffers (PML)
> +             *
> +             * (2) after we collected dirty bits here, pages can be dirtied
> +             * again before we do the final KVM_SET_USER_MEMORY_REGION to
> +             * remove the slot.
> +             *
> +             * Not easy.  Let's cross the fingers until it's fixed.
> +             */
> + if (kvm_state->kvm_dirty_ring_enabled) {
> + kvm_dirty_ring_reap_locked(kvm_state);
> + } else {
> + kvm_slot_get_dirty_log(kvm_state, mem);
> + }
> kvm_slot_sync_dirty_pages(mem);
> }
>
> @@ -1222,6 +1449,51 @@ out:
> kvm_slots_unlock();
> }
>
> +static void *kvm_dirty_ring_reaper_thread(void *data)
> +{
> + KVMState *s = data;
> + struct KVMDirtyRingReaper *r = &s->reaper;
> +
> + rcu_register_thread();
> +
> + trace_kvm_dirty_ring_reaper("init");
> +
> + while (true) {
> + r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
> + trace_kvm_dirty_ring_reaper("wait");
> + /*
> + * TODO: provide a smarter timeout rather than a constant?
> + */
> + sleep(1);
> +
> + trace_kvm_dirty_ring_reaper("wakeup");
> + r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
> +
> + qemu_mutex_lock_iothread();
> + kvm_dirty_ring_reap(s);
> + qemu_mutex_unlock_iothread();
> +
> + r->reaper_iteration++;
> + }
I don't see when this loop exits -- the thread seems to run forever.
Also, we start this reaper thread in kvm_init(); maybe it's better to start
it when dirty logging starts and stop it when dirty logging stops.
> +
> + trace_kvm_dirty_ring_reaper("exit");
> +
> + rcu_unregister_thread();
> +
> + return NULL;
> +}
> +
> +static int kvm_dirty_ring_reaper_init(KVMState *s)
> +{
> + struct KVMDirtyRingReaper *r = &s->reaper;
> +
> + qemu_thread_create(&r->reaper_thr, "kvm-reaper",
> + kvm_dirty_ring_reaper_thread,
> + s, QEMU_THREAD_JOINABLE);
> +
> + return 0;
> +}
> +
> static void kvm_region_add(MemoryListener *listener,
> MemoryRegionSection *section)
> {
> @@ -1250,6 +1522,36 @@ static void kvm_log_sync(MemoryListener *listener,
> kvm_slots_unlock();
> }
>
> +static void kvm_log_sync_global(MemoryListener *l)
> +{
> + KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
> + KVMState *s = kvm_state;
> + KVMSlot *mem;
> + int i;
> +
> + /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
> + kvm_dirty_ring_flush(&s->reaper);
> +
> + /*
> + * TODO: make this faster when nr_slots is big while there are
> + * only a few used slots (small VMs).
> + */
> + kvm_slots_lock();
> + for (i = 0; i < s->nr_slots; i++) {
> + mem = &kml->slots[i];
> + if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
> + kvm_slot_sync_dirty_pages(mem);
> + /*
> + * This is not needed by KVM_GET_DIRTY_LOG because the
> + * ioctl will unconditionally overwrite the whole region.
> + * However kvm dirty ring has no such side effect.
> + */
> + kvm_slot_reset_dirty_pages(mem);
> + }
> + }
> + kvm_slots_unlock();
> +}
> +
> static void kvm_log_clear(MemoryListener *listener,
> MemoryRegionSection *section)
> {
> @@ -1356,10 +1658,15 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
> kml->listener.region_del = kvm_region_del;
> kml->listener.log_start = kvm_log_start;
> kml->listener.log_stop = kvm_log_stop;
> - kml->listener.log_sync = kvm_log_sync;
> - kml->listener.log_clear = kvm_log_clear;
> kml->listener.priority = 10;
>
> + if (s->kvm_dirty_ring_enabled) {
> + kml->listener.log_sync_global = kvm_log_sync_global;
> + } else {
> + kml->listener.log_sync = kvm_log_sync;
> + kml->listener.log_clear = kvm_log_clear;
> + }
> +
> memory_listener_register(&kml->listener, as);
>
> for (i = 0; i < s->nr_as; ++i) {
> @@ -2281,6 +2588,14 @@ static int kvm_init(MachineState *ms)
> ret = ram_block_discard_disable(true);
> assert(!ret);
> }
> +
> + if (s->kvm_dirty_ring_enabled) {
> + ret = kvm_dirty_ring_reaper_init(s);
> + if (ret) {
> + goto err;
> + }
> + }
> +
> return 0;
>
> err:
> @@ -2593,6 +2908,17 @@ int kvm_cpu_exec(CPUState *cpu)
> case KVM_EXIT_INTERNAL_ERROR:
> ret = kvm_handle_internal_error(cpu, run);
> break;
> + case KVM_EXIT_DIRTY_RING_FULL:
> + /*
> + * We shouldn't continue if the dirty ring of this vcpu is
> + * still full. Got kicked by KVM_RESET_DIRTY_RINGS.
> + */
> + trace_kvm_dirty_ring_full(cpu->cpu_index);
> + qemu_mutex_lock_iothread();
> + kvm_dirty_ring_reap(kvm_state);
> + qemu_mutex_unlock_iothread();
> + ret = 0;
> + break;
> case KVM_EXIT_SYSTEM_EVENT:
> switch (run->system_event.type) {
> case KVM_SYSTEM_EVENT_SHUTDOWN:
> diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
> index e15ae8980d3..72a01320a1a 100644
> --- a/accel/kvm/trace-events
> +++ b/accel/kvm/trace-events
> @@ -18,4 +18,11 @@ kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t
> kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
> kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
> kvm_resample_fd_notify(int gsi) "gsi %d"
> +kvm_dirty_ring_full(int id) "vcpu %d"
> +kvm_dirty_ring_reap_vcpu(int id) "vcpu %d"
> +kvm_dirty_ring_page(int vcpu, uint32_t slot, uint64_t offset) "vcpu %d fetch %"PRIu32" offset 0x%"PRIx64
> +kvm_dirty_ring_reaper(const char *s) "%s"
> +kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)"
> +kvm_dirty_ring_reaper_kick(const char *reason) "%s"
> +kvm_dirty_ring_flush(int finished) "%d"
>
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index c68bc3ba8af..2f0991d93f7 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -323,6 +323,11 @@ struct qemu_work_item;
> * @ignore_memory_transaction_failures: Cached copy of the MachineState
> * flag of the same name: allows the board to suppress calling of the
> * CPU do_transaction_failed hook function.
> + * @kvm_dirty_ring_full:
> + * Whether the kvm dirty ring of this vcpu is soft-full.
> + * @kvm_dirty_ring_avail:
> + * Semaphore to be posted when the kvm dirty ring of the vcpu is
> + * available again.
The doc does not match the code: the fields actually added below are
@kvm_dirty_gfns and @kvm_fetch_index.
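Maybe something like this instead, matching the fields added below (sketch):

 * @kvm_dirty_gfns: pointer to the mmap'ed KVM dirty ring of this vcpu,
 *                  shared between KVM and userspace.
 * @kvm_fetch_index: index of the next dirty ring entry to fetch.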
> *
> * State of one CPU core or thread.
> */
> @@ -394,9 +399,12 @@ struct CPUState {
> */
> uintptr_t mem_io_pc;
>
> + /* Only used in KVM */
> int kvm_fd;
> struct KVMState *kvm_state;
> struct kvm_run *kvm_run;
> + struct kvm_dirty_gfn *kvm_dirty_gfns;
> + uint32_t kvm_fetch_index;
>
> /* Used for events with 'vcpu' and *without* the 'disabled' properties */
> DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS);
>
Thanks,
Keqian