qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[RFC PATCH 1/1] Dirty quota-based throttling of vcpus


From: Shivam Kumar
Subject: [RFC PATCH 1/1] Dirty quota-based throttling of vcpus
Date: Sun, 20 Nov 2022 22:54:59 +0000

Introduces a (new) throttling scheme where QEMU defines a limit on the dirty
rate of each vcpu of the VM. This limit is enforced on the vcpus in small
intervals (dirty quota intervals) by allowing the vcpus to dirty only as many
pages in these intervals as to maintain a dirty rate below the set limit.

Suggested-by: Shaju Abraham <shaju.abraham@nutanix.com>
Suggested-by: Manish Mishra <manish.mishra@nutanix.com>
Co-developed-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
Signed-off-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com>
---
 accel/kvm/kvm-all.c       | 91 +++++++++++++++++++++++++++++++++++++++
 include/exec/memory.h     |  3 ++
 include/hw/core/cpu.h     |  5 +++
 include/sysemu/kvm_int.h  |  1 +
 linux-headers/linux/kvm.h |  9 ++++
 migration/migration.c     | 22 ++++++++++
 migration/migration.h     | 31 +++++++++++++
 softmmu/memory.c          | 64 +++++++++++++++++++++++++++
 8 files changed, 226 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f99b0becd8..ea50605592 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -46,6 +46,8 @@
 #include "sysemu/hw_accel.h"
 #include "kvm-cpus.h"
 #include "sysemu/dirtylimit.h"
+#include "hw/core/cpu.h"
+#include "migration/migration.h"
 
 #include "hw/boards.h"
 #include "monitor/stats.h"
@@ -2463,6 +2465,8 @@ static int kvm_init(MachineState *ms)
         }
     }
 
+    s->dirty_quota_supported = kvm_vm_check_extension(s, KVM_CAP_DIRTY_QUOTA);
+
     /*
      * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
      * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
@@ -2808,6 +2812,88 @@ static void kvm_eat_signals(CPUState *cpu)
     } while (sigismember(&chkset, SIG_IPI));
 }
 
+static void handle_dirty_quota_sleep(int64_t sleep_time)
+{
+    /* Do not throttle the vcpu more than the maximum throttle. */
+    sleep_time = MIN(sleep_time,
+                        DIRTY_QUOTA_MAX_THROTTLE * DIRTY_QUOTA_INTERVAL_SIZE);
+    /* Convert sleep time from nanoseconds to microseconds. */
+    g_usleep(sleep_time / 1000);
+}
+
+static uint64_t handle_dirty_quota_exhausted(
+                    CPUState *cpu, const uint64_t count, const uint64_t quota)
+{
+    MigrationState *s = migrate_get_current();
+    uint64_t time_to_sleep;
+    int64_t unclaimed_quota;
+    int64_t dirty_quota_overflow = (count - quota);
+    uint64_t dirty_rate_limit = qatomic_read(&s->per_vcpu_dirty_rate_limit);
+    uint64_t new_quota = (dirty_rate_limit * DIRTY_QUOTA_INTERVAL_SIZE) /
+                                                        NANOSECONDS_PER_SECOND;
+    uint64_t current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+
+    /* Penalize the vCPU if it dirtied more pages than it was allowed to. */
+    if (dirty_quota_overflow > 0) {
+        time_to_sleep = (dirty_quota_overflow * NANOSECONDS_PER_SECOND) /
+                                                            dirty_rate_limit;
+        cpu->dirty_quota_expiry_time = current_time + time_to_sleep;
+        return time_to_sleep;
+    }
+
+    /*
+     * If the current dirty quota interval hasn't ended, try using common quota
+     * if it is available, else sleep.
+     */
+    current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    if (current_time < cpu->dirty_quota_expiry_time) {
+        qemu_spin_lock(&s->common_dirty_quota_lock);
+        if (s->common_dirty_quota > 0) {
+            s->common_dirty_quota -= new_quota;
+            qemu_spin_unlock(&s->common_dirty_quota_lock);
+            cpu->kvm_run->dirty_quota = count + new_quota;
+            return 0;
+        }
+
+        qemu_spin_unlock(&s->common_dirty_quota_lock);
+        current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+        /* If common quota isn't available, sleep for the remaining interval. */
+        if (current_time < cpu->dirty_quota_expiry_time) {
+            time_to_sleep = cpu->dirty_quota_expiry_time - current_time;
+            return time_to_sleep;
+        }
+    }
+
+    /*
+     * This is a fresh dirty quota interval. If the vcpu has not claimed its
+     * quota for the previous intervals, add them to the common quota.
+     */
+    current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    unclaimed_quota = (current_time - cpu->dirty_quota_expiry_time) *
+                        dirty_rate_limit;
+    qemu_spin_lock(&s->common_dirty_quota_lock);
+    s->common_dirty_quota += unclaimed_quota;
+    qemu_spin_unlock(&s->common_dirty_quota_lock);
+
+    /*  Allocate the vcpu this new interval's dirty quota. */
+    cpu->kvm_run->dirty_quota = count + new_quota;
+    cpu->dirty_quota_expiry_time = current_time + DIRTY_QUOTA_INTERVAL_SIZE;
+    return 0;
+}
+
+
+static void handle_kvm_exit_dirty_quota_exhausted(CPUState *cpu,
+                                    const uint64_t count, const uint64_t quota)
+{
+    uint64_t time_to_sleep;
+    do {
+        time_to_sleep = handle_dirty_quota_exhausted(cpu, count, quota);
+        if (time_to_sleep > 0) {
+            handle_dirty_quota_sleep(time_to_sleep);
+        }
+    } while (time_to_sleep != 0);
+}
+
 int kvm_cpu_exec(CPUState *cpu)
 {
     struct kvm_run *run = cpu->kvm_run;
@@ -2943,6 +3029,11 @@ int kvm_cpu_exec(CPUState *cpu)
             dirtylimit_vcpu_execute(cpu);
             ret = 0;
             break;
+        case KVM_EXIT_DIRTY_QUOTA_EXHAUSTED:
+            handle_kvm_exit_dirty_quota_exhausted(cpu,
+                    run->dirty_quota_exit.count, run->dirty_quota_exit.quota);
+            ret = 0;
+            break;
         case KVM_EXIT_SYSTEM_EVENT:
             switch (run->system_event.type) {
             case KVM_SYSTEM_EVENT_SHUTDOWN:
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 91f8a2395a..becd0144a0 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -3009,6 +3009,9 @@ bool ram_block_discard_is_disabled(void);
  */
 bool ram_block_discard_is_required(void);
 
+void dirty_quota_migration_start(void);
+void dirty_quota_migration_stop(void);
+
 #endif
 
 #endif
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 8830546121..7c5543849a 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -36,6 +36,9 @@
 typedef int (*WriteCoreDumpFunction)(const void *buf, size_t size,
                                      void *opaque);
 
+#define DIRTY_QUOTA_INTERVAL_SIZE 10000000
+#define DIRTY_QUOTA_MAX_THROTTLE .99
+
 /**
  * SECTION:cpu
  * @section_id: QEMU-cpu
@@ -443,6 +446,8 @@ struct CPUState {
 
     /* track IOMMUs whose translations we've cached in the TCG TLB */
     GArray *iommu_notifiers;
+
+    uint64_t dirty_quota_expiry_time;
 };
 
 typedef QTAILQ_HEAD(CPUTailQ, CPUState) CPUTailQ;
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index 3b4adcdc10..51e3df18c7 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -110,6 +110,7 @@ struct KVMState
     struct KVMDirtyRingReaper reaper;
     NotifyVmexitOption notify_vmexit;
     uint32_t notify_window;
+    bool dirty_quota_supported; /* Whether KVM supports dirty quota or not */
 };
 
 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index ebdafa576d..bc1d308afd 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -272,6 +272,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_RISCV_SBI        35
 #define KVM_EXIT_RISCV_CSR        36
 #define KVM_EXIT_NOTIFY           37
+#define KVM_EXIT_DIRTY_QUOTA_EXHAUSTED 38
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -508,6 +509,11 @@ struct kvm_run {
 #define KVM_NOTIFY_CONTEXT_INVALID     (1 << 0)
                        __u32 flags;
                } notify;
+               /* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
+               struct {
+                       __u64 count;
+                       __u64 quota;
+               } dirty_quota_exit;
                /* Fix the size of the union. */
                char padding[256];
        };
@@ -529,6 +535,8 @@ struct kvm_run {
                struct kvm_sync_regs regs;
                char padding[SYNC_REGS_SIZE_BYTES];
        } s;
+
+       __u64 dirty_quota;
 };
 
 /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
@@ -1175,6 +1183,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
+#define KVM_CAP_DIRTY_QUOTA 224
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..b94f636f08 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -61,6 +61,8 @@
 #include "sysemu/cpus.h"
 #include "yank_functions.h"
 #include "sysemu/qtest.h"
+#include "hw/core/cpu.h"
+#include "sysemu/kvm_int.h"
 
 #define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */
 
@@ -3685,8 +3687,11 @@ static void migration_update_counters(MigrationState *s,
                                       int64_t current_time)
 {
     uint64_t transferred, transferred_pages, time_spent;
+    uint64_t pages_transferred_since_last_update, time_spent_since_last_update;
     uint64_t current_bytes; /* bytes transferred since the beginning */
     double bandwidth;
+    CPUState *cpu;
+    uint32_t nr_cpus = 0; /* counted below via CPU_FOREACH */
 
     if (current_time < s->iteration_start_time + BUFFER_DELAY) {
         return;
@@ -3706,6 +3711,23 @@ static void migration_update_counters(MigrationState *s,
     s->pages_per_second = (double) transferred_pages /
                              (((double) time_spent / 1000.0));
 
+    if (kvm_state->dirty_quota_supported) {
+        CPU_FOREACH(cpu) {
+            nr_cpus++;
+        }
+        pages_transferred_since_last_update = transferred_pages -
+                                    s->last_counters_update.transferred_pages;
+        time_spent_since_last_update = time_spent -
+                                    s->last_counters_update.time_spent;
+        qatomic_set(&s->per_vcpu_dirty_rate_limit,
+            ((double) pages_transferred_since_last_update) /
+            (((double) time_spent_since_last_update) / 1000.0) /
+            ((double) nr_cpus));
+
+        s->last_counters_update.transferred_pages = transferred_pages;
+        s->last_counters_update.time_spent = time_spent;
+    }
+
     /*
      * if we haven't sent anything, we don't want to
      * recalculate. 10000 is a small enough number for our purposes
diff --git a/migration/migration.h b/migration/migration.h
index cdad8aceaa..66c680b81c 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -249,6 +249,15 @@ struct MigrationState {
     uint64_t iteration_initial_bytes;
     /* time at the start of current iteration */
     int64_t iteration_start_time;
+
+    /* state related to last migration counters update */
+    struct {
+        /* time spent from the start of iteration till the last update */
+        uint64_t time_spent;
+        /* pages already sent in the current iteration till the last update */
+        uint64_t transferred_pages;
+    } last_counters_update;
+
     /*
      * The final stage happens when the remaining data is smaller than
      * this threshold; it's calculated from the requested downtime and
@@ -373,6 +382,28 @@ struct MigrationState {
      * This save hostname when out-going migration starts
      */
     char *hostname;
+
+    /*
+     * Dirty quota throttling tries to limit the dirty rate of the guest to some
+     * factor of network throughput. This factor is dirty_quota_throttle_ratio.
+     */
+    double dirty_quota_throttle_ratio;
+
+    /*
+     * For dirty quota throttling, this is the limit on the dirty rate of the
+     * vcpus. There may be exceptions where this limit might be enforced loosely
+     * to avoid overthrottling of the vcpus.
+     */
+    uint64_t per_vcpu_dirty_rate_limit;
+
+    /*
+     * If a vcpu doesn't claim its dirty quota for a given dirty quota interval,
+     * the unclaimed quota gets added to common quota.
+     * Common dirty quota can be claimed by any vcpu which has already used its
+     * individual dirty quota for the current dirty quota interval.
+     */
+    QemuSpin common_dirty_quota_lock;
+    int64_t common_dirty_quota;
 };
 
 void migrate_set_state(int *state, int old_state, int new_state);
diff --git a/softmmu/memory.c b/softmmu/memory.c
index bc0be3f62c..8f725a9b89 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -12,6 +12,7 @@
  * Contributions after 2012-01-13 are licensed under the terms of the
  * GNU GPL, version 2 or (at your option) any later version.
  */
+#include <linux/kvm.h>
 
 #include "qemu/osdep.h"
 #include "qemu/log.h"
@@ -34,6 +35,10 @@
 #include "hw/boards.h"
 #include "migration/vmstate.h"
 #include "exec/address-spaces.h"
+#include "hw/core/cpu.h"
+#include "exec/target_page.h"
+#include "migration/migration.h"
+#include "sysemu/kvm_int.h"
 
 //#define DEBUG_UNASSIGNED
 
@@ -2869,6 +2874,46 @@ static unsigned int postponed_stop_flags;
 static VMChangeStateEntry *vmstate_change;
 static void memory_global_dirty_log_stop_postponed_run(void);
 
+static void init_vcpu_dirty_quota(CPUState *cpu, run_on_cpu_data arg)
+{
+    uint64_t current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    cpu->kvm_run->dirty_quota = 1;
+    cpu->dirty_quota_expiry_time = current_time;
+}
+
+void dirty_quota_migration_start(void)
+{
+    if (!kvm_state->dirty_quota_supported) {
+        return;
+    }
+
+    MigrationState *s = migrate_get_current();
+    /* Assume an initial bandwidth of 1 Gbps (125 MBps). */
+    double pages_per_second = (((double) 1e9) / 8.0) /
+                                    (double) qemu_target_page_size();
+    uint32_t nr_cpus = 0; /* counted below via CPU_FOREACH */
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        nr_cpus++;
+    }
+    /*
+     * Currently we are hardcoding this to 2. There are plans to allow the user
+     * to manually select this ratio.
+     */
+    s->dirty_quota_throttle_ratio = 2;
+    qatomic_set(&s->per_vcpu_dirty_rate_limit,
+                pages_per_second / s->dirty_quota_throttle_ratio / nr_cpus);
+
+    qemu_spin_lock(&s->common_dirty_quota_lock);
+    s->common_dirty_quota = 0;
+    qemu_spin_unlock(&s->common_dirty_quota_lock);
+
+    CPU_FOREACH(cpu) {
+        run_on_cpu(cpu, init_vcpu_dirty_quota, RUN_ON_CPU_NULL);
+    }
+}
+
 void memory_global_dirty_log_start(unsigned int flags)
 {
     unsigned int old_flags;
@@ -2891,6 +2936,7 @@ void memory_global_dirty_log_start(unsigned int flags)
     trace_global_dirty_changed(global_dirty_tracking);
 
     if (!old_flags) {
+        dirty_quota_migration_start();
         MEMORY_LISTENER_CALL_GLOBAL(log_global_start, Forward);
         memory_region_transaction_begin();
         memory_region_update_pending = true;
@@ -2898,6 +2944,23 @@ void memory_global_dirty_log_start(unsigned int flags)
     }
 }
 
+static void reset_vcpu_dirty_quota(CPUState *cpu, run_on_cpu_data arg)
+{
+    cpu->kvm_run->dirty_quota = 0;
+}
+
+void dirty_quota_migration_stop(void)
+{
+    if (!kvm_state->dirty_quota_supported) {
+        return;
+    }
+
+    CPUState *cpu;
+    CPU_FOREACH(cpu) {
+        run_on_cpu(cpu, reset_vcpu_dirty_quota, RUN_ON_CPU_NULL);
+    }
+}
+
 static void memory_global_dirty_log_do_stop(unsigned int flags)
 {
     assert(flags && !(flags & (~GLOBAL_DIRTY_MASK)));
@@ -2907,6 +2970,7 @@ static void memory_global_dirty_log_do_stop(unsigned int flags)
     trace_global_dirty_changed(global_dirty_tracking);
 
     if (!global_dirty_tracking) {
+        dirty_quota_migration_stop();
         memory_region_transaction_begin();
         memory_region_update_pending = true;
         memory_region_transaction_commit();
-- 
2.22.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]