Re: [Qemu-devel] [PATCH 2/2] migration: calculate downtime on dst side


From: Alexey Perevalov
Subject: Re: [Qemu-devel] [PATCH 2/2] migration: calculate downtime on dst side
Date: Wed, 05 Apr 2017 17:31:59 +0300
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.8.0

On 04/04/2017 10:01 PM, Dr. David Alan Gilbert wrote:
* Alexey Perevalov (address@hidden) wrote:
This patch provides downtime calculation per vCPU,
both as a per-vCPU summary and as an overlapped value across all vCPUs.

The approach keeps a tree with the page fault address as the key, and
the t1-t2 interval between page fault time and page copy time, together
with the affected vCPU bit mask, as the value.
For more implementation details, please see the comment above the
get_postcopy_total_downtime function.

Signed-off-by: Alexey Perevalov <address@hidden>
---
  include/migration/migration.h |  11 ++
  migration/migration.c         | 238 +++++++++++++++++++++++++++++++++++++++++-
  migration/postcopy-ram.c      |  61 ++++++++++-
  migration/savevm.c            |   2 +
  migration/trace-events        |  10 +-
  5 files changed, 318 insertions(+), 4 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 5720c88..8f9af77 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -123,10 +123,21 @@ struct MigrationIncomingState {
/* See savevm.c */
      LoadStateEntry_Head loadvm_handlers;
+
+    /*
+     *  Tree for keeping postcopy downtime; necessary to calculate
+     *  the correct downtime across multiple vm suspends. It keeps
+     *  the host page address as the key and a DowntimeDuration as
+     *  the data.
+     */
+    GTree *postcopy_downtime;
  };
MigrationIncomingState *migration_incoming_get_current(void);
  void migration_incoming_state_destroy(void);
+void mark_postcopy_downtime_begin(uint64_t addr, int cpu);
+void mark_postcopy_downtime_end(uint64_t addr);
+int64_t get_postcopy_total_downtime(void);
/*
   * An outstanding page request, on the source, having been received
diff --git a/migration/migration.c b/migration/migration.c
index 54060f7..57d71e1 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -77,6 +77,12 @@ static NotifierList migration_state_notifiers =
  static bool deferred_incoming;
+typedef struct {
+    int64_t begin;
+    int64_t end;
+    uint64_t cpus;
+} DowntimeDuration;
+
  /*
   * Current state of incoming postcopy; note this is not part of
   * MigrationIncomingState since it's state is used during cleanup
@@ -117,6 +123,21 @@ MigrationState *migrate_get_current(void)
      return &current_migration;
  }
+static gint addr_compare(gconstpointer a, gconstpointer b,
+                                 gpointer user_data G_GNUC_UNUSED)
+{
+    if (a == b)
+        return 0;
+    else if (a > b)
+        return 1;
+    return -1;
+}
Weird that glib doesn't have that builtin?
g_int64_equal would be suitable, but it operates on the value behind the
pointer, and I used the pointer itself as the key. It's not a problem to
store the faulted page address as a pointed-to value instead, but in that
case I would have to choose g_int_equal or g_int64_equal depending on
QEMU's host architecture, or always rely on int64 matching the pointer
size, as the kernel does in uffd_msg.


+static void destroy_downtime_duration(gpointer data)
+{
+    free(data);
    g_free

+}
+
  MigrationIncomingState *migration_incoming_get_current(void)
  {
      static bool once;
@@ -128,6 +149,9 @@ MigrationIncomingState *migration_incoming_get_current(void)
          QLIST_INIT(&mis_current.loadvm_handlers);
          qemu_mutex_init(&mis_current.rp_mutex);
          qemu_event_init(&mis_current.main_thread_load_event, false);
+        mis_current.postcopy_downtime = g_tree_new_full(addr_compare,
+                                             NULL, NULL,
+                                             destroy_downtime_duration);
          once = true;
      }
      return &mis_current;
@@ -138,10 +162,13 @@ void migration_incoming_state_destroy(void)
      struct MigrationIncomingState *mis = migration_incoming_get_current();
qemu_event_destroy(&mis->main_thread_load_event);
+    if (mis->postcopy_downtime) {
+        g_tree_destroy(mis->postcopy_downtime);
+        mis->postcopy_downtime = NULL;
+    }
      loadvm_free_handlers(mis);
  }
-
  typedef struct {
      bool optional;
      uint32_t size;
@@ -1119,7 +1146,6 @@ MigrationState *migrate_init(const MigrationParams *params)
      s->last_req_rb = NULL;
      error_free(s->error);
      s->error = NULL;
-
      migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
QSIMPLEQ_INIT(&s->src_page_requests);
@@ -2109,3 +2135,211 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
      return atomic_xchg(&incoming_postcopy_state, new_state);
  }
+void mark_postcopy_downtime_begin(uint64_t addr, int cpu)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    DowntimeDuration *dd;
+    if (!mis || !mis->postcopy_downtime) {
+        error_report("Migration incoming state should exists mis %p", mis);
+        return;
+    }
+
+    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr); /* !!! cast */
+    if (!dd) {
+        dd = (DowntimeDuration *)g_malloc0(sizeof(DowntimeDuration));
g_new0 is quite nice to avoid the sizeof.
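For reference, a minimal sketch of the g_new0 form (same zeroed
allocation, no sizeof or cast, still freed with g_free()):

        dd = g_new0(DowntimeDuration, 1);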

+        g_tree_insert(mis->postcopy_downtime, (gpointer)addr, (gpointer)dd);
+    }
+
+    if (cpu < 0)
+        /* assume in this situation all vCPUs are sleeping */
+        dd->cpus = ~0u;
+    else
+        set_bit(cpu, &dd->cpus);
+
+    /*
+     *  overwrite previously set dd->begin, if that page already was
+     *     faulted on another cpu
+     */
+    dd->begin = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    trace_mark_postcopy_downtime_begin(addr, dd, dd->begin, cpu);
+}
+
+void mark_postcopy_downtime_end(uint64_t addr)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    DowntimeDuration *dd;
+    if (!mis || !mis->postcopy_downtime) {
+        error_report("Migration incoming state should exists mis %p", mis);
I don't think you need to check for mis not existing, the code always returns
it from migration_incoming_get_current.

+        return;
+    }
+
+    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
+    if (!dd) {
+        /* error_report("Could not populate downtime duration completion time \n\
+                        There is no downtime duration for 0x%"PRIx64, addr); */
It's probably worth keeping a count of these so you can tell - otherwise
you could be losing counts and never know.
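A minimal sketch of that idea, assuming a hypothetical
postcopy_downtime_missed counter added to MigrationIncomingState (not an
existing field):

      dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
      if (!dd) {
          /* hypothetical counter so silently lost completions stay visible */
          mis->postcopy_downtime_missed++;
          return;
      }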

+        return;
+    }
+
+    dd->end = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    trace_mark_postcopy_downtime_end(addr, dd, dd->end);
+}
+
+typedef struct {
+    int64_t tp; /* point in time */
+    bool is_end;
+    int64_t cpus; /* cpus bit mask */
Ah! Useful comment that the DowntimeDuration didn't have.
Note that 64 isn't enough - there is already support for somewhere
over 200 vCPU VMs.
agree
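One possible direction (sketch only, using QEMU's bitmap helpers; not
what this patch does, and downtime_duration_new is just an illustrative
helper) would be to size the mask from smp_cpus instead of a fixed
64-bit field:

  #include "qemu/bitmap.h"

  typedef struct {
      int64_t begin;
      int64_t end;
      unsigned long *cpus;    /* bitmap of smp_cpus bits, not a fixed uint64_t */
  } DowntimeDuration;

  static DowntimeDuration *downtime_duration_new(void)
  {
      DowntimeDuration *dd = g_new0(DowntimeDuration, 1);

      dd->cpus = bitmap_new(smp_cpus);   /* zero-initialised, g_free() later */
      return dd;
  }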

+} OverlapDowntime;
Could you comment what this function is doing - it's not that obvious.
I'm guessing something to do with handling the same page for multiple CPUs?
OK, I'll add a comment in the code. In a couple of words: it splits each
downtime duration interval into separate points, so they can later be
sorted (giving a sequence like SSESEE). The name isn't very readable
either, so I'll rename it too.
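To illustrate the idea with a tiny worked example (sketch only):

  /* Two intervals on one timeline:
   *
   *   interval A: [10, 40]  ->  points S(10), E(40)
   *   interval B: [20, 30]  ->  points S(20), E(30)
   *
   * After sorting by time: S(10) S(20) E(30) E(40), i.e. "SSEE",
   * so the overlap can be found by one pass over the sorted points
   * instead of comparing every interval against every other one.
   */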

+static gboolean populate_downtime_points(gpointer key, gpointer value,
+                                        gpointer data)
+{
+    DowntimeDuration *dd = (DowntimeDuration *)value;
+    GPtrArray *interval = (GPtrArray *)data;
+    OverlapDowntime *od_begin = g_malloc0(sizeof(OverlapDowntime));
+    OverlapDowntime *od_end = g_malloc0(sizeof(OverlapDowntime));
+
+    od_begin->tp = dd->begin;
+    od_begin->is_end = false;
+    od_begin->cpus = dd->cpus;
+    g_ptr_array_add(interval, od_begin);
+
+    od_end->tp = dd->end;
+    od_end->is_end = true;
+    od_end->cpus = dd->cpus;
+    g_ptr_array_add(interval, od_end);
+
+    if (dd->end && dd->begin)
+        trace_sumup_downtime_duration(dd->end - dd->begin, (uint64_t)key, dd->cpus);
Try and keep the trace_ names matching the function to start with.

+    return FALSE;
+}
+
+static gboolean calculate_per_cpu(gpointer key, gpointer value,
+                                  gpointer data)
+{
+    int *downtime_cpu = (int *)data;
+    DowntimeDuration *dd = (DowntimeDuration *)value;
+    int cpu_iter;
+    for (cpu_iter = 0; cpu_iter < smp_cpus; cpu_iter++) {
+        if (test_bit(cpu_iter, &dd->cpus) && dd->end && dd->begin)
+            downtime_cpu[cpu_iter] += dd->end - dd->begin;
An interesting sanity check would be to check that end >= begin
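For example, a sketch of where such a check could sit, right before the
accumulation:

        if (test_bit(cpu_iter, &dd->cpus) && dd->end && dd->begin) {
            /* sketch: guard against an unmatched begin or clock skew */
            if (dd->end >= dd->begin) {
                downtime_cpu[cpu_iter] += dd->end - dd->begin;
            }
        }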

+    }
+    return FALSE;
+}
+
+static gint compare_downtime(gconstpointer a, gconstpointer b)
+{
+    DowntimeDuration *dda = (DowntimeDuration *)a;
+    DowntimeDuration *ddb = (DowntimeDuration *)b;
+    return dda->begin - ddb->begin;
+}
+
+static uint64_t get_sufficient_smp_cpus(void)
+{
+    int i;
+    static uint64_t sufficient_cpus;
+    for (i = 0; i < smp_cpus; i++)
+    {
+       set_bit(i, &sufficient_cpus);
+    }
+    return sufficient_cpus;
+}
+
+/*
+ * This function calculates downtime per cpu and trace it
+ *
+ *  Also it calculates total downtime as an interval's overlap,
+ *  for many vCPU.
+ *
+ *  The approach is following:
+ *  Initially intervals are represented in tree where key is
+ *  pagefault address, and values:
+ *   begin - page fault time
+ *   end   - page load time
+ *   cpus  - bit mask shows affected cpus
+ *
+ *  To calculate overlap on all cpus, intervals converted into
+ *  array of points in time (downtime_points), the size of
+ *  array is 2 * number of nodes in tree of intervals (2 array
+ *  elements per one in element of interval).
+ *  Each element is marked as end (E) or as start (S) of interval.
+ *  The overlap downtime will be calculated for SE, only in case
+ *  there is sequence S(0..N)E(M) for every vCPU.
+ *
+ * As example we have 3 CPU
+ *
+ *      S1        E1           S1               E1
+ * -----***********------------xxx***************------------------------> CPU1
+ *
+ *             S2                E2
+ * ------------****************xxx---------------------------------------> CPU2
+ *
+ *                         S3            E3
+ * ------------------------****xxx********-------------------------------> CPU3
+ *
+ * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
+ * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3,
+ * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2
+ *
I'm a bit confused - are you only regarding 'downtime' as when *all* the
vcpus are stopped - which is where you seem to have the 'xxx's here?
Yes, per-vCPU downtime is also collected, but the final downtime (I hope
it will end up in the query-migrate output) is the overlapped one; yes,
xxx marks where *all* vCPUs are stopped.
My fault, the */x symbols were omitted from the legend.
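To make that concrete, a small worked example (sketch) with 2 vCPUs:

  /* CPU1 blocked on a fault during [10, 40]  ->  S(10,cpu1) E(40,cpu1)
   * CPU2 blocked on a fault during [25, 50]  ->  S(25,cpu2) E(50,cpu2)
   *
   * Sorted points: S(10) S(25) E(40) E(50).
   * Walking back from E(40) collects cpus {cpu2, cpu1} == all vCPUs,
   * so the overlapped (total) downtime contribution is 40 - 25 = 15,
   * i.e. only the span where *all* vCPUs were stopped is counted.
   */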

+ */
+int64_t get_postcopy_total_downtime(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    int64_t total_downtime = 0; /* for total overlapped downtime */
+    const int intervals = g_tree_nnodes(mis->postcopy_downtime);
+    const int points = 2 * intervals;
+    uint64_t sufficient_smp_cpus = get_sufficient_smp_cpus();
+    int point_iter, start_point_iter;
+    GPtrArray *downtime_points = g_ptr_array_sized_new(points);
+    /* for summary downtime per cpu */
+    int *downtime_cpu = g_malloc0(smp_cpus * sizeof(int));
Another good use of g_new0

+    if (!mis || !mis->postcopy_downtime) {
+        error_report("Migration incoming state should exists, mis %p", mis);
+        return -1;
+    }
+
+    /* make downtime points S/E from interval */
+    g_tree_foreach(mis->postcopy_downtime, populate_downtime_points,
+                   downtime_points);
+    g_tree_foreach(mis->postcopy_downtime, calculate_per_cpu, downtime_cpu);
+
+    /* just for RFC patch */
+    for (point_iter = 0; point_iter < smp_cpus; point_iter++)
+    {
+        trace_downtime_per_cpu(point_iter, downtime_cpu[point_iter]);
+    }
+
+    g_ptr_array_sort(downtime_points, compare_downtime);
+
+    for (point_iter = 1; point_iter < points; point_iter++) {
+        OverlapDowntime *od = g_ptr_array_index(downtime_points, point_iter);
+        uint64_t cur_cpus = od->cpus;
+        int smp_cpus_i = smp_cpus;
+        OverlapDowntime *prev_od = g_ptr_array_index(downtime_points,
+                                                     point_iter - 1);
+        /* we need sequence SE */
+        if (!od->is_end || prev_od->is_end)
+            continue;
+
+        for (start_point_iter = point_iter - 1;
+             start_point_iter >= 0 && smp_cpus_i;
+             start_point_iter--, smp_cpus_i--) {
+            OverlapDowntime *t_od = g_ptr_array_index(downtime_points,
+                                                      start_point_iter);
+            /* should be S */
+            if (t_od->is_end)
+                break;
+
+            cur_cpus |= t_od->cpus;
+            if (sufficient_smp_cpus & cur_cpus) {
+                total_downtime += od->tp - prev_od->tp;
+                /* situation when one S point represents all vCPU is possible */
+                break;
+            }
+        }
+    }
+    trace_get_postcopy_total_downtime(g_tree_nnodes(mis->postcopy_downtime),
+        total_downtime);
+
+    g_ptr_array_free(downtime_points, TRUE);
+    return total_downtime;
+}
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index dc80dbb..3bd9db0 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -23,6 +23,7 @@
  #include "migration/postcopy-ram.h"
  #include "sysemu/sysemu.h"
  #include "sysemu/balloon.h"
+#include <sys/param.h>
  #include "qemu/error-report.h"
  #include "trace.h"
@@ -404,6 +405,60 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
      return 0;
  }
+#define PROC_LEN 1024
+static void trace_for_thread(const char *msg, pid_t thread_id)
+{
+    const char *status_file_frmt = "/proc/%d/status";
+    char status_file_path[MAXPATHLEN];
+    char proc_name[PROC_LEN];
+    char proc_status[PROC_LEN];
+    char *line = NULL;
+    FILE *f;
+    ssize_t read;
+    size_t len;
+
+    sprintf(status_file_path, status_file_frmt, thread_id);
Maybe consider using g_strdup_printf for all the names rather
than fixed size arrays and sprintf's etc
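For example, a sketch of the path handling with g_strdup_printf
(heap-allocated, released with g_free) instead of the MAXPATHLEN buffer
and sprintf:

      char *status_file_path = g_strdup_printf("/proc/%d/status", thread_id);

      f = fopen(status_file_path, "r");
      if (!f) {
          error_report("can't open %s", status_file_path);
          g_free(status_file_path);
          return;
      }
      g_free(status_file_path);   /* not needed once the file is open */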

+    f = fopen(status_file_path, "r");
+    if (!f) {
+        error_report("can't open %s", status_file_path);
+        return;
+    }
+
+    memset(proc_name, 0, sizeof(proc_name));
+    memset(proc_status, 0, sizeof(proc_status));
+    while ((read = getline(&line, &len, f)) != -1) {
+        if (strstr(line, "Name"))
+            strncpy(proc_name, line, sizeof(proc_name));
+        if (strstr(line, "State"))
+            strncpy(proc_status, line, sizeof(proc_status));
+    }
There's got to be a 'safe' way to do that rather than getline.
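One glib-based alternative (sketch only, not what the patch does): slurp
the file with g_file_get_contents() and scan the lines with
g_strsplit()/g_str_has_prefix(), avoiding getline() and the fixed-size
strncpy targets:

      gchar *contents = NULL;

      if (g_file_get_contents(status_file_path, &contents, NULL, NULL)) {
          gchar **lines = g_strsplit(contents, "\n", -1);
          char *name = NULL, *state = NULL;
          int i;

          for (i = 0; lines[i]; i++) {
              if (g_str_has_prefix(lines[i], "Name:")) {
                  name = lines[i];
              } else if (g_str_has_prefix(lines[i], "State:")) {
                  state = lines[i];
              }
          }
          if (name && state) {
              trace_vcpu_thread_status(msg, thread_id, name, state);
          }
          g_strfreev(lines);
          g_free(contents);
      }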

+    free(line);
+    trace_vcpu_thread_status(msg, thread_id, proc_name, proc_status);
+}
+
+static int defined_mem_fault_cpu_index(pid_t pid)
+{
+    CPUState *cpu_iter;
+
+    CPU_FOREACH(cpu_iter) {
+        if (cpu_iter->thread_id == pid)
+           return cpu_iter->cpu_index;
+    }
+    trace_for_thread("can't find cpu_index for thread id", pid);
error_report?

+    return -1;
+}
+
+static void trace_cpu_state(void)
+{
+    CPUState *cpu_iter;
+    CPU_FOREACH(cpu_iter) {
+        trace_for_thread("vCPU", cpu_iter->thread_id);
+        trace_postcopy_vcpu_running(cpu_iter->cpu_index, cpu_iter->running);
+    }
+}
+
  /*
   * Handle faults detected by the USERFAULT markings
   */
@@ -445,6 +500,7 @@ static void *postcopy_ram_fault_thread(void *opaque)
          }
ret = read(mis->userfault_fd, &msg, sizeof(msg));
+        trace_cpu_state();
          if (ret != sizeof(msg)) {
              if (errno == EAGAIN) {
                  /*
@@ -481,8 +537,10 @@ static void *postcopy_ram_fault_thread(void *opaque)
          rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
          trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                  qemu_ram_get_idstr(rb),
-                                                rb_offset);
+                                                rb_offset, msg.arg.pagefault.pid);
+ mark_postcopy_downtime_begin(msg.arg.pagefault.address,
+               defined_mem_fault_cpu_index(msg.arg.pagefault.pid));
          /*
           * Send the request to the source - we want to request one
           * of our host page sizes (which is >= TPS)
@@ -577,6 +635,7 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
return -e;
      }
+    mark_postcopy_downtime_end((uint64_t)host);
trace_postcopy_place_page(host);
      return 0;
diff --git a/migration/savevm.c b/migration/savevm.c
index 3b19a4a..e12c0a2 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1630,6 +1630,7 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
if (autostart) {
          /* Hold onto your hats, starting the CPU */
+       trace_loadvm_postcopy_vm_start(get_postcopy_total_downtime());
          vm_start();
      } else {
          /* leave it paused and let management decide when to start the CPU */
@@ -1930,6 +1931,7 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
          return -EINVAL;
      }
+ trace_loadvm_postcopy_vm_start(get_postcopy_total_downtime());
      return 0;
  }
diff --git a/migration/trace-events b/migration/trace-events
index 7372ce2..8a21684 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -44,6 +44,7 @@ vmstate_subsection_save_loop(const char *name, const char *sub) "%s/%s"
  vmstate_subsection_save_top(const char *idstr) "%s"
  vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
  qemu_announce_self_iter(const char *mac) "%s"
+loadvm_postcopy_vm_start(int64_t downtime) "%"PRId64
# migration/vmstate.c
  vmstate_load_field_error(const char *field, int ret) "field \"%s\" load failed, ret = %d"
@@ -110,6 +111,11 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
  process_incoming_migration_co_postcopy_end_main(void) ""
  migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
  migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname)  "ioc=%p ioctype=%s hostname=%s"
+mark_postcopy_downtime_begin(uint64_t addr, void *dd, int64_t time, int cpu) "addr 0x%" PRIx64 " dd %p time %" PRId64 " cpu %d"
+mark_postcopy_downtime_end(uint64_t addr, void *dd, int64_t time) "addr 0x%" PRIx64 " dd %p time %" PRId64
+get_postcopy_total_downtime(int num, int64_t total) "faults %d, total downtime %" PRId64
+sumup_downtime_duration(int64_t downtime, uint64_t addr, int cpubit) "downtime %" PRId64 " addr 0x%" PRIx64 "cpus %d"
+downtime_per_cpu(int cpu_index, int downtime) "downtime cpu[%d]=%d"
# migration/rdma.c
  qemu_rdma_accept_incoming_migration(void) ""
@@ -186,7 +192,7 @@ postcopy_ram_enable_notify(void) ""
  postcopy_ram_fault_thread_entry(void) ""
  postcopy_ram_fault_thread_exit(void) ""
  postcopy_ram_fault_thread_quit(void) ""
-postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, int pid) "Request for HVA=%" PRIx64 " rb=%s offset=%zx %d"
  postcopy_ram_incoming_cleanup_closeuf(void) ""
  postcopy_ram_incoming_cleanup_entry(void) ""
  postcopy_ram_incoming_cleanup_exit(void) ""
@@ -195,6 +201,8 @@ save_xbzrle_page_skipping(void) ""
  save_xbzrle_page_overflow(void) ""
  ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
  ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
+vcpu_thread_status(const char *msg, int tpid, char *name, char *status) "%s host_tid %d %s %s"
+postcopy_vcpu_running(int cpu_index, int is_running) "cpu %d running %d"
# migration/exec.c
  migration_exec_outgoing(const char *cmd) "cmd=%s"
--
1.8.3.1

--
Dr. David Alan Gilbert / address@hidden / Manchester, UK





--
Best regards,
Alexey Perevalov


