Re: [Qemu-devel] [PATCH 2/2] migration: calculate downtime on dst side
From: Dr. David Alan Gilbert
Subject: Re: [Qemu-devel] [PATCH 2/2] migration: calculate downtime on dst side
Date: Tue, 4 Apr 2017 20:01:12 +0100
User-agent: Mutt/1.8.0 (2017-02-23)
* Alexey Perevalov (address@hidden) wrote:
> This patch provides downtime calculation per vCPU,
> as a summary and as an overlapped value for all vCPUs.
>
> This approach just keeps a tree with the page fault addr as a key,
> and the t1-t2 interval of page fault time and page copy time, with
> the affected vCPU bit mask.
> For more implementation details please see comment to
> get_postcopy_total_downtime function.
>
> Signed-off-by: Alexey Perevalov <address@hidden>
> ---
> include/migration/migration.h | 11 ++
>  migration/migration.c         | 238 +++++++++++++++++++++++++++++++++++++++++-
> migration/postcopy-ram.c | 61 ++++++++++-
> migration/savevm.c | 2 +
> migration/trace-events | 10 +-
> 5 files changed, 318 insertions(+), 4 deletions(-)
>
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index 5720c88..8f9af77 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -123,10 +123,21 @@ struct MigrationIncomingState {
>
> /* See savevm.c */
> LoadStateEntry_Head loadvm_handlers;
> +
> + /*
> + * Tree for keeping postcopy downtime,
> + * necessary to calculate correct downtime, during multiple
> + * vm suspends, it keeps host page address as a key and
> + * DowntimeDuration as a data
> + */
> + GTree *postcopy_downtime;
> };
>
> MigrationIncomingState *migration_incoming_get_current(void);
> void migration_incoming_state_destroy(void);
> +void mark_postcopy_downtime_begin(uint64_t addr, int cpu);
> +void mark_postcopy_downtime_end(uint64_t addr);
> +int64_t get_postcopy_total_downtime(void);
>
> /*
> * An outstanding page request, on the source, having been received
> diff --git a/migration/migration.c b/migration/migration.c
> index 54060f7..57d71e1 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -77,6 +77,12 @@ static NotifierList migration_state_notifiers =
>
> static bool deferred_incoming;
>
> +typedef struct {
> + int64_t begin;
> + int64_t end;
> + uint64_t cpus;
> +} DowntimeDuration;
> +
> /*
> * Current state of incoming postcopy; note this is not part of
> * MigrationIncomingState since it's state is used during cleanup
> @@ -117,6 +123,21 @@ MigrationState *migrate_get_current(void)
>      return &current_migration;
> }
>
> +static gint addr_compare(gconstpointer a, gconstpointer b,
> + gpointer user_data G_GNUC_UNUSED)
> +{
> + if (a == b)
> + return 0;
> + else if (a > b)
> + return 1;
> + return -1;
> +}
Weird that glib doesn't have that builtin?
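A branchless form would also avoid comparing unrelated pointers directly;
untested sketch:

    static gint addr_compare(gconstpointer a, gconstpointer b,
                             gpointer user_data G_GNUC_UNUSED)
    {
        uintptr_t ua = (uintptr_t)a, ub = (uintptr_t)b;
        return (ua > ub) - (ua < ub);
    }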
> +static void destroy_downtime_duration(gpointer data)
> +{
> + free(data);
g_free - this was allocated with g_malloc0, so pair it with g_free.
> +}
> +
> MigrationIncomingState *migration_incoming_get_current(void)
> {
> static bool once;
> @@ -128,6 +149,9 @@ MigrationIncomingState *migration_incoming_get_current(void)
> QLIST_INIT(&mis_current.loadvm_handlers);
> qemu_mutex_init(&mis_current.rp_mutex);
> qemu_event_init(&mis_current.main_thread_load_event, false);
> + mis_current.postcopy_downtime = g_tree_new_full(addr_compare,
> + NULL, NULL,
> + destroy_downtime_duration);
> once = true;
> }
> return &mis_current;
> @@ -138,10 +162,13 @@ void migration_incoming_state_destroy(void)
> struct MigrationIncomingState *mis = migration_incoming_get_current();
>
> qemu_event_destroy(&mis->main_thread_load_event);
> + if (mis->postcopy_downtime) {
> + g_tree_destroy(mis->postcopy_downtime);
> + mis->postcopy_downtime = NULL;
> + }
> loadvm_free_handlers(mis);
> }
>
> -
> typedef struct {
> bool optional;
> uint32_t size;
> @@ -1119,7 +1146,6 @@ MigrationState *migrate_init(const MigrationParams *params)
> s->last_req_rb = NULL;
> error_free(s->error);
> s->error = NULL;
> -
> migrate_set_state(&s->state, MIGRATION_STATUS_NONE,
> MIGRATION_STATUS_SETUP);
>
> QSIMPLEQ_INIT(&s->src_page_requests);
> @@ -2109,3 +2135,211 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
> return atomic_xchg(&incoming_postcopy_state, new_state);
> }
>
> +void mark_postcopy_downtime_begin(uint64_t addr, int cpu)
> +{
> + MigrationIncomingState *mis = migration_incoming_get_current();
> + DowntimeDuration *dd;
> + if (!mis || !mis->postcopy_downtime) {
> + error_report("Migration incoming state should exists mis %p", mis);
> + return;
> + }
> +
> +    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr); /* !!! cast */
> + if (!dd) {
> + dd = (DowntimeDuration *)g_malloc0(sizeof(DowntimeDuration));
g_new0 is quite nice to avoid the sizeof.
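i.e. (untested):

    dd = g_new0(DowntimeDuration, 1);

which also gets the type checked for you.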
> + g_tree_insert(mis->postcopy_downtime, (gpointer)addr, (gpointer)dd);
> + }
> +
> + if (cpu < 0)
> + /* assume in this situation all vCPUs are sleeping */
> + dd->cpus = ~0u;
> + else
> + set_bit(cpu, &dd->cpus);
> +
> + /*
> + * overwrite previously set dd->begin, if that page already was
> + * faulted on another cpu
> + */
> + dd->begin = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> + trace_mark_postcopy_downtime_begin(addr, dd, dd->begin, cpu);
> +}
> +
> +void mark_postcopy_downtime_end(uint64_t addr)
> +{
> + MigrationIncomingState *mis = migration_incoming_get_current();
> + DowntimeDuration *dd;
> + if (!mis || !mis->postcopy_downtime) {
> + error_report("Migration incoming state should exists mis %p", mis);
I don't think you need to check for mis not existing;
migration_incoming_get_current always returns it.
> + return;
> + }
> +
> + dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
> + if (!dd) {
> +        /* error_report("Could not populate downtime duration completion time \n\
> +                         There is no downtime duration for 0x%"PRIx64, addr); */
It's probably worth keeping a count of these so you can tell - otherwise
you could be losing counts and never know.
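Even a plain counter on the incoming state would do; rough sketch, with a
made-up field name:

    mis->postcopy_downtime_misses++;  /* _end with no matching _begin */

and then report it from get_postcopy_total_downtime() so lost samples show up.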
> + return;
> + }
> +
> + dd->end = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> + trace_mark_postcopy_downtime_end(addr, dd, dd->end);
> +}
> +
> +typedef struct {
> + int64_t tp; /* point in time */
> + bool is_end;
> + int64_t cpus; /* cpus bit mask */
Ah! Useful comment that the DowntimeDuration didn't have.
Note that 64 isn't enough - there is already support for somewhere
over 200 vCPU VMs.
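Something based on qemu/bitmap.h would scale; rough sketch (untested):

    #include "qemu/bitmap.h"

    unsigned long *cpus = bitmap_new(smp_cpus);  /* one bit per vCPU */
    set_bit(cpu, cpus);                          /* instead of the uint64_t mask */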
> +} OverlapDowntime;
Could you comment what this function is doing - it's not that obvious.
I'm guessing something to do with handling the same page for multiple CPUs?
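Even a one-paragraph comment at the top would help; my guess at what it does:

    /*
     * Split each (begin, end, cpus) interval from the downtime tree into
     * two time points, start and end, so that the overlap across all
     * vCPUs can be found by a single sweep over the sorted points.
     */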
> +static gboolean populate_downtime_points(gpointer key, gpointer value,
> + gpointer data)
> +{
> + DowntimeDuration *dd = (DowntimeDuration *)value;
> + GPtrArray *interval = (GPtrArray *)data;
> + OverlapDowntime *od_begin = g_malloc0(sizeof(OverlapDowntime));
> + OverlapDowntime *od_end = g_malloc0(sizeof(OverlapDowntime));
> +
> + od_begin->tp = dd->begin;
> + od_begin->is_end = false;
> + od_begin->cpus = dd->cpus;
> + g_ptr_array_add(interval, od_begin);
> +
> + od_end->tp = dd->end;
> + od_end->is_end = true;
> + od_end->cpus = dd->cpus;
> + g_ptr_array_add(interval, od_end);
> +
> + if (dd->end && dd->begin)
> +        trace_sumup_downtime_duration(dd->end - dd->begin, (uint64_t)key,
> +                                      dd->cpus);
Try to keep the trace_ names matching the name of the function they're called
from.
> + return FALSE;
> +}
> +
> +static gboolean calculate_per_cpu(gpointer key, gpointer value,
> + gpointer data)
> +{
> + int *downtime_cpu = (int *)data;
> + DowntimeDuration *dd = (DowntimeDuration *)value;
> + int cpu_iter;
> + for (cpu_iter = 0; cpu_iter < smp_cpus; cpu_iter++) {
> + if (test_bit(cpu_iter, &dd->cpus) && dd->end && dd->begin)
> + downtime_cpu[cpu_iter] += dd->end - dd->begin;
An interesting sanity check would be to check that end >= begin
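e.g. (sketch):

        if (test_bit(cpu_iter, &dd->cpus) && dd->end && dd->begin) {
            assert(dd->end >= dd->begin);
            downtime_cpu[cpu_iter] += dd->end - dd->begin;
        }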
> + }
> + return FALSE;
> +}
> +
> +static gint compare_downtime(gconstpointer a, gconstpointer b)
> +{
> + DowntimeDuration *dda = (DowntimeDuration *)a;
> + DowntimeDuration *ddb = (DowntimeDuration *)b;
> + return dda->begin - ddb->begin;
> +}
> +
> +static uint64_t get_sufficient_smp_cpus(void)
> +{
> + int i;
> + static uint64_t sufficient_cpus;
> + for (i = 0; i < smp_cpus; i++)
> + {
> + set_bit(i, &sufficient_cpus);
> + }
> + return sufficient_cpus;
> +}
> +
> +/*
> + * This function calculates downtime per cpu and trace it
> + *
> + * Also it calculates total downtime as an interval's overlap,
> + * for many vCPU.
> + *
> + * The approach is following:
> + * Initially intervals are represented in tree where key is
> + * pagefault address, and values:
> + * begin - page fault time
> + * end - page load time
> + * cpus - bit mask shows affected cpus
> + *
> + * To calculate overlap on all cpus, intervals converted into
> + * array of points in time (downtime_points), the size of
> + * array is 2 * number of nodes in tree of intervals (2 array
> + * elements per one in element of interval).
> + * Each element is marked as end (E) or as start (S) of interval.
> + * The overlap downtime will be calculated for SE, only in case
> + * there is sequence S(0..N)E(M) for every vCPU.
> + *
> + * As example we have 3 CPU
> + *
> + * S1 E1 S1 E1
> + * -----***********------------xxx***************------------------------> CPU1
> + *
> + * S2 E2
> + * ------------****************xxx---------------------------------------> CPU2
> + *
> + * S3 E3
> + * ------------------------****xxx********-------------------------------> CPU3
> + *
> + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
> + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3,
> + * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2
> + *
I'm a bit confused - are you only regarding 'downtime' as when *all* the vcpus
are stopped - which is where you seem to have the 'xxx's here?
> + */
> +int64_t get_postcopy_total_downtime(void)
> +{
> + MigrationIncomingState *mis = migration_incoming_get_current();
> + int64_t total_downtime = 0; /* for total overlapped downtime */
> + const int intervals = g_tree_nnodes(mis->postcopy_downtime);
> + const int points = 2 * intervals;
> + uint64_t sufficient_smp_cpus = get_sufficient_smp_cpus();
> + int point_iter, start_point_iter;
> + GPtrArray *downtime_points = g_ptr_array_sized_new(points);
> + /* for summary downtime per cpu */
> + int *downtime_cpu = g_malloc0(smp_cpus * sizeof(int));
Another good use of g_new0
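i.e. (untested):

    int *downtime_cpu = g_new0(int, smp_cpus);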
> + if (!mis || !mis->postcopy_downtime) {
> + error_report("Migration incoming state should exists, mis %p", mis);
> + return -1;
> + }
> +
> + /* make downtime points S/E from interval */
> + g_tree_foreach(mis->postcopy_downtime, populate_downtime_points,
> + downtime_points);
> + g_tree_foreach(mis->postcopy_downtime, calculate_per_cpu, downtime_cpu);
> +
> + /* just for RFC patch */
> + for (point_iter = 0; point_iter < smp_cpus; point_iter++)
> + {
> + trace_downtime_per_cpu(point_iter, downtime_cpu[point_iter]);
> + }
> +
> + g_ptr_array_sort(downtime_points, compare_downtime);
> +
> + for (point_iter = 1; point_iter < points; point_iter++) {
> + OverlapDowntime *od = g_ptr_array_index(downtime_points, point_iter);
> + uint64_t cur_cpus = od->cpus;
> + int smp_cpus_i = smp_cpus;
> + OverlapDowntime *prev_od = g_ptr_array_index(downtime_points,
> + point_iter - 1);
> + /* we need sequence SE */
> + if (!od->is_end || prev_od->is_end)
> + continue;
> +
> + for (start_point_iter = point_iter - 1;
> + start_point_iter >= 0 && smp_cpus_i;
> + start_point_iter--, smp_cpus_i--) {
> + OverlapDowntime *t_od = g_ptr_array_index(downtime_points,
> + start_point_iter);
> + /* should be S */
> + if (t_od->is_end)
> + break;
> +
> + cur_cpus |= t_od->cpus;
> + if (sufficient_smp_cpus & cur_cpus) {
> + total_downtime += od->tp - prev_od->tp;
> +                /* situation when one S point represents all vCPU is possible */
> + break;
> + }
> + }
> + }
> +    trace_get_postcopy_total_downtime(g_tree_nnodes(mis->postcopy_downtime),
> +                                      total_downtime);
> +
> + g_ptr_array_free(downtime_points, TRUE);
> + return total_downtime;
> +}
> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> index dc80dbb..3bd9db0 100644
> --- a/migration/postcopy-ram.c
> +++ b/migration/postcopy-ram.c
> @@ -23,6 +23,7 @@
> #include "migration/postcopy-ram.h"
> #include "sysemu/sysemu.h"
> #include "sysemu/balloon.h"
> +#include <sys/param.h>
> #include "qemu/error-report.h"
> #include "trace.h"
>
> @@ -404,6 +405,60 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
> return 0;
> }
>
> +#define PROC_LEN 1024
> +static void trace_for_thread(const char *msg, pid_t thread_id)
> +{
> + const char *status_file_frmt = "/proc/%d/status";
> + char status_file_path[MAXPATHLEN];
> + char proc_name[PROC_LEN];
> + char proc_status[PROC_LEN];
> + char *line = NULL;
> + FILE *f;
> + ssize_t read;
> + size_t len;
> +
> + sprintf(status_file_path, status_file_frmt, thread_id);
Maybe consider using g_strdup_printf for all the names rather
than fixed size arrays and sprintf's etc
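Rough sketch (untested):

    char *status_file_path = g_strdup_printf("/proc/%d/status", thread_id);
    FILE *f = fopen(status_file_path, "r");
    ...
    g_free(status_file_path);

then there's no MAXPATHLEN or PROC_LEN guessing needed.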
> + f = fopen(status_file_path, "r");
> + if (!f) {
> + error_report("can't open %s", status_file_path);
> + return;
> + }
> +
> + memset(proc_name, 0, sizeof(proc_name));
> + memset(proc_status, 0, sizeof(proc_status));
> + while ((read = getline(&line, &len, f)) != -1) {
> + if (strstr(line, "Name"))
> + strncpy(proc_name, line, sizeof(proc_name));
> + if (strstr(line, "State"))
> + strncpy(proc_status, line, sizeof(proc_status));
> + }
There's got to be a 'safe' way to do that rather than getline.
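glib can read the whole file in one go, for example (untested sketch):

    char *contents = NULL;
    if (g_file_get_contents(status_file_path, &contents, NULL, NULL)) {
        char **lines = g_strsplit(contents, "\n", -1);
        int i;
        for (i = 0; lines[i]; i++) {
            if (g_str_has_prefix(lines[i], "Name:")) {
                /* copy/trace the name */
            } else if (g_str_has_prefix(lines[i], "State:")) {
                /* copy/trace the state */
            }
        }
        g_strfreev(lines);
        g_free(contents);
    }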
> + free(line);
> + trace_vcpu_thread_status(msg, thread_id, proc_name, proc_status);
> +}
> +
> +static int defined_mem_fault_cpu_index(pid_t pid)
> +{
> + CPUState *cpu_iter;
> +
> + CPU_FOREACH(cpu_iter) {
> + if (cpu_iter->thread_id == pid)
> + return cpu_iter->cpu_index;
> + }
> + trace_for_thread("can't find cpu_index for thread id", pid);
error_report?
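e.g. something like:

    error_report("postcopy: no vCPU found for faulting thread id %d", pid);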
> + return -1;
> +}
> +
> +static void trace_cpu_state(void)
> +{
> + CPUState *cpu_iter;
> + CPU_FOREACH(cpu_iter) {
> + trace_for_thread("vCPU", cpu_iter->thread_id);
> + trace_postcopy_vcpu_running(cpu_iter->cpu_index, cpu_iter->running);
> + }
> +}
> +
> /*
> * Handle faults detected by the USERFAULT markings
> */
> @@ -445,6 +500,7 @@ static void *postcopy_ram_fault_thread(void *opaque)
> }
>
> ret = read(mis->userfault_fd, &msg, sizeof(msg));
> + trace_cpu_state();
> if (ret != sizeof(msg)) {
> if (errno == EAGAIN) {
> /*
> @@ -481,8 +537,10 @@ static void *postcopy_ram_fault_thread(void *opaque)
> rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
> trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
> qemu_ram_get_idstr(rb),
> - rb_offset);
> +                                                rb_offset,
> +                                                msg.arg.pagefault.pid);
>
> + mark_postcopy_downtime_begin(msg.arg.pagefault.address,
> + defined_mem_fault_cpu_index(msg.arg.pagefault.pid));
> /*
> * Send the request to the source - we want to request one
> * of our host page sizes (which is >= TPS)
> @@ -577,6 +635,7 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
>
> return -e;
> }
> + mark_postcopy_downtime_end((uint64_t)host);
>
> trace_postcopy_place_page(host);
> return 0;
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 3b19a4a..e12c0a2 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1630,6 +1630,7 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
>
> if (autostart) {
> /* Hold onto your hats, starting the CPU */
> + trace_loadvm_postcopy_vm_start(get_postcopy_total_downtime());
> vm_start();
> } else {
> /* leave it paused and let management decide when to start the CPU */
> @@ -1930,6 +1931,7 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
> return -EINVAL;
> }
>
> + trace_loadvm_postcopy_vm_start(get_postcopy_total_downtime());
> return 0;
> }
>
> diff --git a/migration/trace-events b/migration/trace-events
> index 7372ce2..8a21684 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -44,6 +44,7 @@ vmstate_subsection_save_loop(const char *name, const char *sub) "%s/%s"
> vmstate_subsection_save_top(const char *idstr) "%s"
> vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
> qemu_announce_self_iter(const char *mac) "%s"
> +loadvm_postcopy_vm_start(int64_t downtime) "%"PRId64
>
> # migration/vmstate.c
> vmstate_load_field_error(const char *field, int ret) "field \"%s\" load failed, ret = %d"
> @@ -110,6 +111,11 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
> process_incoming_migration_co_postcopy_end_main(void) ""
> migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
> migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname) "ioc=%p ioctype=%s hostname=%s"
> +mark_postcopy_downtime_begin(uint64_t addr, void *dd, int64_t time, int cpu) "addr 0x%" PRIx64 " dd %p time %" PRId64 " cpu %d"
> +mark_postcopy_downtime_end(uint64_t addr, void *dd, int64_t time) "addr 0x%" PRIx64 " dd %p time %" PRId64
> +get_postcopy_total_downtime(int num, int64_t total) "faults %d, total downtime %" PRId64
> +sumup_downtime_duration(int64_t downtime, uint64_t addr, int cpubit) "downtime %" PRId64 " addr 0x%" PRIx64 "cpus %d"
> +downtime_per_cpu(int cpu_index, int downtime) "downtime cpu[%d]=%d"
>
> # migration/rdma.c
> qemu_rdma_accept_incoming_migration(void) ""
> @@ -186,7 +192,7 @@ postcopy_ram_enable_notify(void) ""
> postcopy_ram_fault_thread_entry(void) ""
> postcopy_ram_fault_thread_exit(void) ""
> postcopy_ram_fault_thread_quit(void) ""
> -postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
> +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, int pid) "Request for HVA=%" PRIx64 " rb=%s offset=%zx %d"
> postcopy_ram_incoming_cleanup_closeuf(void) ""
> postcopy_ram_incoming_cleanup_entry(void) ""
> postcopy_ram_incoming_cleanup_exit(void) ""
> @@ -195,6 +201,8 @@ save_xbzrle_page_skipping(void) ""
> save_xbzrle_page_overflow(void) ""
> ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
> ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
> +vcpu_thread_status(const char *msg, int tpid, char *name, char *status) "%s host_tid %d %s %s"
> +postcopy_vcpu_running(int cpu_index, int is_running) "cpu %d running %d"
>
> # migration/exec.c
> migration_exec_outgoing(const char *cmd) "cmd=%s"
> --
> 1.8.3.1
>
--
Dr. David Alan Gilbert / address@hidden / Manchester, UK