qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v10 14/14, resend] rdma: add setup time accounti


From: Michael R. Hines
Subject: Re: [Qemu-devel] [PATCH v10 14/14, resend] rdma: add setup time accounting to QMP statistics
Date: Mon, 24 Jun 2013 17:28:02 -0400
User-agent: Mozilla/5.0 (X11; Linux i686; rv:17.0) Gecko/20130329 Thunderbird/17.0.5

On 06/24/2013 10:41 AM, Anthony Liguori wrote:
"Michael R. Hines" <address@hidden> writes:

On 06/24/2013 09:51 AM, Paolo Bonzini wrote:
Il 24/06/2013 15:46, address@hidden ha scritto:
From: "Michael R. Hines" <address@hidden>

For very large virtual machines, pinning can take a long time.
While this does not affect the migration's *actual* time itself,
it is still important for the user to know what's going on and
to know what component of the total time is actual taken up by
pinning.

For example, using a 14GB virtual machine, pinning can take as
long as 5 seconds, for which the user would not otherwise know
what was happening.

Reviewed-by: Paolo Bonzini <address@hidden>
Please stop inventing Reviewed-by tags, or I will stop reviewing your
patches.

Paolo
Inventing? I don't understand.

I accumulated all of those tags from everybody - copy and pasted
directly from everyone when they gave them to me.\

What exactly is wrong here?
You can only carry forward a Reviewed-by tag if you've made no changes
to the patch.  Once you make a change, you need to drop it.

I would even suggest that if the rebase is anything but trivial, you
drop the Reviewed-by with a comment.  Most reviewers will appreciate
erring on the side of being conservative.

Will do - Anthony, I would really appreciate a review of Patch 14 as well.

As I hinted to Paolo: The current migration state machine is not sufficient for RDMA.

The setup state is not a "formal" state, it's more like a dummy state
which needs to be a real state communicated to management software
so that it can be tracked and timed for the purposes of RDMA pinning.

That's what patch #14 does - I've have done my best to both test it
and design it.

- Michael


Regards,

Anthony Liguori

- Michael


Reviewed-by: Chegu Vinod <address@hidden>
Reviewed-by: Eric Blake <address@hidden>
Tested-by: Chegu Vinod <address@hidden>
Tested-by: Michael R. Hines <address@hidden>
Signed-off-by: Michael R. Hines <address@hidden>
---
   hmp.c                         |   10 ++++++--
   include/migration/migration.h |    1 +
   migration-rdma.c              |   51 
+++++++++++++++++++++++------------------
   migration.c                   |   41 +++++++++++++++++++++++----------
   qapi-schema.json              |   13 ++++++++---
   5 files changed, 77 insertions(+), 39 deletions(-)

diff --git a/hmp.c b/hmp.c
index 148a3fb..2b75b28 100644
--- a/hmp.c
+++ b/hmp.c
@@ -154,8 +154,10 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
if (info->has_status) {
           monitor_printf(mon, "Migration status: %s\n", info->status);
-        monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
-                       info->total_time);
+        if (info->has_total_time) {
+            monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
+                           info->total_time);
+        }
           if (info->has_expected_downtime) {
               monitor_printf(mon, "expected downtime: %" PRIu64 " 
milliseconds\n",
                              info->expected_downtime);
@@ -164,6 +166,10 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
               monitor_printf(mon, "downtime: %" PRIu64 " milliseconds\n",
                              info->downtime);
           }
+        if (info->has_setup_time) {
+            monitor_printf(mon, "setup: %" PRIu64 " milliseconds\n",
+                           info->setup_time);
+        }
       }
if (info->has_ram) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index b49e68b..82e8713 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -49,6 +49,7 @@ struct MigrationState
       bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
       int64_t xbzrle_cache_size;
       double mbps;
+    int64_t setup_time;
   };
void process_incoming_migration(QEMUFile *f);
diff --git a/migration-rdma.c b/migration-rdma.c
index 853de18..165297e 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -699,12 +699,10 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
       return 0;
   }
-static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma,
-                                RDMALocalBlocks *rdma_local_ram_blocks)
+static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
   {
       int i;
-    uint64_t start = qemu_get_clock_ms(rt_clock);
-    (void)start;
+    RDMALocalBlocks *rdma_local_ram_blocks = &rdma->local_ram_blocks;
for (i = 0; i < rdma_local_ram_blocks->num_blocks; i++) {
           rdma_local_ram_blocks->block[i].mr =
@@ -721,8 +719,6 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma,
           rdma->total_registrations++;
       }
- DPRINTF("lock time: %" PRIu64 "\n", qemu_get_clock_ms(rt_clock) - start);
-
       if (i >= rdma_local_ram_blocks->num_blocks) {
           return 0;
       }
@@ -1262,7 +1258,8 @@ static void qemu_rdma_move_header(RDMAContext *rdma, int 
idx,
    */
   static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader 
*head,
                                      uint8_t *data, RDMAControlHeader *resp,
-                                   int *resp_idx)
+                                   int *resp_idx,
+                                   int (*callback)(RDMAContext *rdma))
   {
       int ret = 0;
       int idx = 0;
@@ -1315,6 +1312,14 @@ static int qemu_rdma_exchange_send(RDMAContext *rdma, 
RDMAControlHeader *head,
        * If we're expecting a response, block and wait for it.
        */
       if (resp) {
+        if (callback) {
+            DPRINTF("Issuing callback before receiving response...\n");
+            ret = callback(rdma);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+
           DDPRINTF("Waiting for response %s\n", control_desc[resp->type]);
           ret = qemu_rdma_exchange_get_response(rdma, resp, resp->type, idx + 
1);
@@ -1464,7 +1469,7 @@ static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                       chunk, sge.length, current_index, offset);
ret = qemu_rdma_exchange_send(rdma, &head,
-                                (uint8_t *) &comp, NULL, NULL);
+                                (uint8_t *) &comp, NULL, NULL, NULL);
if (ret < 0) {
                       return -EIO;
@@ -1487,7 +1492,7 @@ static int qemu_rdma_write_one(QEMUFile *f, RDMAContext 
*rdma,
                       chunk, sge.length, current_index, offset);
ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
-                                    &resp, &reg_result_idx);
+                                    &resp, &reg_result_idx, NULL);
               if (ret < 0) {
                   return ret;
               }
@@ -2126,7 +2131,7 @@ static int qemu_rdma_put_buffer(void *opaque, const 
uint8_t *buf,
           head.len = r->len;
           head.type = RDMA_CONTROL_QEMU_FILE;
- ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL);
+        ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
if (ret < 0) {
               rdma->error_state = ret;
@@ -2482,7 +2487,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f, 
void *opaque,
               DPRINTF("Initial setup info requested.\n");
if (rdma->pin_all) {
-                ret = qemu_rdma_reg_whole_ram_blocks(rdma, 
&rdma->local_ram_blocks);
+                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                   if (ret) {
                       fprintf(stderr, "rdma migration: error dest "
                                       "registering ram blocks!\n");
@@ -2617,7 +2622,17 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void 
*opaque,
           head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
           DPRINTF("Sending registration setup for ram blocks...\n");
- ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp, &reg_result_idx);
+        /*
+         * Make sure that we parallelize the pinning on both sides.
+         * For very large guests, doing this serially takes a really
+         * long time, so we have to 'interleave' the pinning locally
+         * with the control messages by performing the pinning on this
+         * side before we receive the control response from the other
+         * side that the pinning has completed.
+         */
+        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
+                    &reg_result_idx, rdma->pin_all ?
+                    qemu_rdma_reg_whole_ram_blocks : NULL);
           if (ret < 0) {
               ERROR(errp, "receiving remote info!\n");
               return ret;
@@ -2633,14 +2648,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void 
*opaque,
               return ret;
           }
- if (rdma->pin_all) {
-            ret = qemu_rdma_reg_whole_ram_blocks(rdma, 
&rdma->local_ram_blocks);
-            if (ret) {
-                fprintf(stderr, "rdma migration: error source "
-                                "registering ram blocks!\n");
-                return ret;
-            }
-        } else {
+        if (!rdma->pin_all) {
               int x = 0;
               for (x = 0; x < rdma->local_ram_blocks.num_blocks; x++) {
                   RDMALocalBlock *block = &(rdma->local_ram_blocks.block[x]);
@@ -2653,7 +2661,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void 
*opaque,
       DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags);
head.type = RDMA_CONTROL_REGISTER_FINISHED;
-    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL);
+    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
if (ret < 0) {
           goto err;
@@ -2802,7 +2810,6 @@ void rdma_start_outgoing_migration(void *opaque,
       DPRINTF("qemu_rdma_source_connect success\n");
s->file = qemu_fopen_rdma(rdma, "wb");
-    s->total_time = qemu_get_clock_ms(rt_clock);
       migrate_fd_connect(s);
       return;
   err:
diff --git a/migration.c b/migration.c
index 62c6b85..bb412df 100644
--- a/migration.c
+++ b/migration.c
@@ -36,7 +36,8 @@
   #endif
enum {
-    MIG_STATE_ERROR,
+    MIG_STATE_ERROR = -1,
+    MIG_STATE_NONE,
       MIG_STATE_SETUP,
       MIG_STATE_CANCELLED,
       MIG_STATE_ACTIVE,
@@ -63,7 +64,7 @@ static NotifierList migration_state_notifiers =
   MigrationState *migrate_get_current(void)
   {
       static MigrationState current_migration = {
-        .state = MIG_STATE_SETUP,
+        .state = MIG_STATE_NONE,
           .bandwidth_limit = MAX_THROTTLE,
           .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
           .mbps = -1,
@@ -184,9 +185,14 @@ MigrationInfo *qmp_query_migrate(Error **errp)
       MigrationState *s = migrate_get_current();
switch (s->state) {
-    case MIG_STATE_SETUP:
+    case MIG_STATE_NONE:
           /* no migration has happened ever */
           break;
+    case MIG_STATE_SETUP:
+        info->has_status = true;
+        info->status = g_strdup("setup");
+        info->has_total_time = false;
+        break;
       case MIG_STATE_ACTIVE:
           info->has_status = true;
           info->status = g_strdup("active");
@@ -195,6 +201,8 @@ MigrationInfo *qmp_query_migrate(Error **errp)
               - s->total_time;
           info->has_expected_downtime = true;
           info->expected_downtime = s->expected_downtime;
+        info->has_setup_time = true;
+        info->setup_time = s->setup_time;
info->has_ram = true;
           info->ram = g_malloc0(sizeof(*info->ram));
@@ -226,6 +234,8 @@ MigrationInfo *qmp_query_migrate(Error **errp)
           info->total_time = s->total_time;
           info->has_downtime = true;
           info->downtime = s->downtime;
+        info->has_setup_time = true;
+        info->setup_time = s->setup_time;
info->has_ram = true;
           info->ram = g_malloc0(sizeof(*info->ram));
@@ -257,7 +267,7 @@ void 
qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
       MigrationState *s = migrate_get_current();
       MigrationCapabilityStatusList *cap;
- if (s->state == MIG_STATE_ACTIVE) {
+    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
           error_set(errp, QERR_MIGRATION_ACTIVE);
           return;
       }
@@ -295,9 +305,9 @@ static void migrate_fd_cleanup(void *opaque)
       notifier_list_notify(&migration_state_notifiers, s);
   }
-static void migrate_finish_set_state(MigrationState *s, int new_state)
+static void migrate_set_state(MigrationState *s, int old_state, int new_state)
   {
-    if (__sync_val_compare_and_swap(&s->state, MIG_STATE_ACTIVE,
+    if (__sync_val_compare_and_swap(&s->state, old_state,
                                       new_state) == new_state) {
           trace_migrate_set_state(new_state);
       }
@@ -316,7 +326,8 @@ static void migrate_fd_cancel(MigrationState *s)
   {
       DPRINTF("cancelling migration\n");
- migrate_finish_set_state(s, MIG_STATE_CANCELLED);
+    migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_CANCELLED);
+    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_CANCELLED);
   }
void add_migration_state_change_notifier(Notifier *notify)
@@ -393,7 +404,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
       params.blk = blk;
       params.shared = inc;
- if (s->state == MIG_STATE_ACTIVE) {
+    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
           error_set(errp, QERR_MIGRATION_ACTIVE);
           return;
       }
@@ -517,6 +528,7 @@ static void *migration_thread(void *opaque)
   {
       MigrationState *s = opaque;
       int64_t initial_time = qemu_get_clock_ms(rt_clock);
+    int64_t setup_start = qemu_get_clock_ms(host_clock);
       int64_t initial_bytes = 0;
       int64_t max_size = 0;
       int64_t start_time = initial_time;
@@ -525,6 +537,11 @@ static void *migration_thread(void *opaque)
       DPRINTF("beginning savevm\n");
       qemu_savevm_state_begin(s->file, &s->params);
+ s->setup_time = qemu_get_clock_ms(host_clock) - setup_start;
+    migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
+
+    DPRINTF("setup complete\n");
+
       while (s->state == MIG_STATE_ACTIVE) {
           int64_t current_time;
           uint64_t pending_size;
@@ -546,14 +563,14 @@ static void *migration_thread(void *opaque)
                   qemu_savevm_state_complete(s->file);
                   qemu_mutex_unlock_iothread();
                   if (!qemu_file_get_error(s->file)) {
-                    migrate_finish_set_state(s, MIG_STATE_COMPLETED);
+                    migrate_set_state(s, MIG_STATE_ACTIVE, 
MIG_STATE_COMPLETED);
                       break;
                   }
               }
           }
if (qemu_file_get_error(s->file)) {
-            migrate_finish_set_state(s, MIG_STATE_ERROR);
+            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
               break;
           }
           current_time = qemu_get_clock_ms(rt_clock);
@@ -604,8 +621,8 @@ static void *migration_thread(void *opaque)
void migrate_fd_connect(MigrationState *s)
   {
-    s->state = MIG_STATE_ACTIVE;
-    trace_migrate_set_state(MIG_STATE_ACTIVE);
+    s->state = MIG_STATE_SETUP;
+    trace_migrate_set_state(MIG_STATE_SETUP);
/* This is a best 1st approximation. ns to ms */
       s->expected_downtime = max_downtime/1000000;
diff --git a/qapi-schema.json b/qapi-schema.json
index 4e9cd6d..69faaad 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -578,6 +578,12 @@
   #        expected downtime in milliseconds for the guest in last walk
   #        of the dirty bitmap. (since 1.3)
   #
+# @setup-time: #optional amount of setup time spent _before_ the iterations
+#        begin but _after_ the QMP command is issued. This is designed to
+#        provide an accounting of any activities (such as RDMA pinning) which
+#        may be expensive, but do not actually occur during the iterative
+#        migration rounds themselves. (since 1.6)
+#
   # Since: 0.14.0
   ##
   { 'type': 'MigrationInfo',
@@ -586,7 +592,8 @@
              '*xbzrle-cache': 'XBZRLECacheStats',
              '*total-time': 'int',
              '*expected-downtime': 'int',
-           '*downtime': 'int'} }
+           '*downtime': 'int',
+           '*setup-time' : 'int'} }
##
   # @query-migrate






reply via email to

[Prev in Thread] Current Thread [Next in Thread]