From: Michael R. Hines
Subject: [Qemu-devel] [RFC PATCH RDMA support v2: 6/6] send memory over RDMA as blocks are iterated
Date: Mon, 11 Feb 2013 17:49:57 -0500

From: "Michael R. Hines" <address@hidden>


Signed-off-by: Michael R. Hines <address@hidden>
---
 arch_init.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 savevm.c    |   59 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 139 insertions(+), 4 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index dada6de..76092cc 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -42,6 +42,7 @@
 #include "migration/migration.h"
 #include "exec/gdbstub.h"
 #include "hw/smbios.h"
+#include "qemu/rdma.h"
 #include "exec/address-spaces.h"
 #include "hw/pcspk.h"
 #include "migration/page_cache.h"
@@ -170,6 +171,15 @@ static int is_dup_page(uint8_t *page)
     VECTYPE val = SPLAT(page);
     int i;
 
+    /*
+     * RFC RDMA: Empirically, the cost of scanning for zero pages here,
+     *           plus the cost of communicating the result to the other
+     *           side, is significantly higher than simply writing the
+     *           page into remote memory.
+     */
+    if (migrate_rdma_enabled()) {
+        return 0;
+    }
     for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
         if (!ALL_EQ(val, p[i])) {
             return 0;
@@ -282,6 +292,44 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
     return size;
 }
 
+static size_t save_rdma_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
+                             int cont)
+{
+    size_t bytes_sent = 0;
+    ram_addr_t current_addr;
+
+    acct_info.norm_pages++;
+
+    /*
+     * Use RDMA to write the page directly into remote memory.
+     */
+    current_addr = block->offset + offset;
+    if (rdma_write(&rdma_mdata, current_addr,
+            TARGET_PAGE_SIZE)) {
+        fprintf(stderr, "rdma migration: write error!\n");
+        qemu_file_set_error(f, -EIO);
+        return 0;
+    }
+
+    /*
+     * Drain completions for previously posted writes before continuing.
+     */
+    while (1) {
+        int ret = rdma_poll(&rdma_mdata);
+        if (ret == RDMA_WRID_NONE) {
+            break;
+        }
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: polling error!\n");
+            qemu_file_set_error(f, -EIO);
+            return 0;
+        }
+    }
+
+    bytes_sent += TARGET_PAGE_SIZE;
+    return bytes_sent;
+}
+
 #define ENCODING_FLAG_XBZRLE 0x1
 
 static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
@@ -474,6 +522,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
                 if (!last_stage) {
                     p = get_cached_data(XBZRLE.cache, current_addr);
                 }
+            } else if (migrate_rdma_enabled()) {
+                bytes_sent = save_rdma_page(f, block, offset, cont);
             }
 
             /* XBZRLE overflow or normal page */
@@ -601,12 +651,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     return 0;
 }
 
+static int tprate = 1000;
+
 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
     int ret;
     int i;
-    int64_t t0;
-    int total_sent = 0;
+    int64_t t0, tp0 = qemu_get_clock_ns(rt_clock);
+    int total_sent = 0, last_total_sent = 0;
 
     qemu_mutex_lock_ramlist();
 
@@ -625,23 +677,49 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
             break;
         }
         total_sent += bytes_sent;
+        last_total_sent += bytes_sent;
         acct_info.iterations++;
         /* we want to check in the 1st loop, just in case it was the 1st time
            and we had to sync the dirty bitmap.
            qemu_get_clock_ns() is a bit expensive, so we only check each some
            iterations
         */
+
+        /*
+         * RFC RDMA: Can we have something like this to periodically print
+         *           out throughput? This is just a rough sketch that
+         *           partially worked for me. I assume there is a better way
+         *           that everyone would prefer. Perhaps we could add a QMP
+         *           command that toggles a "periodic printing" option,
+         *           allowing more details to be printed on stdout?
+         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
+            uint64_t curr = qemu_get_clock_ns(rt_clock);
+            uint64_t t1 = (curr - t0) / 1000000;
+            double tp;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
                 break;
             }
+
+            if (i > 0 && (i % tprate) == 0) {
+                uint64_t tp1 = (curr - tp0) / 1000000;
+                tp = ((double) last_total_sent * 8.0 /
+                               ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
+                printf("throughput: %f mbps\n", tp);
+                last_total_sent = 0;
+                tp0 = curr;
+            }
         }
         i++;
     }
 
+    if (migrate_rdma_enabled() && rdma_write_flush(&rdma_mdata) < 0) {
+        qemu_mutex_unlock_ramlist();
+        qemu_file_set_error(f, -EIO);
+        return 0;
+    }
     qemu_mutex_unlock_ramlist();
 
     if (ret < 0) {
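
For reference, the throughput figure printed above is last_total_sent
converted to megabits per second: bytes * 8 / elapsed seconds / 10^6.
Below is a minimal standalone sketch of the same arithmetic; the helper
name is hypothetical and not part of this patch:

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch only: mirrors the arithmetic used in ram_save_iterate() above,
 * converting a byte count and an elapsed time in nanoseconds to Mbit/s.
 */
static double migration_throughput_mbps(uint64_t bytes, uint64_t elapsed_ns)
{
    double seconds = (double)elapsed_ns / 1e9;
    if (seconds <= 0.0) {
        return 0.0;     /* avoid dividing by zero on a very fast interval */
    }
    return (double)bytes * 8.0 / seconds / 1e6;
}

int main(void)
{
    /* e.g. 512 MB sent in 4 seconds is roughly 1074 Mbit/s */
    uint64_t bytes = 512ULL * 1024 * 1024;
    uint64_t ns = 4ULL * 1000 * 1000 * 1000;
    printf("throughput: %f mbps\n", migration_throughput_mbps(bytes, ns));
    return 0;
}
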
diff --git a/savevm.c b/savevm.c
index 304d1ef..4d0bef3 100644
--- a/savevm.c
+++ b/savevm.c
@@ -24,6 +24,7 @@
 
 #include "config-host.h"
 #include "qemu-common.h"
+#include "qemu/rdma.h"
 #include "hw/hw.h"
 #include "hw/qdev.h"
 #include "net/net.h"
@@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
     return f->last_error;
 }
 
-static void qemu_file_set_error(QEMUFile *f, int ret)
+void qemu_file_set_error(QEMUFile *f, int ret)
 {
     if (f->last_error == 0) {
         f->last_error = ret;
@@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
 {
     SaveStateEntry *se;
     int ret = 1;
+    static int first_time = 1;
 
     QTAILQ_FOREACH(se, &savevm_handlers, entry) {
         if (!se->ops || !se->ops->save_live_iterate) {
@@ -1643,6 +1645,30 @@ int qemu_savevm_state_iterate(QEMUFile *f)
         }
     }
     if (ret != 0) {
+#ifdef RDMA_EXTRA_SYNC
+        /*
+         * We send two "sync" InfiniBand messages during migration:
+         * one at the beginning and one at the end, just to be thorough.
+         * This is the first one.
+         */
+        if (first_time && migrate_rdma_enabled()) {
+            int r;
+            first_time = 0;
+            if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC)) {
+                fprintf(stderr,
+                        "rdma migration: error posting extra send sync!\n");
+                return -EIO;
+            }
+
+            r = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC);
+            if (r < 0) {
+                fprintf(stderr,
+                        "rdma migration: qemu_savevm_state_iterate"
+                        " sync polling error!\n");
+                return -EIO;
+            }
+        }
+#endif
         return ret;
     }
     ret = qemu_file_get_error(f);
@@ -1703,8 +1729,30 @@ int qemu_savevm_state_complete(QEMUFile *f)
         trace_savevm_section_end(se->section_id);
     }
 
+    /*
+     * We send two "sync" InfiniBand messages during migration:
+     * one at the beginning and one at the end, just to be thorough.
+     * This is the second one.
+     */
+    if (migrate_rdma_enabled()) {
+        if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_SYNC)) {
+            fprintf(stderr, "rdma migration: error posting send sync!\n");
+            return -EIO;
+        }
+    }
+
     qemu_put_byte(f, QEMU_VM_EOF);
 
+    /* wait for RDMA sync message to complete */
+    if (migrate_rdma_enabled()) {
+        int ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_savevm_state_full"
+                            " sync polling error!\n");
+            return -EIO;
+        }
+    }
+
     return qemu_file_get_error(f);
 }
 
@@ -2014,6 +2062,15 @@ int qemu_loadvm_state(QEMUFile *f)
 
     cpu_synchronize_all_post_init();
 
+    /* wait for RDMA sync message */
+    if (migrate_rdma_enabled()) {
+        ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_RECV_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
+                            " sync polling error!\n");
+            goto out;
+        }
+    }
     ret = 0;
 
 out:
-- 
1.7.10.4
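
For readers unfamiliar with the verbs API: rdma_poll() and
rdma_wait_for_wrid(), declared in qemu/rdma.h by this series, appear to
boil down to polling a completion queue until the work request with the
expected wr_id completes. A minimal sketch of that pattern against raw
libibverbs follows; it assumes an already-created completion queue, and
the function name is invented for illustration:

#include <stdio.h>
#include <infiniband/verbs.h>

/*
 * Sketch only: busy-poll 'cq' until the work request identified by
 * 'wrid' completes, draining other completions along the way.
 * Returns 0 on success, -1 on any error.
 */
static int wait_for_wrid(struct ibv_cq *cq, uint64_t wrid)
{
    struct ibv_wc wc;

    for (;;) {
        int n = ibv_poll_cq(cq, 1, &wc);
        if (n < 0) {
            fprintf(stderr, "ibv_poll_cq failed\n");
            return -1;
        }
        if (n == 0) {
            /* nothing completed yet; production code might block on a
               completion channel instead of spinning */
            continue;
        }
        if (wc.status != IBV_WC_SUCCESS) {
            fprintf(stderr, "completion error: %s\n",
                    ibv_wc_status_str(wc.status));
            return -1;
        }
        if (wc.wr_id == wrid) {
            return 0;   /* the completion we were waiting for */
        }
        /* some other work request (e.g. an earlier RDMA write) finished;
           keep polling */
    }
}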



