From: Michael R. Hines
Subject: Re: [Qemu-devel] [Qemu-stable] [PATCH] rdma: fix multiple VMs parallel migration
Date: Tue, 24 Sep 2013 13:59:32 -0400
User-agent: Mozilla/5.0 (X11; Linux i686; rv:17.0) Gecko/20130329 Thunderbird/17.0.5

On 09/04/2013 04:10 AM, Frank Yang wrote:
On 2013-9-3 13:03, Lei Li wrote:
Hi Frank,

I failed to apply this patch. Please make sure to use git-send-email, otherwise
it's a little hard to review. :)

On 08/30/2013 08:39 PM, Frank Yang wrote:
When several VMs migrate over RDMA at the same time, the increased pressure
can cause packet loss, leaving the source and destination waiting for each
other; as a result, some of the VMs may remain blocked during the migration.

Fix the bug by using two completion queues, for sending and receiving 
respectively.
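
For readers less familiar with the verbs API, here is a minimal, self-contained sketch of the idea; it is not the QEMU code, and the queue depths, helper names (setup_split_cqs, poll_one) and the for_send flag are illustrative assumptions. It creates separate send and receive completion queues, attaches them to one RC queue pair through rdma_create_qp(), and polls only the CQ that matches the class of work being waited on:

#include <string.h>
#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>

/* Illustrative sketch, not QEMU code: split send/recv completion queues. */
static int setup_split_cqs(struct rdma_cm_id *cm_id, struct ibv_pd *pd,
                           struct ibv_comp_channel *comp_chan,
                           struct ibv_cq **send_cq, struct ibv_cq **recv_cq)
{
    struct ibv_qp_init_attr attr;

    /* Two CQs so send/RDMA-write completions and receive completions are
     * drained independently under load.  The depths (128/64) are arbitrary. */
    *send_cq = ibv_create_cq(cm_id->verbs, 128, NULL, comp_chan, 0);
    *recv_cq = ibv_create_cq(cm_id->verbs, 64, NULL, comp_chan, 0);
    if (!*send_cq || !*recv_cq) {
        return -1;
    }

    memset(&attr, 0, sizeof(attr));
    attr.cap.max_send_wr  = 128;
    attr.cap.max_recv_wr  = 3;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.send_cq = *send_cq;    /* completions for send and RDMA write WRs */
    attr.recv_cq = *recv_cq;    /* completions for posted receives */
    attr.qp_type = IBV_QPT_RC;

    return rdma_create_qp(cm_id, pd, &attr);
}

/* Re-arm and poll one completion from the CQ the caller is waiting on. */
static int poll_one(struct ibv_cq *send_cq, struct ibv_cq *recv_cq,
                    int for_send, struct ibv_wc *wc)
{
    struct ibv_cq *cq = for_send ? send_cq : recv_cq;

    if (ibv_req_notify_cq(cq, 0)) {     /* request an event before blocking */
        return -1;
    }
    return ibv_poll_cq(cq, 1, wc);      /* 0 = empty, 1 = one completion */
}

In the patch itself the same split is made in qemu_rdma_alloc_pd_cq() and qemu_rdma_alloc_qp(), and qemu_rdma_poll()/qemu_rdma_block_for_wrid() choose between send_cq and recv_cq based on the wrid_requested argument.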
 From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
From: Frank Yang <address@hidden>
Date: Fri, 30 Aug 2013 17:53:34 +0800
Subject: [PATCH] rdma: fix multiple VMs parallel migration
The commit message should be here within the patch. You can use 'git commit 
--amend'
to add it.
Signed-off-by: Frank Yang <address@hidden>
---
  migration-rdma.c | 57 ++++++++++++++++++++++++++++++++++++--------------------
  1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/migration-rdma.c b/migration-rdma.c
index 3d1266f..d0eacbb 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -362,7 +362,8 @@ typedef struct RDMAContext {
      struct ibv_qp *qp;                      /* queue pair */
      struct ibv_comp_channel *comp_channel;  /* completion channel */
      struct ibv_pd *pd;                      /* protection domain */
-    struct ibv_cq *cq;                      /* completion queue */
+    struct ibv_cq *send_cq;                 /* send completion queue */
+    struct ibv_cq *recv_cq;                 /* receive completion queue */
      /*
       * If a previous write failed (perhaps because of a failed
@@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
       * Completion queue can be filled by both read and write work requests,
       * so must reflect the sum of both possible queue sizes.
       */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
              NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
+            rdma->comp_channel, 0);
+
+    if (!rdma->send_cq || !rdma->recv_cq) {
          fprintf(stderr, "failed to allocate completion queue\n");
          goto err_alloc_pd_cq;
      }
@@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
      attr.cap.max_recv_wr = 3;
      attr.cap.max_send_sge = 1;
      attr.cap.max_recv_sge = 1;
-    attr.send_cq = rdma->cq;
-    attr.recv_cq = rdma->cq;
+    attr.send_cq = rdma->send_cq;
+    attr.recv_cq = rdma->recv_cq;
      attr.qp_type = IBV_QPT_RC;
      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1361,13 +1365,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
   * Return the work request ID that completed.
   */
  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
-                               uint32_t *byte_len)
+                               uint32_t *byte_len, int wrid_requested)
  {
      int ret;
      struct ibv_wc wc;
      uint64_t wr_id;
-    ret = ibv_poll_cq(rdma->cq, 1, &wc);
+    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+        wrid_requested == RDMA_WRID_SEND_CONTROL) {
+        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
+    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
+    }
      if (!ret) {
          *wr_id_out = RDMA_WRID_NONE;
@@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
      void *cq_ctx;
      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
-    if (ibv_req_notify_cq(rdma->cq, 0)) {
-        return -1;
-    }
      /* poll cq first */
      while (wr_id != wrid_requested) {
-        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
          if (ret < 0) {
              return ret;
          }
@@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
      }
      while (1) {
+        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+            wrid_requested == RDMA_WRID_SEND_CONTROL) {
+            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
+                return -1;
+            }
+        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
+                return -1;
+            }
+        }
+
          /*
           * Coroutine doesn't start until process_incoming_migration()
           * so don't yield unless we know we're running inside of a coroutine.
@@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
          num_cq_events++;
-        if (ibv_req_notify_cq(cq, 0)) {
-            goto err_block_for_wrid;
-        }
-
          while (wr_id != wrid_requested) {
-            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
              if (ret < 0) {
                  goto err_block_for_wrid;
              }
@@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
          ibv_destroy_qp(rdma->qp);
          rdma->qp = NULL;
      }
-    if (rdma->cq) {
-        ibv_destroy_cq(rdma->cq);
-        rdma->cq = NULL;
+    if (rdma->send_cq) {
+        ibv_destroy_cq(rdma->send_cq);
+        rdma->send_cq = NULL;
+    }
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
      }
      if (rdma->comp_channel) {
          ibv_destroy_comp_channel(rdma->comp_channel);
@@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
       */
      while (1) {
          uint64_t wr_id, wr_id_in;
-        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
          if (ret < 0) {
              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
              goto err;
--
1.8.3.msysgit.0



Understood. Thank you. The following patch should be fine.

 From 7b7d2c5b51c53c23f7194d35b469dedd892ef89f Mon Sep 17 00:00:00 2001
From: Frank Yang <address@hidden>
Date: Tue, 3 Sep 2013 18:26:54 +0800
Subject: [PATCH] rdma: fix multiple VMs parallel migration

Signed-off-by: Frank Yang <address@hidden>
---
migration-rdma.c | 64 +++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/migration-rdma.c b/migration-rdma.c
index 3d1266f..30f8c11 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -362,7 +362,8 @@ typedef struct RDMAContext {
     struct ibv_qp *qp;                      /* queue pair */
     struct ibv_comp_channel *comp_channel;  /* completion channel */
     struct ibv_pd *pd;                      /* protection domain */
-    struct ibv_cq *cq;                      /* completion queue */
+    struct ibv_cq *send_cq;                 /* send completion queue */
+    struct ibv_cq *recv_cq;                 /* receive completion queue */

     /*
      * If a previous write failed (perhaps because of a failed
@@ -1003,12 +1004,18 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
     }

     /*
-     * Completion queue can be filled by both read and write work requests,
-     * so must reflect the sum of both possible queue sizes.
+     * Create two completion queues for sending and receiving
+     * respectively.
+     * Send completion queue can be filled by both send and
+     * write work requests, so must reflect the sum of both
+     * possible queue sizes.
      */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
             NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
+            rdma->comp_channel, 0);
+
+    if (!rdma->send_cq || !rdma->recv_cq) {
         fprintf(stderr, "failed to allocate completion queue\n");
         goto err_alloc_pd_cq;
     }
@@ -1040,8 +1047,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     attr.cap.max_recv_wr = 3;
     attr.cap.max_send_sge = 1;
     attr.cap.max_recv_sge = 1;
-    attr.send_cq = rdma->cq;
-    attr.recv_cq = rdma->cq;
+    attr.send_cq = rdma->send_cq;
+    attr.recv_cq = rdma->recv_cq;
     attr.qp_type = IBV_QPT_RC;

     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1361,13 +1368,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
  * Return the work request ID that completed.
  */
static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
-                               uint32_t *byte_len)
+                               uint32_t *byte_len, int wrid_requested)
{
     int ret;
     struct ibv_wc wc;
     uint64_t wr_id;

-    ret = ibv_poll_cq(rdma->cq, 1, &wc);
+    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+        wrid_requested == RDMA_WRID_SEND_CONTROL) {
+        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
+    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
+    }

     if (!ret) {
         *wr_id_out = RDMA_WRID_NONE;
@@ -1460,12 +1472,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     void *cq_ctx;
     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

-    if (ibv_req_notify_cq(rdma->cq, 0)) {
-        return -1;
-    }
     /* poll cq first */
     while (wr_id != wrid_requested) {
-        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
         if (ret < 0) {
             return ret;
         }
@@ -1487,6 +1496,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     }

     while (1) {
+        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+        wrid_requested == RDMA_WRID_SEND_CONTROL) {
+        if (ibv_req_notify_cq(rdma->send_cq, 0)) {
+                return -1;
+        }
+        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
+                return -1;
+            }
+        }
+
         /*
          * Coroutine doesn't start until process_incoming_migration()
          * so don't yield unless we know we're running inside of a coroutine.
@@ -1502,12 +1522,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,

         num_cq_events++;

-        if (ibv_req_notify_cq(cq, 0)) {
-            goto err_block_for_wrid;
-        }
-
         while (wr_id != wrid_requested) {
-            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
             if (ret < 0) {
                 goto err_block_for_wrid;
             }
@@ -2236,9 +2252,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
         ibv_destroy_qp(rdma->qp);
         rdma->qp = NULL;
     }
-    if (rdma->cq) {
-        ibv_destroy_cq(rdma->cq);
-        rdma->cq = NULL;
+    if (rdma->send_cq) {
+        ibv_destroy_cq(rdma->send_cq);
+        rdma->send_cq = NULL;
+    }
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
     }
     if (rdma->comp_channel) {
         ibv_destroy_comp_channel(rdma->comp_channel);
@@ -2770,7 +2790,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
      */
     while (1) {
         uint64_t wr_id, wr_id_in;
-        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
         if (ret < 0) {
             fprintf(stderr, "rdma migration: polling error! %d\n", ret);
             goto err;
--
1.8.3.msysgit.0



Sorry, my mistake. Please use this patch instead:

From: Frank Yang <address@hidden>

Signed-off-by: Frank Yang <address@hidden>
---
  migration-rdma.c | 64 +++++++++++++++++++++++++++++++++++++-------------------
  1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/migration-rdma.c b/migration-rdma.c
index 3d1266f..f3206c4 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -362,7 +362,8 @@ typedef struct RDMAContext {
      struct ibv_qp *qp;                      /* queue pair */
      struct ibv_comp_channel *comp_channel;  /* completion channel */
      struct ibv_pd *pd;                      /* protection domain */
-    struct ibv_cq *cq;                      /* completion queue */
+    struct ibv_cq *send_cq;                 /* send completion queue */
+    struct ibv_cq *recv_cq;                 /* receive completion queue */

      /*
       * If a previous write failed (perhaps because of a failed
@@ -1003,12 +1004,18 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
      }

      /*
-     * Completion queue can be filled by both read and write work requests,
-     * so must reflect the sum of both possible queue sizes.
+     * Create two completion queues for sending and receiving
+     * respectively.
+     * Send completion queue can be filled by both send and
+     * write work requests, so must reflect the sum of both
+     * possible queue sizes.
       */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
              NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
+            rdma->comp_channel, 0);
+
+    if (!rdma->send_cq || !rdma->recv_cq) {
          fprintf(stderr, "failed to allocate completion queue\n");
          goto err_alloc_pd_cq;
      }
@@ -1040,8 +1047,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
      attr.cap.max_recv_wr = 3;
      attr.cap.max_send_sge = 1;
      attr.cap.max_recv_sge = 1;
-    attr.send_cq = rdma->cq;
-    attr.recv_cq = rdma->cq;
+    attr.send_cq = rdma->send_cq;
+    attr.recv_cq = rdma->recv_cq;
      attr.qp_type = IBV_QPT_RC;

      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1361,13 +1368,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
   * Return the work request ID that completed.
   */
  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
-                               uint32_t *byte_len)
+                               uint32_t *byte_len, int wrid_requested)
  {
      int ret;
      struct ibv_wc wc;
      uint64_t wr_id;

-    ret = ibv_poll_cq(rdma->cq, 1, &wc);
+    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+        wrid_requested == RDMA_WRID_SEND_CONTROL) {
+        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
+    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
+    }

      if (!ret) {
          *wr_id_out = RDMA_WRID_NONE;
@@ -1460,12 +1472,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
      void *cq_ctx;
      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

-    if (ibv_req_notify_cq(rdma->cq, 0)) {
-        return -1;
-    }
      /* poll cq first */
      while (wr_id != wrid_requested) {
-        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
          if (ret < 0) {
              return ret;
          }
@@ -1487,6 +1496,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
      }

      while (1) {
+        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+            wrid_requested == RDMA_WRID_SEND_CONTROL) {
+            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
+                return -1;
+            }
+        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
+                return -1;
+            }
+        }
+
          /*
           * Coroutine doesn't start until process_incoming_migration()
           * so don't yield unless we know we're running inside of a coroutine.
@@ -1502,12 +1522,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,

          num_cq_events++;

-        if (ibv_req_notify_cq(cq, 0)) {
-            goto err_block_for_wrid;
-        }
-
          while (wr_id != wrid_requested) {
-            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
              if (ret < 0) {
                  goto err_block_for_wrid;
              }
@@ -2236,9 +2252,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
          ibv_destroy_qp(rdma->qp);
          rdma->qp = NULL;
      }
-    if (rdma->cq) {
-        ibv_destroy_cq(rdma->cq);
-        rdma->cq = NULL;
+    if (rdma->send_cq) {
+        ibv_destroy_cq(rdma->send_cq);
+        rdma->send_cq = NULL;
+    }
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
      }
      if (rdma->comp_channel) {
          ibv_destroy_comp_channel(rdma->comp_channel);
@@ -2770,7 +2790,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
       */
      while (1) {
          uint64_t wr_id, wr_id_in;
-        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
          if (ret < 0) {
              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
              goto err;

This seems like a harmless enough patch.

Please create a properly-formatted V2 patch and resend it to the mailing list.

- Michael



