qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriverSt


From: Liu Yuan
Subject: [Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriverState
Date: Mon, 1 Sep 2014 15:43:12 +0800

This allow VM continues to process even if some devices are broken meanwhile
with proper configuration.

We mark the device broken when the protocol tier notify back some broken
state(s) of the device, such as diconnection via driver operations. We could
also reset the device as sound when the protocol tier is repaired.

Origianlly .threshold controls how we should decide the success of read/write
and return the failure only if the success count of read/write is less than
.threshold specified by users. But it doesn't record the states of underlying
states and will impact performance a bit in some cases.

For example, we have 3 children and .threshold is set 2. If one of the devices
broken, we should still return success and continue to run VM. But for every
IO operations, we will blindly send the requests to the broken device.

To store broken state into driver state we can save requests to borken devices
and resend the requests to the repaired ones by setting broken as false.

This is especially useful for network based protocol such as sheepdog, which
has a auto-reconnection mechanism and will never report EIO if the connection
is broken but just store the requests to its local queue and wait for resending.
Without broken state, quorum request will not come back until the connection is
re-established. So we have to skip the broken deivces to allow VM to continue
running with networked backed child (glusterfs, nfs, sheepdog, etc).

With the combination of read-pattern and threshold, we can easily mimic the DRVD
behavior with following configuration:

 read-pattern=fifo,threshold=1 will two children.

Cc: Eric Blake <address@hidden>
Cc: Benoit Canet <address@hidden>
Cc: Kevin Wolf <address@hidden>
Cc: Stefan Hajnoczi <address@hidden>
Signed-off-by: Liu Yuan <address@hidden>
---
 block/quorum.c            | 102 ++++++++++++++++++++++++++++++++++++++--------
 include/block/block_int.h |   3 ++
 2 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/block/quorum.c b/block/quorum.c
index b9eeda3..7b07e35 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -120,6 +120,7 @@ struct QuorumAIOCB {
     int rewrite_count;          /* number of replica to rewrite: count down to
                                  * zero once writes are fired
                                  */
+    int issued_count;           /* actual read&write issued count */
 
     QuorumVotes votes;
 
@@ -170,8 +171,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
     if (acb->is_read) {
         /* on the quorum case acb->child_iter == s->num_children - 1 */
         for (i = 0; i <= acb->child_iter; i++) {
-            qemu_vfree(acb->qcrs[i].buf);
-            qemu_iovec_destroy(&acb->qcrs[i].qiov);
+            if (acb->qcrs[i].buf) {
+                qemu_vfree(acb->qcrs[i].buf);
+                qemu_iovec_destroy(&acb->qcrs[i].qiov);
+            }
         }
     }
 
@@ -207,6 +210,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
     acb->count = 0;
     acb->success_count = 0;
     acb->rewrite_count = 0;
+    acb->issued_count = 0;
     acb->votes.compare = quorum_sha256_compare;
     QLIST_INIT(&acb->votes.vote_list);
     acb->is_read = false;
@@ -286,6 +290,22 @@ static void quorum_copy_qiov(QEMUIOVector *dest, 
QEMUIOVector *source)
     }
 }
 
+static int next_fifo_child(QuorumAIOCB *acb)
+{
+    BDRVQuorumState *s = acb->common.bs->opaque;
+    int i;
+
+    for (i = acb->child_iter; i < s->num_children; i++) {
+        if (!s->bs[i]->broken) {
+            break;
+        }
+    }
+    if (i == s->num_children) {
+        return -1;
+    }
+    return i;
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
     QuorumChildRequest *sacb = opaque;
@@ -293,11 +313,18 @@ static void quorum_aio_cb(void *opaque, int ret)
     BDRVQuorumState *s = acb->common.bs->opaque;
     bool rewrite = false;
 
+    if (ret < 0) {
+        s->bs[acb->child_iter]->broken = true;
+    }
+
     if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
         /* We try to read next child in FIFO order if we fail to read */
-        if (ret < 0 && ++acb->child_iter < s->num_children) {
-            read_fifo_child(acb);
-            return;
+        if (ret < 0) {
+            acb->child_iter = next_fifo_child(acb);
+            if (acb->child_iter > 0) {
+                read_fifo_child(acb);
+                return;
+            }
         }
 
         if (ret == 0) {
@@ -315,9 +342,9 @@ static void quorum_aio_cb(void *opaque, int ret)
     } else {
         quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret);
     }
-    assert(acb->count <= s->num_children);
-    assert(acb->success_count <= s->num_children);
-    if (acb->count < s->num_children) {
+    assert(acb->count <= acb->issued_count);
+    assert(acb->success_count <= acb->issued_count);
+    if (acb->count < acb->issued_count) {
         return;
     }
 
@@ -647,22 +674,46 @@ free_exit:
     return rewrite;
 }
 
+static bool has_enough_children(BDRVQuorumState *s)
+{
+    int good = 0, i;
+
+    for (i = 0; i < s->num_children; i++) {
+        if (s->bs[i]->broken) {
+            continue;
+        }
+        good++;
+    }
+
+    if (good >= s->threshold) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
 static BlockDriverAIOCB *read_quorum_children(QuorumAIOCB *acb)
 {
     BDRVQuorumState *s = acb->common.bs->opaque;
     int i;
 
+    if (!has_enough_children(s)) {
+        quorum_aio_release(acb);
+        return NULL;
+    }
+
     for (i = 0; i < s->num_children; i++) {
+        if (s->bs[i]->broken) {
+            continue;
+        }
         acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
         qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
         qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
-    }
-
-    for (i = 0; i < s->num_children; i++) {
         acb->qcrs[i].aiocb = bdrv_aio_readv(s->bs[i], acb->sector_num,
                                             &acb->qcrs[i].qiov,
                                             acb->nb_sectors, quorum_aio_cb,
                                             &acb->qcrs[i]);
+        acb->issued_count++;
     }
 
     return &acb->common;
@@ -679,7 +730,7 @@ static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb)
     acb->qcrs[i].aiocb = bdrv_aio_readv(s->bs[i], acb->sector_num,
                                         &acb->qcrs[i].qiov, acb->nb_sectors,
                                         quorum_aio_cb, &acb->qcrs[i]);
-
+    acb->issued_count = 1;
     return &acb->common;
 }
 
@@ -693,6 +744,7 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState 
*bs,
     BDRVQuorumState *s = bs->opaque;
     QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
                                       nb_sectors, cb, opaque);
+
     acb->is_read = true;
 
     if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
@@ -701,6 +753,11 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState 
*bs,
     }
 
     acb->child_iter = 0;
+    acb->child_iter = next_fifo_child(acb);
+    if (acb->child_iter < 0) {
+        quorum_aio_release(acb);
+        return NULL;
+    }
     return read_fifo_child(acb);
 }
 
@@ -716,10 +773,19 @@ static BlockDriverAIOCB 
*quorum_aio_writev(BlockDriverState *bs,
                                       cb, opaque);
     int i;
 
+    if (!has_enough_children(s)) {
+        quorum_aio_release(acb);
+        return NULL;
+    }
+
     for (i = 0; i < s->num_children; i++) {
+        if (s->bs[i]->broken) {
+            continue;
+        }
         acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov,
                                              nb_sectors, &quorum_aio_cb,
                                              &acb->qcrs[i]);
+        acb->issued_count++;
     }
 
     return &acb->common;
@@ -926,6 +992,12 @@ static int quorum_open(BlockDriverState *bs, QDict 
*options, int flags,
     }
 
     s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
+    /* and validate it against s->num_children */
+    ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+    if (ret < 0) {
+        goto exit;
+    }
+
     ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
     if (ret < 0) {
         error_setg(&local_err, "Please set read-pattern as fifo or quorum\n");
@@ -934,12 +1006,6 @@ static int quorum_open(BlockDriverState *bs, QDict 
*options, int flags,
     s->read_pattern = ret;
 
     if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
-        /* and validate it against s->num_children */
-        ret = quorum_valid_threshold(s->threshold, s->num_children, 
&local_err);
-        if (ret < 0) {
-            goto exit;
-        }
-
         /* is the driver in blkverify mode */
         if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
             s->num_children == 2 && s->threshold == 2) {
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 9fdec7f..599110b 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -402,6 +402,9 @@ struct BlockDriverState {
 
     /* The error object in use for blocking operations on backing_hd */
     Error *backing_blocker;
+
+    /* True if the associated device is broken */
+    bool broken;
 };
 
 int get_tmp_filename(char *filename, int size);
-- 
1.9.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]