qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] sheepdog: fix overlapping metadata update


From: Liu Yuan
Subject: [Qemu-devel] [PATCH] sheepdog: fix overlapping metadata update
Date: Wed, 29 Jul 2015 12:02:35 +0800

From: Liu Yuan <address@hidden>

Current sheepdog driver use a range update_inode(min_idx, max_idx) for batching
the updates. But there is subtle problem by determining min_idx and max_idx:

for a single create request, min_idx == max_idx, so actually we just update one
one bit as expected.

Suppose we have 2 create request, create(10) and create(20), then min == 10,
max==20 even though we just need to update index 10 and index 20, 
update_inode(10,20)
will actually update range from 10 to 20. This would work if all the 
update_inode()
requests won't overlap. But unfortunately, this is not true for some corner 
case.
So the problem arise as following:

req 1: update_inode(10,20)
req 2: update_inode(15,22)

req 1 and req 2 might have different value between [15,20] and cause problems
and can be illustrated as following by adding a printf in sd_write_done:

@@ -1976,6 +1976,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)

     mn = s->min_dirty_data_idx;
     mx = s->max_dirty_data_idx;
+    printf("min %u, max %u\n", mn, mx);
     if (mn <= mx) {
         /* we need to update the vdi object. */
         offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +

...
min 4294967295, max 0
min 9221, max 9222
min 9224, max 9728
min 9223, max 9223
min 9729, max 9730
min 9731, max 9732
min 9733, max 9734
min 9736, max 10240
min 9735, max 10241
...

Every line represents a update_inode(min, max) last 2 lines show 2 requests
actually overlap while I ran mkfs.ext4 on a sheepdog volume. The overlapped
requests might be reordered via network and cause inode[idx] back to 0 after
being set by last request. Then a wrong remove request will be executed by sheep
internally and accidentally remove a victim object, which is reported at:

https://bugs.launchpad.net/sheepdog-project/+bug/1456421

The fix is simple that we just update inode one by one for aio_req. Since
aio_req is never overlapped, we'll have inode update never overlapped.

Cc: Jeff Cody <address@hidden>
Cc: Kevin Wolf <address@hidden>
Cc: Stefan Hajnoczi <address@hidden>
Cc: Teruaki Ishizaki <address@hidden>
Cc: Vasiliy Tolstov <address@hidden>
Cc: Hitoshi Mitake <address@hidden>
Signed-off-by: Liu Yuan <address@hidden>
---
 block/sheepdog.c | 77 ++++++++++++++++----------------------------------------
 1 file changed, 22 insertions(+), 55 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index bd7cbed..d1eeb81 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -342,9 +342,6 @@ typedef struct BDRVSheepdogState {
 
     SheepdogInode inode;
 
-    uint32_t min_dirty_data_idx;
-    uint32_t max_dirty_data_idx;
-
     char name[SD_MAX_VDI_LEN];
     bool is_snapshot;
     uint32_t cache_flags;
@@ -782,6 +779,26 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
     }
 }
 
+static void  update_inode(BDRVSheepdogState *s, AIOReq *aio_req)
+{
+    struct iovec iov;
+    uint32_t offset, data_len;
+    SheepdogAIOCB *acb = aio_req->aiocb;
+    int idx = data_oid_to_idx(aio_req->oid);
+
+    offset = SD_INODE_HEADER_SIZE + sizeof(uint32_t) * idx;
+    data_len = sizeof(uint32_t);
+
+    iov.iov_base = &s->inode;
+    iov.iov_len = sizeof(s->inode);
+    aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
+                            data_len, offset, 0, false, 0, offset);
+    QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+    add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
+
+    return;
+}
+
 /*
  * Receive responses of the I/O requests.
  *
@@ -820,25 +837,15 @@ static void coroutine_fn aio_read_response(void *opaque)
 
     switch (acb->aiocb_type) {
     case AIOCB_WRITE_UDATA:
-        /* this coroutine context is no longer suitable for co_recv
-         * because we may send data to update vdi objects */
-        s->co_recv = NULL;
         if (!is_data_obj(aio_req->oid)) {
             break;
         }
         idx = data_oid_to_idx(aio_req->oid);
 
         if (aio_req->create) {
-            /*
-             * If the object is newly created one, we need to update
-             * the vdi object (metadata object).  min_dirty_data_idx
-             * and max_dirty_data_idx are changed to include updated
-             * index between them.
-             */
             if (rsp.result == SD_RES_SUCCESS) {
                 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
-                s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
-                s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+                update_inode(s, aio_req);
             }
             /*
              * Some requests may be blocked because simultaneous
@@ -1518,8 +1525,6 @@ static int sd_open(BlockDriverState *bs, QDict *options, 
int flags,
     }
 
     memcpy(&s->inode, buf, sizeof(s->inode));
-    s->min_dirty_data_idx = UINT32_MAX;
-    s->max_dirty_data_idx = 0;
 
     bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
     pstrcpy(s->name, sizeof(s->name), vdi);
@@ -1962,44 +1967,6 @@ static int sd_truncate(BlockDriverState *bs, int64_t 
offset)
     return ret;
 }
 
-/*
- * This function is called after writing data objects.  If we need to
- * update metadata, this sends a write request to the vdi object.
- * Otherwise, this switches back to sd_co_readv/writev.
- */
-static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
-{
-    BDRVSheepdogState *s = acb->common.bs->opaque;
-    struct iovec iov;
-    AIOReq *aio_req;
-    uint32_t offset, data_len, mn, mx;
-
-    mn = s->min_dirty_data_idx;
-    mx = s->max_dirty_data_idx;
-    if (mn <= mx) {
-        /* we need to update the vdi object. */
-        offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
-            mn * sizeof(s->inode.data_vdi_id[0]);
-        data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
-
-        s->min_dirty_data_idx = UINT32_MAX;
-        s->max_dirty_data_idx = 0;
-
-        iov.iov_base = &s->inode;
-        iov.iov_len = sizeof(s->inode);
-        aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
-                                data_len, offset, 0, false, 0, offset);
-        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-        add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
-
-        acb->aio_done_func = sd_finish_aiocb;
-        acb->aiocb_type = AIOCB_WRITE_UDATA;
-        return;
-    }
-
-    sd_finish_aiocb(acb);
-}
-
 /* Delete current working VDI on the snapshot chain */
 static bool sd_delete(BDRVSheepdogState *s)
 {
@@ -2231,7 +2198,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState 
*bs, int64_t sector_num,
     }
 
     acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
-    acb->aio_done_func = sd_write_done;
+    acb->aio_done_func = sd_finish_aiocb;
     acb->aiocb_type = AIOCB_WRITE_UDATA;
 
     ret = sd_co_rw_vector(acb);
-- 
1.9.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]