qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] qcow2/virtio corruption: Don't allocate the same cluster twice


From: Kevin Wolf
Subject: [Qemu-devel] [PATCH] qcow2/virtio corruption: Don't allocate the same cluster twice
Date: Wed, 6 May 2009 18:39:10 +0200

A write on a qcow2 image that results in the allocation of a new cluster can be
divided into two parts: There is a part which happens before the actual data is
written, this is about allocating clusters in the image file. The second part
happens in the AIO callback handler and is about making L2 entries for the
newly allocated clusters.

When two AIO requests happen to touch the same free cluster, there is a chance
that the second request still sees the cluster as free because the callback of
the first request has not yet run. This means that it reserves another cluster
for the same virtual hard disk offset and hooks it up in its own callback,
overwriting what the first callback has done. Long story cut short: Bad Things
happen.

This patch maintains a list of in-flight requests that have allocated new
clusters. A second request touching the same cluster doesn't find an entry yet
in the L2 table, but can look it up in the list now. The second request is
limited so that it either doesn't touch the allocation of the first request
(so it can have a non-overlapping allocation) or it doesn't exceed the
end of the allocation of the first request (so it can reuse this allocation
and doesn't need to do anything itself).

Signed-off-by: Kevin Wolf <address@hidden>
---
 block-qcow2.c |   40 ++++++++++++++++++++++++++++++++++++++++
 1 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/block-qcow2.c b/block-qcow2.c
index 1f33125..d78d1e7 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -140,6 +140,7 @@ typedef struct BDRVQcowState {
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
     uint64_t cluster_cache_offset;
+    LIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs;
 
     uint64_t *refcount_table;
     uint64_t refcount_table_offset;
@@ -351,6 +352,8 @@ static int qcow_open(BlockDriverState *bs, const char 
*filename, int flags)
     if (refcount_init(bs) < 0)
         goto fail;
 
+    LIST_INIT(&s->cluster_allocs);
+
     /* read qcow2 extensions */
     if (header.backing_file_offset)
         ext_end = header.backing_file_offset;
@@ -953,9 +956,11 @@ static uint64_t 
alloc_compressed_cluster_offset(BlockDriverState *bs,
 typedef struct QCowL2Meta
 {
     uint64_t offset;
+    uint64_t cluster_offset;
     int n_start;
     int nb_available;
     int nb_clusters;
+    LIST_ENTRY(QCowL2Meta) next;
 } QCowL2Meta;
 
 static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
@@ -1007,6 +1012,9 @@ static int alloc_cluster_link_l2(BlockDriverState *bs, 
uint64_t cluster_offset,
     for (i = 0; i < j; i++)
         free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1);
 
+    /* Take the request off the list of running requests */
+    LIST_REMOVE(m, next);
+
     ret = 0;
 err:
     qemu_free(old_cluster);
@@ -1035,6 +1043,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
     int l2_index, ret;
     uint64_t l2_offset, *l2_table, cluster_offset;
     int nb_clusters, i = 0;
+    QCowL2Meta *old_alloc;
 
     ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
     if (ret == 0)
@@ -1083,6 +1092,36 @@ static uint64_t alloc_cluster_offset(BlockDriverState 
*bs,
     }
     nb_clusters = i;
 
+    /*
+     * Check if there already is an AIO write request in flight which allocates
+     * the same cluster. In this case take the cluster_offset which was
+     * allocated for the previous write.
+     */
+    LIST_FOREACH(old_alloc, &s->cluster_allocs, next) {
+
+        uint64_t end_offset = offset + nb_clusters * s->cluster_size;
+        uint64_t old_offset = old_alloc->offset;
+        uint64_t old_end_offset = old_alloc->offset +
+            old_alloc->nb_clusters * s->cluster_size;
+
+        if (end_offset < old_offset || offset > old_end_offset) {
+            /* No intersection */
+        } else if (offset < old_offset) {
+            /* Stop at the start of a running allocation */
+            nb_clusters = (old_offset - offset) >> s->cluster_bits;
+        } else {
+            /* Reuse the previously allocated clusters */
+            if (end_offset > old_end_offset) {
+                nb_clusters = (old_end_offset - offset) >> s->cluster_bits;
+            }
+            cluster_offset = old_alloc->cluster_offset + (offset - old_offset);
+            m->nb_clusters = 0;
+            goto out;
+        }
+    }
+
+    LIST_INSERT_HEAD(&s->cluster_allocs, m, next);
+
     /* allocate a new cluster */
 
     cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
@@ -1091,6 +1130,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
     m->offset = offset;
     m->n_start = n_start;
     m->nb_clusters = nb_clusters;
+    m->cluster_offset = cluster_offset;
 
 out:
     m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
-- 
1.6.0.6





reply via email to

[Prev in Thread] Current Thread [Next in Thread]