qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v4 1/2] qcow2: Add qcow2_shrink_l1_and_l2_table


From: Max Reitz
Subject: Re: [Qemu-devel] [PATCH v4 1/2] qcow2: Add qcow2_shrink_l1_and_l2_table for qcow2 shrinking
Date: Mon, 20 Oct 2014 13:47:28 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.1.1

On 2014-10-14 at 19:53, Jun Li wrote:
This patch is the realization of new function qcow2_shrink_l1_and_l2_table.
This function will shrink/discard the l1 and l2 tables when doing qcow2 shrinking.

Signed-off-by: Jun Li <address@hidden>
---
v4:
   Add handling of COW clusters in the l2 table. When using COW, some values of
(l2_entry >> s->cluster_bits) will be larger than s->refcount_table_size, so this
l2_entry needs to be discarded.
v3:
   Fixed host cluster leak.
---
  block/qcow2-cluster.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++
  block/qcow2.c         |  40 +++++++++--
  block/qcow2.h         |   2 +
  3 files changed, 224 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index f7dd8c0..0664b8a 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -29,6 +29,9 @@
  #include "block/qcow2.h"
  #include "trace.h"
+static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
+                   uint64_t **l2_table);
+
  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                          bool exact_size)
  {
@@ -135,6 +138,189 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t 
min_size,
      return ret;
  }
+int qcow2_shrink_l1_and_l2_table(BlockDriverState *bs, uint64_t new_l1_size,
+                                 int new_l2_index, bool exact_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int new_l1_size2, ret, i;
+    uint64_t *new_l1_table;
+    int64_t new_l1_table_offset;
+    int64_t old_l1_table_offset, old_l1_size;
+    uint8_t data[12];
+
+    new_l1_size2 = sizeof(uint64_t) * new_l1_size;

new_l1_size is uint64_t, new_l1_size2 is int. Not a good idea.

+    new_l1_table = qemu_try_blockalign(bs->file,
+                                       align_offset(new_l1_size2, 512));

As long as by 512 you mean the sector size, s/512/BDRV_SECTOR_SIZE/.

+    if (new_l1_table == NULL) {
+        return -ENOMEM;
+    }
+    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));

Same here.

+
+    /* shrinking l1 table */
+    memcpy(new_l1_table, s->l1_table, new_l1_size2);

Please add an assert(new_l1_size <= s->l1_size) before this.

+
+    /* write new table (align to cluster) */
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);

Well, it's not really growing, but fine.

+    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
+    if (new_l1_table_offset < 0) {
+        qemu_vfree(new_l1_table);
+        return new_l1_table_offset;
+    }
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* the L1 position has not yet been updated, so these clusters must
+     * indeed be completely free */
+    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
+                                        new_l1_size2);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
+
+    for (i = 0; i < new_l1_size; i++) {
+        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);

You could have used cpu_to_be64s(&new_l1_table[i]) here.

+    }
+
+    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
+                           new_l1_table, new_l1_size2);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    for (i = 0; i < new_l1_size; i++) {
+        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

Again, be64_to_cpus() is an alternative.

+    }
+
+    /* set new table */
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
+    cpu_to_be32w((uint32_t *)data, new_l1_size);
+    stq_be_p(data + 4, new_l1_table_offset);

Why not cpu_to_be64w()? I find it easier to read and it definitely fits in better.

+    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
+                           data, sizeof(data));
+    if (ret < 0) {
+        goto fail;
+    }
+
+    old_l1_table_offset = s->l1_table_offset;
+    s->l1_table_offset = new_l1_table_offset;
+    uint64_t *old_l1_table = s->l1_table;

Declarations have to be at the start of the block.

+    s->l1_table = new_l1_table;
+    old_l1_size = s->l1_size;
+    s->l1_size = new_l1_size;
+
+    int num = old_l1_size - s->l1_size;

Same here.

+
+    while (num >= 0) {
+        uint64_t l2_offset;
+        int ret;
+        uint64_t *l2_table, l2_entry;
+        int last_free_cluster = 0;
+
+        l2_offset = old_l1_table[s->l1_size + num - 1] & L1E_OFFSET_MASK;
+        if (l2_offset == 0) {
+            goto retry;
+        }
+
+        if (num == 0) {
+            if (new_l2_index == 0) {
+                goto retry;
+            }
+            last_free_cluster = new_l2_index + 1;
+        }
+
+        /* load l2_table into cache */
+        ret = l2_load(bs, l2_offset, &l2_table);
+
+        if (ret < 0) {
+            goto fail;

Too bad we're freeing new_l1_table in fail. We should be freeing old_l1_table.

+        }
+
+        for (i = s->l2_size; i > 0; i--) {
+            l2_entry = be64_to_cpu(l2_table[i - 1]);

Okay... I'd probably have gone with for (i = s->l2_size - 1; i >= 0; i--) and then l2_entry = be64_to_cpu(l2_table[i]).

+
+            /* Deal with COW clusters in l2 table */
+            if ((i <= last_free_cluster)) {

I'd like some comments on what last_free_cluster is. Also, the inner parentheses should be removed.

+                if ((l2_entry >> s->cluster_bits) >
+                 s->refcount_table_size) {

Misalignment.

+                    goto lable1;

It's "label", and "label1" is not a very helpful name.

+                }
+                continue;
+            }
+
+            switch (qcow2_get_cluster_type(l2_entry)) {
+            case QCOW2_CLUSTER_UNALLOCATED:
+                if (!bs->backing_hd) {
+                    continue;
+                }
+                break;
+
+            case QCOW2_CLUSTER_ZERO:
+                continue;
+
+            case QCOW2_CLUSTER_NORMAL:
+            case QCOW2_CLUSTER_COMPRESSED:
+                break;
+
+            default:
+                abort();
+            }
+
+        lable1:
+            qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+
+            uint64_t data64;
+            if (s->qcow_version >= 3) {
+                l2_table[i - 1] = cpu_to_be64(QCOW_OFLAG_ZERO);
+                data64 = cpu_to_be64(QCOW_OFLAG_ZERO);
+            } else {
+                l2_table[i - 1] = cpu_to_be64(0);
+                data64 = cpu_to_be64(0);
+            }
+
+            /* before discard the specify item of l2_entry,
+             * set this entry with 0.
+             */

Why?

+            ret = bdrv_pwrite_sync(bs->file,
+                                   l2_offset + (i-1) * sizeof(uint64_t),
+                                   &data64, sizeof(data64));

Why aren't you just going through the cache?

+            if (ret < 0) {
+                goto fail;
+            }
+
+            /* Then decrease the refcount */
+            qcow2_free_any_clusters(bs, l2_entry, 1, QCOW2_DISCARD_MAX);
+        }

So I think what this code tried to achieve was freeing all data clusters referenced from L2 tables which are about to be dropped. Since this will change the data in these clusters anyway (and these clusters are not guest-accessible any more, so we don't have to care), just mark all of these L2 entries unallocated, don't bother with making them zero clusters.

Simply marking the image dirty would be much easier, however. The problem is that this doesn't work for compat=0.10. I'd be fine (and by that, I mean I'd like it) with supporting shrinking only for compat=1.1 images.

+
+        ret = qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+        if (ret < 0) {
+            goto fail;
+        }
+        if (last_free_cluster == 0) {
+            qemu_vfree(l2_table);
+            qcow2_free_clusters(bs, l2_offset, s->cluster_size - 1,
+                                QCOW2_DISCARD_OTHER);
+        }
+    retry:
+        num--;
+    }
+
+    qemu_vfree(old_l1_table);
+
+    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * 
sizeof(uint64_t),
+                        QCOW2_DISCARD_OTHER);
+    return 0;
+ fail:
+    qemu_vfree(new_l1_table);
+    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
+                        QCOW2_DISCARD_OTHER);
+    return ret;
+}
+

So, all in all, I'd strongly propose just not supporting shrinking for compat=0.10; and for compat=1.1, we can simply mark the image dirty and then shrink the L1 table.

  /*
   * l2_load
   *
diff --git a/block/qcow2.c b/block/qcow2.c
index fb28493..bed1558 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2097,7 +2097,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t 
offset)
      int64_t new_l1_size;
      int ret;
- if (offset & 511) {
+    if (offset <= 0 || offset & 511) {
          error_report("The new size must be a multiple of 512");

This error message is inappropriate for offset <= 0.

I think an assert(offset > 0) should be fine, too. bdrv_truncate() may not be called with a negative offset, as far as I know.

          return -EINVAL;
      }
@@ -2108,10 +2108,42 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t 
offset)
          return -ENOTSUP;
      }
- /* shrinking is currently not supported */
+    /* shrinking image */
      if (offset < bs->total_sectors * 512) {
-        error_report("qcow2 doesn't support shrinking images yet");
-        return -ENOTSUP;
+

Spurious empty line?

+        /* As l1 table, l2 table, refcount table, refcount block table
+         * and file header of the qcow2 image need to use some clusters,
+         * so should subtract these metadata from offset.
+         */
+        int64_t nb_l1 = (s->l1_size >> 3) >> s->cluster_bits;

s->l1_size is the number of entries in the L1 table. s->cluster_bits can be used to convert a number of bytes to a number of clusters. I guess what you want is (s->l1_size << 3) >> s->cluster_bits. But since s->l1_size << 3 does not have to be a multiple of the cluster size, I think DIV_ROUND_UP(s->l1_size * sizeof(uint64_t), s->cluster_size) may be best (or DIV_ROUND_UP((uint64_t)s->l1_size * sizeof(uint64_t), s->cluster_size), since s->l1_size is only an int).

+        int64_t nb_l2 = offset / (s->l2_size << s->cluster_bits);

This should be DIV_ROUND_UP(offset, s->l2_size << s->cluster_bits). Maybe even DIV_ROUND_UP(offset, (uint64_t)s->l2_size << s->cluster_bits) since s->l2_size is only an int.

+        int64_t nb_refcount_block_table = offset / ((s->cluster_size >>
+                                          REFCOUNT_SHIFT) << s->cluster_bits);
+        int64_t nb_refcount_table = (nb_refcount_block_table << 3) >>
+                                     s->cluster_bits;

The number of refblocks and the reftable size required does not depend on the offset, which is the guest-visible image size. Instead, it depends on the host image size. Since the refblock count and the reftable size directly influence the host image size, this is not a trivial matter. See qcow2_create2() in the "if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC)" block for an example. Note that that function is not necessarily correct (a truly correct function for this should not be using floating point calculation, at the very least), but it was enough for that case. In this case however, we should do it right if we want to shrink the refcount table and reduce the number of refcount blocks.

+        int64_t tatol_nb = nb_l2 + nb_l1 + nb_refcount_block_table +

Should be "total_nb".

+                           nb_refcount_table + 1;
+        int64_t offset_for_shrink = offset - (tatol_nb << s->cluster_bits);

offset is not the host image size but the guest disk size. It should be "file_size = offset + (total_nb << s->cluster_bits)" or something like that.

+        int new_l2_index = offset_to_l2_index(s, offset_for_shrink);

This would then be s/offset_for_shrink/offset/. Well, at least it should be. It would not be, because this new_l2_index is something strange, probably it's supposed to be the number of L2 indices to be culled. It shouldn't be. I think you want to pass the number of L2 indices to keep to qcow2_shrink_l1_and_l2_table(), that is, offset_to_l2_index(s, offset).

+
+        new_l1_size = size_to_l1(s, offset_for_shrink);

s/offset_for_shrink/offset/.

+        ret = qcow2_shrink_l1_and_l2_table(bs, new_l1_size, new_l2_index, 
true);
+        if (ret < 0) {
+            return ret;
+        }
+
+        int64_t actual_size = bdrv_get_allocated_file_size(bs);
+
+        if (offset < actual_size) {

Again, offset is the guest disk size. With the file_size calculation above, what you probably want is "if (file_size < actual_size)".

+            int fd = qemu_open(bs->filename, O_WRONLY,
+                               O_APPEND);
+            if (fd >= 0) {
+                if (ftruncate(fd, offset) < 0) {

This should then be file_size instead of offset as well. Also, please don't use qemu_open() and ftruncate(), but bdrv_truncate() on bs->file.

+                    return -errno;
+                }
+                qemu_close(fd);
+            }
+        }
      }
new_l1_size = size_to_l1(s, offset);
diff --git a/block/qcow2.h b/block/qcow2.h
index 7d61e61..a9c8acb 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -516,6 +516,8 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int 
ign, int64_t offset,
  /* qcow2-cluster.c functions */
  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                          bool exact_size);
+int qcow2_shrink_l1_and_l2_table(BlockDriverState *bs, uint64_t new_l1_size,
+                                 int new_l2_index, bool exact_size);
  int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
  void qcow2_l2_cache_reset(BlockDriverState *bs);
  int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);

Max



reply via email to

[Prev in Thread] Current Thread [Next in Thread]