[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [partial RFC 2/2] qcow2: Deduplicati on write mechanism.
From: |
Benoît Canet |
Subject: |
[Qemu-devel] [partial RFC 2/2] qcow2: Deduplicati on write mechanism. |
Date: |
Thu, 27 Sep 2012 16:29:58 +0200 |
---
block/Makefile.objs | 2 +-
block/qcow2-dedup.c | 368 +++++++++++++++++++++++++++++++++++++++++++++++++++
block/qcow2.c | 74 ++++++++++-
block/qcow2.h | 40 +++++-
4 files changed, 481 insertions(+), 3 deletions(-)
create mode 100644 block/qcow2-dedup.c
diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d3..37a8aab 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,5 +1,5 @@
block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o
vvfat.o
-block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
qcow2-cache.o
+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
qcow2-cache.o qcow2-dedup.o
block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-obj-y += qed-check.o
block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block/qcow2-dedup.c b/block/qcow2-dedup.c
new file mode 100644
index 0000000..12ea96f
--- /dev/null
+++ b/block/qcow2-dedup.c
@@ -0,0 +1,368 @@
+/*
+ * Deduplication for the QCOW2 format
+ *
+ * Copyright (C) Nodalink, SARL. 2012
+ *
+ * Author:
+ * Benoît Canet <address@hidden>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block_int.h"
+#include "qemu-common.h"
+#include "qcow2.h"
+#include <openssl/sha.h>
+#include <openssl/evp.h>
+
+static int qcow2_dedup_insert_in_rb_tree(BlockDriverState *bs,
+ uint8_t *hash,
+ int64_t cluster_offset)
+{
+ return 0;
+}
+
+static int64_t qcow2_dedup_rb_tree_lookup(BlockDriverState *bs, uint8_t *hash)
+{
+ return 0;
+}
+
+static void qcow2_dedup_rb_tree_remove(BlockDriverState *bs, uint8_t *hash)
+{
+}
+
+static uint8_t *qcow2_get_dedup_cluster_hash(BlockDriverState *bs,
+ uint8_t *data)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint8_t *hash = g_malloc0(SHA256_DIGEST_LENGTH);
+ EVP_Digest(data, s->cluster_sectors << 9,
+ hash, NULL, EVP_sha256(), NULL);
+ return hash;
+}
+
+static int qcow2_dedup_write_hash(BlockDriverState *bs,
+ uint8_t *hash,
+ int64_t cluster_offset)
+{
+ return 0;
+}
+
+static uint8_t *qcow2_dedup_read_hash(BlockDriverState *bs,
+ uint64_t cluster_offset)
+{
+ return 0;
+}
+
+static int qcow2_dedup_inc_refcount(BlockDriverState *bs,
+ uint64_t cluster_offset)
+{
+ return 0;
+}
+
+static int qcow2_dedup_link_l2(BlockDriverState *bs, uint64_t cluster_offset)
+{
+ return 0;
+}
+
+static int qcow2_read_missing_dedup_cluster_data(BlockDriverState *bs,
+ uint8_t *data,
+ uint64_t sector,
+ int nr)
+{
+ QEMUIOVector qiov;
+ struct iovec iov;
+ int ret;
+
+ iov.iov_len = nr * BDRV_SECTOR_SIZE;
+ iov.iov_base = data;
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ ret = bs->drv->bdrv_co_readv(bs, sector, nr, &qiov);
+ if (ret < 0) {
+ error_report("failed to read %d sectors at offset %" PRIu64 "\n",
+ nr, sector);
+ }
+
+ return ret;
+}
+
+/*
+ * This function prepare a buffer containing all the required data
+ * in order to compute hashes and look for duplicate clusters.
+ * It read the missing data before and after the requested qiov
+ * and concatenate with the qiov content.
+ */
+int qcow2_dedup_read_missing_and_concatenate(BlockDriverState *bs,
+ QEMUIOVector *qiov,
+ uint64_t sector,
+ int remaining_sectors,
+ uint8_t *dedup_cluster_data,
+ int *dedup_cluster_data_nr)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+ uint64_t dedup_cluster_beginning_sector;
+ uint64_t dedup_cluster_ending_sector;
+ int dedup_cluster_beginning_nr;
+ int dedup_cluster_ending_nr;
+
+ /* compute how much and where to read at the beginning */
+ dedup_cluster_beginning_nr = sector & (s->cluster_sectors - 1);
+ dedup_cluster_beginning_sector = sector - dedup_cluster_beginning_nr;
+
+ /* same for the end */
+ dedup_cluster_ending_sector = sector + remaining_sectors;
+ dedup_cluster_ending_nr = s->cluster_sectors -
+ (dedup_cluster_ending_sector &
+ (s->cluster_sectors - 1));
+
+ /* compute total size in sectors and allocate memory */
+ *dedup_cluster_data_nr = dedup_cluster_beginning_nr +
+ dedup_cluster_ending_nr +
+ remaining_sectors;
+ dedup_cluster_data = qemu_blockalign(bs, *dedup_cluster_data_nr *
+ BDRV_SECTOR_SIZE);
+
+ /* read beginning */
+ ret = qcow2_read_missing_dedup_cluster_data(bs,
+ dedup_cluster_data,
+ dedup_cluster_beginning_sector,
+ dedup_cluster_beginning_nr);
+
+ if (ret < 0) {
+ goto fail_exit;
+ }
+
+ /* append qiov content */
+ qemu_iovec_to_buf(qiov, 0, dedup_cluster_data +
+ dedup_cluster_beginning_nr * BDRV_SECTOR_SIZE,
+ qiov->size);
+
+ /* read and add ending */
+ ret = qcow2_read_missing_dedup_cluster_data(bs,
+ dedup_cluster_data +
+ (dedup_cluster_beginning_nr + qiov->size) * BDRV_SECTOR_SIZE,
+ dedup_cluster_ending_sector,
+ dedup_cluster_ending_nr);
+
+ if (ret < 0) {
+ goto fail_exit;
+ }
+
+ return 0;
+
+fail_exit:
+ qemu_vfree(dedup_cluster_data);
+ return ret;
+}
+
+/* this function lookup the physical offset of a cluster
+ * after having computed it's hash.
+ * a precomputed hash can be used to do the lookup if hash != NULL
+ */
+static int64_t qcow2_dedup_lookup_hash(BlockDriverState *bs,
+ uint8_t *data,
+ int cluster_offset,
+ uint8_t **hash)
+{
+ if (!(*hash)) {
+ *hash = qcow2_get_dedup_cluster_hash(bs,
+ data + (cluster_offset << 9));
+ }
+ return qcow2_dedup_rb_tree_lookup(bs, *hash);
+}
+
+static QCowDedupHash *qcow2_get_dedup_hash(uint8_t *hash)
+{
+ QCowDedupHash *dedup_hash;
+ dedup_hash = g_new0(QCowDedupHash, 1);
+ dedup_hash->hash = hash;
+ return dedup_hash;
+}
+
+static void qcow2_put_dedup_hash(QCowDedupHash *dedup_hash)
+{
+ g_free(dedup_hash->hash);
+ g_free(dedup_hash);
+}
+
+/* this function tries to deduplicate a given cluster,
+ * it returns a true on success and store a QCowDedupHash
+ * structure on dedup_hash.
+ * it use a precomputed hash if hash != NULL.
+ */
+static bool qcow2_dedup_cluster(BlockDriverState *bs,
+ uint8_t *data,
+ int64_t sector_num,
+ int skip_before_dedup_clusters_nr,
+ QCowDedupHash *dedup_hash,
+ uint8_t **hash)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t deduped_cluster_offset;
+ uint64_t to_free_cluster_offset;
+ int64_t cluster_offset;
+ int ret;
+ int pnum = s->cluster_sectors;
+ dedup_hash = NULL;
+
+ cluster_offset = sector_num & (s->cluster_sectors - 1);
+ deduped_cluster_offset = qcow2_dedup_lookup_hash(bs,
+ data,
+ skip_before_dedup_clusters_nr,
+ hash);
+ /* duplicated cluster found */
+ if (deduped_cluster_offset != -1) {
+ /* fixme : handle error
+ * is the order right for the transaction to be
+ * atomic ?
+ */
+ qcow2_dedup_inc_refcount(bs, deduped_cluster_offset);
+ qcow2_dedup_link_l2(bs, deduped_cluster_offset);
+
+ /* If we are changing an existing physical cluster by a deduped
+ * one decrease its refcount.
+ */
+ ret = qcow2_get_cluster_offset(bs, cluster_offset << 9,
+ &pnum, &to_free_cluster_offset);
+ if (ret < 0) {
+ free(*hash);
+ return false;
+ }
+ if (to_free_cluster_offset) {
+ *hash = qcow2_dedup_read_hash(bs, to_free_cluster_offset);
+ qcow2_dedup_rb_tree_remove(bs, *hash);
+ g_free(hash);
+ qcow2_free_clusters(bs, to_free_cluster_offset,
+ s->cluster_sectors << 9);
+ }
+ }
+ dedup_hash = qcow2_get_dedup_hash(*hash);
+ *hash = NULL;
+ return deduped_cluster_offset == -1 ? false : true;
+}
+
+int qcow2_dedup(BlockDriverState *bs,
+ uint8_t *dedup_cluster_data,
+ int dedup_cluster_data_nr,
+ int64_t sector_num,
+ int remaining_sectors,
+ int *next_non_dedupable_sectors_nr,
+ int *skip_before_dedup_clusters_nr,
+ uint8_t **next_call_first_hash)
+{
+ BDRVQcowState *s = bs->opaque;
+ int deduped_clusters_nr = 0;
+ int left_to_test_clusters_nr;
+ int dedup_cluster_beginning_nr;
+ uint8_t *hash = NULL;
+ QCowDedupHash *non_duplicated_dedup_hash = NULL;
+ /* should already be zero when entering this function */
+ *next_non_dedupable_sectors_nr = 0;
+
+ dedup_cluster_beginning_nr = sector_num & (s->cluster_sectors - 1);
+
+ left_to_test_clusters_nr = dedup_cluster_data_nr -
+ *skip_before_dedup_clusters_nr;
+
+ /* Deduplicate all that can be */
+ while (left_to_test_clusters_nr-- &&
+ qcow2_dedup_cluster(bs,
+ dedup_cluster_data,
+ sector_num,
+ (*skip_before_dedup_clusters_nr)++,
+ non_duplicated_dedup_hash,
+ next_call_first_hash)) {
+ deduped_clusters_nr++;
+ }
+
+ /* We consumed the precomputed hash -> set it to NULL */
+ *next_call_first_hash = NULL;
+
+ /* We deduped everything till the end */
+ if (!non_duplicated_dedup_hash) {
+ *next_non_dedupable_sectors_nr = 0;
+ goto exit;
+ }
+
+ /* Memorize the hash of the first non duplicated cluster.
+ * we will store it when writing the cluster to disk.
+ */
+ QTAILQ_INSERT_TAIL(&s->undedupable_hashes, non_duplicated_dedup_hash,
next);
+ *next_non_dedupable_sectors_nr += s->cluster_sectors;
+
+ /* Count how many non duplicated sector can be written without dedupe
+ * and memorize the hashes for the disk writing code.
+ * We make sure we pass hash == NULL to force computation of the hash.
+ */
+ hash = NULL;
+ while (left_to_test_clusters_nr-- &&
+ -1 == qcow2_dedup_lookup_hash(bs,
+ dedup_cluster_data,
+ (*skip_before_dedup_clusters_nr)++,
+ &hash)) {
+ *next_non_dedupable_sectors_nr += s->cluster_sectors;
+ non_duplicated_dedup_hash = qcow2_get_dedup_hash(hash);
+ QTAILQ_INSERT_TAIL(&s->undedupable_hashes,
+ non_duplicated_dedup_hash, next);
+ hash = NULL;
+ }
+
+ /* We find a duplicated cluster before stopping iterating
+ * store it for next call.
+ */
+ if (non_duplicated_dedup_hash->hash != hash) {
+ *next_call_first_hash = hash;
+ }
+
+exit:
+ if (!deduped_clusters_nr) {
+ return 0;
+ }
+ return deduped_clusters_nr * s->cluster_sectors -
+ dedup_cluster_beginning_nr;
+}
+
+
+/* This function write the hashes of the blocks which are not duplicated
+ */
+void qcow2_dedup_write_new_hashes(BlockDriverState *bs,
+ uint64_t cluster_offset,
+ int count)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowDedupHash *dedup_hash, *next_dedup_hash;
+ int i = 0;
+
+ QTAILQ_FOREACH_SAFE(dedup_hash, &s->undedupable_hashes,
+ next, next_dedup_hash) {
+ qcow2_dedup_write_hash(bs, dedup_hash->hash,
+ cluster_offset + i * s->cluster_sectors);
+ qcow2_dedup_insert_in_rb_tree(bs, dedup_hash->hash,
+ cluster_offset + i * s->cluster_sectors);
+ QTAILQ_REMOVE(&s->undedupable_hashes,
+ dedup_hash,
+ next);
+ qcow2_put_dedup_hash(dedup_hash);
+ i++;
+ if (i == count) {
+ break;
+ }
+ }
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index 8f183f1..2471d9a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -765,6 +765,12 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState
*bs,
QEMUIOVector hd_qiov;
uint64_t bytes_done = 0;
uint8_t *cluster_data = NULL;
+ uint8_t *dedup_cluster_data = NULL;
+ uint8_t *next_call_first_hash;
+ int dedup_cluster_data_nr;
+ int deduped_sectors_nr;
+ int skip_before_dedup_clusters_nr;
+ int next_non_dedupable_sectors_nr;
QCowL2Meta l2meta = {
.nb_clusters = 0,
};
@@ -780,11 +786,66 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState
*bs,
qemu_co_mutex_lock(&s->lock);
+ if (s->has_dedup) {
+ /* if deduplication is on we make sure dedup_cluster_data
+ * contains a multiple of cluster size of data in order
+ * to compute the hashes
+ */
+ ret = qcow2_dedup_read_missing_and_concatenate(bs,
+ qiov,
+ sector_num,
+ remaining_sectors,
+ dedup_cluster_data,
+ &dedup_cluster_data_nr);
+
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ next_non_dedupable_sectors_nr = 0;
+ skip_before_dedup_clusters_nr = 0;
+ next_call_first_hash = NULL;
while (remaining_sectors != 0) {
trace_qcow2_writev_start_part(qemu_coroutine_self());
+
+ if (s->has_dedup && next_non_dedupable_sectors_nr == 0) {
+ /* Try to deduplicate as much cluster as possible */
+ deduped_sectors_nr = qcow2_dedup(bs,
+ dedup_cluster_data,
+ dedup_cluster_data_nr,
+ sector_num,
+ remaining_sectors,
+ &next_non_dedupable_sectors_nr,
+ &skip_before_dedup_clusters_nr,
+ &next_call_first_hash);
+
+ remaining_sectors -= deduped_sectors_nr;
+ sector_num += deduped_sectors_nr;
+ bytes_done += deduped_sectors_nr * 512;
+
+ /* no more data to write -> exit
+ * Can be < 0 because of the presence of sectors we read in
+ * qcow2_read_missing_dedup_sectors_and_concatenate.
+ */
+ if (remaining_sectors < 0) {
+ break;
+ }
+
+ /* if we deduped something trace it */
+ if (deduped_sectors_nr) {
+ trace_qcow2_writev_done_part(qemu_coroutine_self(),
+ deduped_sectors_nr);
+ trace_qcow2_writev_start_part(qemu_coroutine_self());
+ }
+ }
+
index_in_cluster = sector_num & (s->cluster_sectors - 1);
- n_end = index_in_cluster + remaining_sectors;
+ n_end = index_in_cluster +
+ next_non_dedupable_sectors_nr < remaining_sectors ?
+ next_non_dedupable_sectors_nr : remaining_sectors;
+
if (s->crypt_method &&
n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
@@ -826,6 +887,15 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState
*bs,
cur_nr_sectors * 512);
}
+ /* Write the non duplicated clusters hashes to disk
+ * just before writing actual data.
+ */
+ if (s->has_dedup) {
+ qcow2_dedup_write_new_hashes(bs, cluster_offset,
+ l2meta.nb_clusters > 0 ?
+ l2meta.nb_clusters : 1);
+ }
+
BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
qemu_co_mutex_unlock(&s->lock);
trace_qcow2_writev_data(qemu_coroutine_self(),
@@ -845,6 +915,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState
*bs,
run_dependent_requests(s, &l2meta);
+ next_non_dedupable_sectors_nr -= cur_nr_sectors;
remaining_sectors -= cur_nr_sectors;
sector_num += cur_nr_sectors;
bytes_done += cur_nr_sectors * 512;
@@ -859,6 +930,7 @@ fail:
qemu_iovec_destroy(&hd_qiov);
qemu_vfree(cluster_data);
+ qemu_vfree(dedup_cluster_data);
trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
return ret;
diff --git a/block/qcow2.h b/block/qcow2.h
index b4eb654..fd2657b 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -114,8 +114,10 @@ enum {
enum {
QCOW2_INCOMPAT_DIRTY_BITNR = 0,
QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+ QCOW2_INCOMPAT_DEDUP_BITNR = 1,
+ QCOW2_INCOMPAT_DEDUP = 1 << QCOW2_INCOMPAT_DEDUP_BITNR,
- QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY,
+ QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY | QCOW2_INCOMPAT_DEDUP,
};
/* Compatible feature bits */
@@ -132,6 +134,11 @@ typedef struct Qcow2Feature {
char name[46];
} QEMU_PACKED Qcow2Feature;
+typedef struct QCowDedupHash {
+ uint8_t *hash;
+ QTAILQ_ENTRY(QCowDedupHash) next;
+} QCowDedupHash;
+
typedef struct BDRVQcowState {
int cluster_bits;
int cluster_size;
@@ -148,6 +155,7 @@ typedef struct BDRVQcowState {
Qcow2Cache* l2_table_cache;
Qcow2Cache* refcount_block_cache;
+ Qcow2Cache *dedup_block_cache;
uint8_t *cluster_cache;
uint8_t *cluster_data;
@@ -160,6 +168,11 @@ typedef struct BDRVQcowState {
int64_t free_cluster_index;
int64_t free_byte_offset;
+ bool has_dedup;
+ uint64_t *dedup_table;
+ uint64_t dedup_table_offset;
+ uint32_t dedup_table_size;
+
CoMutex lock;
uint32_t crypt_method; /* current crypt method, 0 if no key yet */
@@ -181,6 +194,7 @@ typedef struct BDRVQcowState {
size_t unknown_header_fields_size;
void* unknown_header_fields;
QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext;
+ QTAILQ_HEAD(, QCowDedupHash) undedupable_hashes;
} BDRVQcowState;
/* XXX: use std qcow open function ? */
@@ -333,4 +347,28 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache
*c, uint64_t offset,
void **table);
int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
+/* qcow2-dedup functions */
+int qcow2_dedup_read_missing_and_concatenate(BlockDriverState *bs,
+ QEMUIOVector *qiov,
+ uint64_t sector,
+ int remaining_sectors,
+ uint8_t *dedup_cluster_data,
+ int *dedup_cluster_data_nr);
+
+int qcow2_dedup(BlockDriverState *bs,
+ uint8_t *dedup_cluster_data,
+ int dedup_cluster_data_nr,
+ int64_t sector_num,
+ int remaining_sectors,
+ int *next_non_dedupable_sectors_nr,
+ int *skip_before_dedup_cluster_nr,
+ uint8_t **next_call_first_hash);
+
+void qcow2_dedup_write_new_hashes(BlockDriverState *bs,
+ uint64_t cluster_offset,
+ int count);
+
+void qcow2_dedup_free_cluster(BlockDriverState *bs,
+ uint64_t cluster_offset);
+
#endif
--
1.7.10.4