qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism.


From: Benoît Canet
Subject: Re: [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism.
Date: Mon, 28 Oct 2013 13:31:19 +0100
User-agent: Mutt/1.5.21 (2010-09-15)

Le Friday 04 Oct 2013 à 16:48:12 (+0200), Max Reitz a écrit :
> On 2013-10-02 14:39, Benoît Canet wrote:
> >Use gnutls's SHA-256 to compare versions.
> Wouldn't CRC32 suffice? (I don't really oppose using SHA, but taking
> in gnutls as a dependency just for comparing several memory areas
> seems a bit much to me)

Initially it gzip's addler32 was used but someone was concerned with the risk
of collisions.
Anyway the code fallback using hashes only when something wrong is detected so
it won't impact the normal case.

Best regards

Benoît

> 
> >Signed-off-by: Benoit Canet <address@hidden>
> >---
> >  block/Makefile.objs       |   2 +-
> >  block/quorum.c            | 321 
> > +++++++++++++++++++++++++++++++++++++++++++++-
> >  configure                 |  36 ++++++
> >  include/monitor/monitor.h |   2 +
> >  monitor.c                 |   2 +
> >  5 files changed, 361 insertions(+), 2 deletions(-)
> >
> >diff --git a/block/Makefile.objs b/block/Makefile.objs
> >index 05a65c2..adcdc21 100644
> >--- a/block/Makefile.objs
> >+++ b/block/Makefile.objs
> >@@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o 
> >qcow2-snapshot.o qcow2-c
> >  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
> >  block-obj-y += qed-check.o
> >  block-obj-y += vhdx.o
> >-block-obj-y += quorum.o
> >+block-obj-$(CONFIG_QUORUM) += quorum.o
> >  block-obj-y += parallels.o blkdebug.o blkverify.o
> >  block-obj-y += snapshot.o qapi.o
> >  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
> >diff --git a/block/quorum.c b/block/quorum.c
> >index f0fc0e9..e235ac1 100644
> >--- a/block/quorum.c
> >+++ b/block/quorum.c
> >@@ -13,7 +13,43 @@
> >   * See the COPYING file in the top-level directory.
> >   */
> >+#include <gnutls/gnutls.h>
> >+#include <gnutls/crypto.h>
> >  #include "block/block_int.h"
> >+#include "qapi/qmp/qjson.h"
> >+
> >+#define HASH_LENGTH 32
> >+
> >+/* This union hold a vote hash value */
> *holds
> 
> >+typedef union QuorumVoteValue {
> >+    char h[HASH_LENGTH];       /* SHA-256 hash */
> >+    int64_t l;                 /* simpler 64 bits hash */
> >+} QuorumVoteValue;
> >+
> >+/* A vote item */
> >+typedef struct QuorumVoteItem {
> >+    int index;
> >+    QLIST_ENTRY(QuorumVoteItem) next;
> >+} QuorumVoteItem;
> >+
> >+/* this structure is a vote version. A version is the set of vote sharing 
> >the
> *set of votes
> 
> >+ * same vote value.
> >+ * The set of vote will be tracked with the items field and it's count is
> *set of votes or *vote set; also s/it's count/its cardinality/ or
> something like that
> 
> >+ * vote_count.
> >+ */
> >+typedef struct QuorumVoteVersion {
> >+    QuorumVoteValue value;
> >+    int index;
> >+    int vote_count;
> >+    QLIST_HEAD(, QuorumVoteItem) items;
> >+    QLIST_ENTRY(QuorumVoteVersion) next;
> >+} QuorumVoteVersion;
> >+
> >+/* this structure hold a group of vote versions together */
> *holds
> 
> >+typedef struct QuorumVotes {
> >+    QLIST_HEAD(, QuorumVoteVersion) vote_list;
> >+    int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
> >+} QuorumVotes;
> >  /* the following structure hold the state of one quorum instance */
> >  typedef struct {
> >@@ -60,10 +96,14 @@ struct QuorumAIOCB {
> >      int success_count;          /* number of successfully completed AIOCB 
> > */
> >      bool *finished;             /* completion signal for cancel */
> >+    QuorumVotes votes;
> >+
> >      bool is_read;
> >      int vote_ret;
> >  };
> >+static void quorum_vote(QuorumAIOCB *acb);
> >+
> >  static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> >  {
> >      QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> >@@ -111,6 +151,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> >          acb->aios[i].ret = 0;
> >      }
> >+    if (acb->vote_ret) {
> >+        ret = acb->vote_ret;
> >+    }
> >+
> >      acb->common.cb(acb->common.opaque, ret);
> >      if (acb->finished) {
> >          *acb->finished = true;
> >@@ -122,6 +166,11 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> >      qemu_aio_release(acb);
> >  }
> >+static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
> >+{
> >+    return memcmp(a->h, b->h, HASH_LENGTH);
> >+}
> >+
> >  static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> >                                     BlockDriverState *bs,
> >                                     QEMUIOVector *qiov,
> >@@ -141,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> >      acb->count = 0;
> >      acb->success_count = 0;
> >      acb->finished = NULL;
> >+    acb->votes.compare = quorum_sha256_compare;
> >      acb->is_read = false;
> >      acb->vote_ret = 0;
> >@@ -170,9 +220,278 @@ static void quorum_aio_cb(void *opaque, int ret)
> >          return;
> >      }
> >+    /* Do the vote on read */
> >+    if (acb->is_read) {
> >+        quorum_vote(acb);
> >+    }
> >+
> >      quorum_aio_finalize(acb);
> >  }
> >+static void quorum_report_bad(QuorumAIOCB *acb, int index)
> >+{
> >+    QObject *data;
> >+    data = qobject_from_jsonf("{ 'children-index': %i"
> I'd prefer child-index. Generally, remember that the singular of
> "children" is "child".
> 
> >+                              ", 'sector-num': %" PRId64
> >+                              ", 'sectors-count': %i }",
> >+                              index,
> >+                              acb->sector_num,
> >+                              acb->nb_sectors);
> >+    monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data);
> How about decrementing the refcount of data?
> 
> >+}
> >+
> >+static void quorum_report_failure(QuorumAIOCB *acb)
> >+{
> >+    QObject *data;
> >+    data = qobject_from_jsonf("{ 'sector-num': %" PRId64
> >+                              ", 'sectors-count': %i }",
> >+                              acb->sector_num,
> >+                              acb->nb_sectors);
> >+    monitor_protocol_event(QEVENT_QUORUM_FAILURE, data);
> Same here.
> 
> >+}
> >+
> >+static void quorum_report_bad_versions(QuorumAIOCB *acb,
> >+                                      QuorumVoteValue *value)
> >+{
> >+    QuorumVoteVersion *version;
> >+    QuorumVoteItem *item;
> >+
> >+    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> >+        if (!acb->votes.compare(&version->value, value)) {
> >+            continue;
> >+        }
> >+        QLIST_FOREACH(item, &version->items, next) {
> >+            quorum_report_bad(acb, item->index);
> >+        }
> >+    }
> >+}
> >+
> >+static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
> >+{
> >+    int i;
> >+    assert(dest->niov == source->niov);
> >+    assert(dest->size == source->size);
> >+    for (i = 0; i < source->niov; i++) {
> >+        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
> >+        memcpy(dest->iov[i].iov_base,
> >+               source->iov[i].iov_base,
> >+               source->iov[i].iov_len);
> >+    }
> >+}
> >+
> >+static void quorum_count_vote(QuorumVotes *votes,
> >+                              QuorumVoteValue *value,
> >+                              int index)
> >+{
> >+    QuorumVoteVersion *v = NULL, *version = NULL;
> >+    QuorumVoteItem *item;
> >+
> >+    /* look if we have something with this hash */
> >+    QLIST_FOREACH(v, &votes->vote_list, next) {
> >+        if (!votes->compare(&v->value, value)) {
> >+            version = v;
> >+            break;
> >+        }
> >+    }
> >+
> >+    /* It's a version not yet in the list add it */
> >+    if (!version) {
> >+        version = g_new0(QuorumVoteVersion, 1);
> >+        QLIST_INIT(&version->items);
> >+        memcpy(&version->value, value, sizeof(version->value));
> >+        version->index = index;
> >+        version->vote_count = 0;
> >+        QLIST_INSERT_HEAD(&votes->vote_list, version, next);
> >+    }
> >+
> >+    version->vote_count++;
> >+
> >+    item = g_new0(QuorumVoteItem, 1);
> >+    item->index = index;
> >+    QLIST_INSERT_HEAD(&version->items, item, next);
> >+}
> >+
> >+static void quorum_free_vote_list(QuorumVotes *votes)
> >+{
> >+    QuorumVoteVersion *version, *next_version;
> >+    QuorumVoteItem *item, *next_item;
> >+
> >+    QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
> >+        QLIST_REMOVE(version, next);
> >+        QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
> >+            QLIST_REMOVE(item, next);
> >+            g_free(item);
> >+        }
> >+        g_free(version);
> >+    }
> >+}
> >+
> >+static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue 
> >*hash)
> >+{
> >+    int j, ret;
> >+    gnutls_hash_hd_t dig;
> >+    QEMUIOVector *qiov = &acb->aios[i].qiov;
> >+
> >+    ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
> >+
> >+    if (ret < 0) {
> >+        return ret;
> >+    }
> >+
> >+    for (j = 0; j < qiov->niov; j++) {
> >+        ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len);
> >+        if (ret < 0) {
> >+            return ret;
> gnutls_hash_deinit?
> 
> >+        }
> >+    }
> >+
> >+    gnutls_hash_deinit(dig, (void *) hash);
> >+
> >+    return 0;
> >+}
> >+
> >+static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
> >+{
> >+    int i = 0;
> >+    QuorumVoteVersion *candidate, *winner = NULL;
> >+
> >+    QLIST_FOREACH(candidate, &votes->vote_list, next) {
> >+        if (candidate->vote_count > i) {
> >+            i = candidate->vote_count;
> >+            winner = candidate;
> >+        }
> >+    }
> >+
> >+    return winner;
> >+}
> >+
> >+static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
> >+{
> >+    int i;
> >+    int result;
> >+
> >+    assert(a->niov == b->niov);
> >+    for (i = 0; i < a->niov; i++) {
> >+        assert(a->iov[i].iov_len == b->iov[i].iov_len);
> >+        result = memcmp(a->iov[i].iov_base,
> >+                        b->iov[i].iov_base,
> >+                        a->iov[i].iov_len);
> >+        if (result) {
> >+            return false;
> >+        }
> >+    }
> >+
> >+    return true;
> >+}
> >+
> >+static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb,
> >+                                          const char *fmt, ...)
> >+{
> >+    va_list ap;
> >+
> >+    va_start(ap, fmt);
> >+    fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ",
> >+            acb->sector_num, acb->nb_sectors);
> >+    vfprintf(stderr, fmt, ap);
> >+    fprintf(stderr, "\n");
> >+    va_end(ap);
> >+    exit(1);
> >+}
> >+
> >+static bool quorum_compare(QuorumAIOCB *acb,
> >+                           QEMUIOVector *a,
> >+                           QEMUIOVector *b)
> >+{
> >+    BDRVQuorumState *s = acb->bqs;
> >+    ssize_t offset;
> >+
> >+    /* This driver will replace blkverify in this particular case */
> >+    if (s->is_blkverify) {
> >+        offset = qemu_iovec_compare(a, b);
> >+        if (offset != -1) {
> >+            quorum_err(acb, "contents mismatch in sector %" PRId64,
> >+                       acb->sector_num +
> >+                       (uint64_t)(offset / BDRV_SECTOR_SIZE));
> >+        }
> >+        return true;
> >+    }
> >+
> >+    return quorum_iovec_compare(a, b);
> >+}
> >+
> >+static void quorum_vote(QuorumAIOCB *acb)
> >+{
> >+    bool quorum = true;
> >+    int i, j, ret;
> >+    QuorumVoteValue hash;
> >+    BDRVQuorumState *s = acb->bqs;
> >+    QuorumVoteVersion *winner;
> >+
> >+    /* get the index of the first successful read */
> >+    for (i = 0; i < s->total; i++) {
> >+        if (!acb->aios[i].ret) {
> >+            break;
> >+        }
> >+    }
> >+
> >+    /* compare this read with all other successful read looking for quorum 
> >*/
> >+    for (j = i + 1; j < s->total; j++) {
> >+        if (acb->aios[j].ret) {
> >+            continue;
> >+        }
> >+        quorum = quorum_compare(acb, &acb->aios[i].qiov, 
> >&acb->aios[j].qiov);
> >+        if (!quorum) {
> >+            break;
> >+       }
> >+    }
> >+
> >+    /* Every successful read agrees -> Quorum */
> >+    if (quorum) {
> >+        quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov);
> If no reads were successful at all, quorum will be true and i will
> be s->total. Therefore, this will result in an array access with an
> index out of bounds here.
> 
> Generally, why is this function executed at all if any read failed?
> If a read fails, the quorum read function will return an error, thus
> the caller will ignore the result anyway.
> 
> >+        return;
> >+    }
> >+
> >+    /* compute hashs for each successful read, also store indexes */
> >+    for (i = 0; i < s->total; i++) {
> >+        if (acb->aios[i].ret) {
> >+            continue;
> >+        }
> >+        ret = quorum_compute_hash(acb, i, &hash);
> >+        /* if ever the hash computation failed */
> >+        if (ret < 0) {
> >+            acb->vote_ret = ret;
> >+            goto free_exit;
> >+        }
> >+        quorum_count_vote(&acb->votes, &hash, i);
> >+    }
> >+
> >+    /* vote to select the most represented version */
> >+    winner = quorum_get_vote_winner(&acb->votes);
> >+    /* every vote version are differents -> error */
> >+    if (!winner) {
> >+        quorum_report_failure(acb);
> >+        acb->vote_ret = -EIO;
> >+        goto free_exit;
> >+    }
> >+
> >+    /* if the winner count is smaller then threshold read fail */
> s/then/than/, s/ read fail/, the read fails/
> 
> >+    if (winner->vote_count < s->threshold) {
> >+        quorum_report_failure(acb);
> >+        acb->vote_ret = -EIO;
> >+        goto free_exit;
> >+    }
> >+
> >+    /* we have a winner: copy it */
> >+    quorum_copy_qiov(acb->qiov, &acb->aios[winner->index].qiov);
> >+
> >+    /* some versions are bad print them */
> >+    quorum_report_bad_versions(acb, &winner->value);
> >+
> >+free_exit:
> >+    /* free lists */
> >+    quorum_free_vote_list(&acb->votes);
> >+}
> >+
> >  static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
> >                                           int64_t sector_num,
> >                                           QEMUIOVector *qiov,
> >@@ -194,7 +513,7 @@ static BlockDriverAIOCB 
> >*quorum_aio_readv(BlockDriverState *bs,
> >      }
> >      for (i = 0; i < s->total; i++) {
> >-        bdrv_aio_readv(&s->bs[i], sector_num, qiov, nb_sectors,
> >+        bdrv_aio_readv(&s->bs[i], sector_num, &acb->aios[i].qiov, 
> >nb_sectors,
> >                         quorum_aio_cb, &acb->aios[i]);
> >      }
> >diff --git a/configure b/configure
> >index 23dbaaf..efbe7d9 100755
> >--- a/configure
> >+++ b/configure
> >@@ -246,6 +246,7 @@ gtk=""
> >  gtkabi="2.0"
> >  tpm="no"
> >  libssh2=""
> >+quorum="no"
> >  # parse CC options first
> >  for opt do
> >@@ -972,6 +973,10 @@ for opt do
> >    ;;
> >    --enable-libssh2) libssh2="yes"
> >    ;;
> >+  --disable-quorum) quorum="no"
> >+  ;;
> >+  --enable-quorum) quorum="yes"
> >+  ;;
> >    *) echo "ERROR: unknown option $opt"; show_help="yes"
> >    ;;
> >    esac
> >@@ -1203,6 +1208,8 @@ echo "  --gcov=GCOV              use specified gcov 
> >[$gcov_tool]"
> >  echo "  --enable-tpm             enable TPM support"
> >  echo "  --disable-libssh2        disable ssh block device support"
> >  echo "  --enable-libssh2         enable ssh block device support"
> >+echo "  --disable-quorum         disable quorum block filter support"
> >+echo "  --enable-quorum          enable quorum block filter support"
> >  echo ""
> >  echo "NOTE: The object files are built at the place where configure is 
> > launched"
> >  exit 1
> >@@ -1895,6 +1902,30 @@ EOF
> >  fi
> >  ##########################################
> >+# Quorum probe (check for gnutls)
> >+if test "$quorum" != "no" ; then
> >+cat > $TMPC <<EOF
> >+#include <gnutls/gnutls.h>
> >+#include <gnutls/crypto.h>
> >+int main(void) {char data[4096], digest[32];
> >+gnutls_hash_fast(GNUTLS_DIG_SHA256, data, 4096, digest);
> >+return 0;
> >+}
> >+EOF
> >+quorum_tls_cflags=`$pkg_config --cflags gnutls 2> /dev/null`
> >+quorum_tls_libs=`$pkg_config --libs gnutls 2> /dev/null`
> >+if compile_prog "$quorum_tls_cflags" "$quorum_tls_libs" ; then
> >+  qcow_tls=yes
> >+  libs_softmmu="$quorum_tls_libs $libs_softmmu"
> >+  libs_tools="$quorum_tls_libs $libs_softmmu"
> >+  QEMU_CFLAGS="$QEMU_CFLAGS $quorum_tls_cflags"
> >+else
> >+  echo "gnutls > 2.10.0 required to compile Quorum"
> >+  exit 1
> >+fi
> >+fi
> >+
> >+##########################################
> >  # VNC SASL detection
> >  if test "$vnc" = "yes" -a "$vnc_sasl" != "no" ; then
> >    cat > $TMPC <<EOF
> >@@ -3758,6 +3789,7 @@ echo "TPM support       $tpm"
> >  echo "libssh2 support   $libssh2"
> >  echo "TPM passthrough   $tpm_passthrough"
> >  echo "QOM debugging     $qom_cast_debug"
> >+echo "Quorum            $quorum"
> >  if test "$sdl_too_old" = "yes"; then
> >  echo "-> Your SDL version is too old - please upgrade to have SDL support"
> >@@ -4156,6 +4188,10 @@ if test "$libssh2" = "yes" ; then
> >    echo "CONFIG_LIBSSH2=y" >> $config_host_mak
> >  fi
> >+if test "$quorum" = "yes" ; then
> >+  echo "CONFIG_QUORUM=y" >> $config_host_mak
> >+fi
> >+
> >  if test "$virtio_blk_data_plane" = "yes" ; then
> >    echo 'CONFIG_VIRTIO_BLK_DATA_PLANE=$(CONFIG_VIRTIO)' >> $config_host_mak
> >  fi
> >diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
> >index 10fa0e3..51902ed 100644
> >--- a/include/monitor/monitor.h
> >+++ b/include/monitor/monitor.h
> >@@ -49,6 +49,8 @@ typedef enum MonitorEvent {
> >      QEVENT_SPICE_MIGRATE_COMPLETED,
> >      QEVENT_GUEST_PANICKED,
> >      QEVENT_BLOCK_IMAGE_CORRUPTED,
> >+    QEVENT_QUORUM_FAILURE,
> >+    QEVENT_QUORUM_REPORT_BAD,
> >      /* Add to 'monitor_event_names' array in monitor.c when
> >       * defining new events here */
> >diff --git a/monitor.c b/monitor.c
> >index 74f3f1b..89d139f 100644
> >--- a/monitor.c
> >+++ b/monitor.c
> >@@ -507,6 +507,8 @@ static const char *monitor_event_names[] = {
> >      [QEVENT_SPICE_MIGRATE_COMPLETED] = "SPICE_MIGRATE_COMPLETED",
> >      [QEVENT_GUEST_PANICKED] = "GUEST_PANICKED",
> >      [QEVENT_BLOCK_IMAGE_CORRUPTED] = "BLOCK_IMAGE_CORRUPTED",
> >+    [QEVENT_QUORUM_FAILURE] = "QUORUM_FAILURE",
> >+    [QEVENT_QUORUM_REPORT_BAD] = "QUORUM_REPORT_BAD",
> Could you document these events in docs/qmp/qmp-events.txt?
> 
> Max
> 
> >  };
> >  QEMU_BUILD_BUG_ON(ARRAY_SIZE(monitor_event_names) != QEVENT_MAX)
> 
> 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]