Re: [Qemu-devel] [RFC] qcow2 journalling draft

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [RFC] qcow2 journalling draft

From:	Max Reitz
Subject:	Re: [Qemu-devel] [RFC] qcow2 journalling draft
Date:	Wed, 04 Sep 2013 10:32:54 +0200
User-agent:	Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130805 Thunderbird/17.0.8

On 2013-09-03 15:45, Kevin Wolf wrote:

This contains an extension of the qcow2 spec that introduces journalling
to the image format, plus some preliminary type definitions and
function prototypes in the qcow2 code.

Journalling functionality is a crucial feature for the design of data
deduplication, and it will improve the core part of qcow2 by avoiding
cluster leaks on crashes as well as provide an easier way to get a
reliable implementation of performance features like Delayed COW.

At this point of the RFC, it would be most important to review the
on-disk structure. Once we're confident that it can do everything we
want, we can start going into more detail on the qemu side of things.

Signed-off-by: Kevin Wolf <address@hidden>
---
  block/Makefile.objs   |   2 +-
  block/qcow2-journal.c |  55 ++++++++++++++
  block/qcow2.h         |  78 +++++++++++++++++++
  docs/specs/qcow2.txt  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++-
  4 files changed, 337 insertions(+), 2 deletions(-)
  create mode 100644 block/qcow2-journal.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 3bb85b5..59be314 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,5 +1,5 @@
  block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o 
vpc.o vvfat.o
-block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
qcow2-cache.o
+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
qcow2-cache.o qcow2-journal.o
  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
  block-obj-y += qed-check.o
  block-obj-y += vhdx.o
diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c
new file mode 100644
index 0000000..5b20239
--- /dev/null
+++ b/block/qcow2-journal.c
@@ -0,0 +1,55 @@
+/*
+ * qcow2 journalling functions
+ *
+ * Copyright (c) 2013 Kevin Wolf <address@hidden>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qcow2.h"
+
+#define QCOW2_JOURNAL_MAGIC 0x716a6f75726e616cULL  /* "qjournal" */
+#define QCOW2_JOURNAL_BLOCK_MAGIC 0x716a626b  /* "qjbk" */
+
+typedef struct Qcow2JournalHeader {
+    uint64_t    magic;
+    uint32_t    journal_size;
+    uint32_t    block_size;
+    uint32_t    synced_index;
+    uint32_t    synced_seq;
+    uint32_t    committed_seq;
+    uint32_t    checksum;
+} QEMU_PACKED Qcow2JournalHeader;
+
+/*
+ * One big transaction per journal block. The transaction is committed either
+ * time based or when a microtransaction (single set of operations that must be
+ * performed atomically) doesn't fit in the same block any more.
+ */
+typedef struct Qcow2JournalBlock {
+    uint32_t    magic;
+    uint32_t    checksum;
+    uint32_t    seq;
+    uint32_t    desc_offset; /* Allow block header extensions */
+    uint32_t    desc_bytes;
+    uint32_t    nb_data_blocks;
+} QEMU_PACKED Qcow2JournalBlock;
+

Why is this in the C file...

diff --git a/block/qcow2.h b/block/qcow2.h
index 1000239..2aee1fd 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -157,6 +157,10 @@ typedef struct Qcow2DiscardRegion {
      QTAILQ_ENTRY(Qcow2DiscardRegion) next;
  } Qcow2DiscardRegion;

+typedef struct Qcow2Journal {

+
+} Qcow2Journal;
+
  typedef struct BDRVQcowState {
      int cluster_bits;
      int cluster_size;
@@ -479,4 +483,78 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache 
*c, uint64_t offset,
      void **table);
  int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);

+/* qcow2-journal.c functions */

+
+typedef struct Qcow2JournalTransaction Qcow2JournalTransaction;
+
+enum Qcow2JournalEntryTypeID {
+    QJ_DESC_NOOP    = 0,
+    QJ_DESC_WRITE   = 1,
+    QJ_DESC_COPY    = 2,
+
+    /* required after a cluster is freed and used for other purposes, so that
+     * new (unjournalled) data won't be overwritten with stale metadata */
+    QJ_DESC_REVOKE  = 3,
+};
+
+typedef struct Qcow2JournalEntryType {
+    enum Qcow2JournalEntryTypeID id;
+    int (*sync)(void *buf, size_t size);
+} Qcow2JournalEntryType;
+
+typedef struct Qcow2JournalDesc {
+    uint16_t    type;
+    uint16_t    size;
+} QEMU_PACKED Qcow2JournalDesc;
+
+typedef struct Qcow2JournalDescWrite {
+    Qcow2JournalDesc common;
+    struct {
+        uint32_t length;
+        uint64_t target_offset;
+        uint32_t data_block_index;
+    } write[];
+} QEMU_PACKED Qcow2JournalDescData;
+
+typedef struct Qcow2JournalDescCopy {
+    Qcow2JournalDesc common;
+    struct {
+        uint32_t length;
+        uint64_t target_offset;
+        uint64_t source_offset;
+    } copy[];
+} QEMU_PACKED Qcow2JournalDescCopy;
+
+typedef struct Qcow2JournalRevoke {
+    Qcow2JournalDesc common;
+    struct {
+        uint32_t length;
+        uint64_t target_offset;
+    } revoke[];
+} QEMU_PACKED Qcow2JournalDescRevoke;

And this in the header? All these structures are on-disk, therefore I'd personally move them all into the header or all into the C file.

+
+void qcow2_journal_register_entry_type(Qcow2JournalEntryType *type);
+
+/* When commit_interval seconds have passed since the last commit, or
+ * uncommitted journal data of at least commit_datasize bytes has accumulated
+ * (whatever occurs first), transactions are committed. */
+int qcow2_journal_init(Qcow2Journal **journal, uint64_t start_offset,
+                       int commit_interval, size_t commit_datasize);
+int qcow2_journal_destroy(Qcow2Journal *journal);
+
+/* These functions create microtransactions, i.e. a set of operations that must
+ * be executed atomically. In general, qemu doesn't map this to one qcow2
+ * on-disk transaction (which would leave a lot of space unused), but handles
+ * multiple microtransaction with one on-disk transaction. */
+Qcow2JournalTransaction *qcow2_journal_begin_transaction(Qcow2Journal 
*journal);
+void qcow2_journal_add(Qcow2JournalTransaction *ta, Qcow2JournalDesc *desc);
+void qcow2_journal_end_transaction(Qcow2JournalTransaction *ta);
+
+/* Commits all completed microtransactions (i.e. qcow2_journal_end_transaction
+ * has already been called) */
+int qcow2_journal_commit(Qcow2Journal *journal);
+
+/* Syncs all committed transactions */
+int qcow2_journal_sync(Qcow2Journal *journal);
+
  #endif
diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
index 33eca36..7578a4b 100644
--- a/docs/specs/qcow2.txt
+++ b/docs/specs/qcow2.txt
@@ -85,6 +85,10 @@ in the description of a field.
                                  be written to (unless for regaining
                                  consistency).

+ Bit 2: Journal dirty. A replay of the main journal is

+                                needed in order to regain consistency before
+                                accessing the image.
+
                      Bits 2-63:  Reserved (set to 0)

80 - 87: compatible_features

@@ -103,7 +107,11 @@ in the description of a field.
                      write to an image with unknown auto-clear features if it
                      clears the respective bits from this field first.

- Bits 0-63: Reserved (set to 0)

+                    Bit 0:      Journal valid bit. This bit indicates that the
+                                image contains a valid main journal starting at
+                                journal_offset.

I second Stefan's question here. An application which does not support journaling will not be able to open an image with a dirty journal anyway; and it simply will not make use of the journal if it's clean (but it also won't do any transactions which would require modifying the journal). Therefore, if any feature bit is introduced at all, I'd put it into the compatible section.

+
+                    Bits 1-63:  Reserved (set to 0)

96 - 99: refcount_order

                      Describes the width of a reference count block entry 
(width
@@ -114,6 +122,16 @@ in the description of a field.
                      Length of the header structure in bytes. For version 2
                      images, the length is always assumed to be 72 bytes.

+ 104 - 111: journal_offset

+                    Offset into the image file at which the main image journal
+                    starts. Must be aligned to a cluster boundary. 0 means that
+                    no journal is used.
+
+                    This field is only valid if the journal feature bit in
+                    autoclear_features is set. If the field is invalid or the
+                    header is too short to contain the field, the field is
+                    assumed to be 0 (no journal is used)
+
  Directly after the image header, optional sections called header extensions 
can
  be stored. Each extension has a structure like the following:

@@ -355,3 +373,187 @@ Snapshot table entry:

          variable:   Unique ID string for the snapshot (not null terminated)

variable: Name of the snapshot (not null terminated)

+
+
+== Journal ==
+
+Journals are used to allow safe updates of metadata without impacting
+performance by requiring flushes to order updates to different parts of the
+metadata. They consist of transactions, which in turn contain operations that
+are effectively executed atomically. A qcow2 image can have a main image
+journal that deals with cluster management operations, and additional specific
+journals can be used by other features like data deduplication.
+
+
+As far as the on-disk format is concerned, a transaction is in one of the
+following states:
+
+    Incomplete:     This is the initial state of any transaction, while new
+                    operations can still be added. When opening an image with a
+                    dirty journal, incomplete transactions are discarded.
+
+    Committed:      When all operations that must be performed atomically
+                    during the transaction have been written and are stable on
+                    disk, the transaction can be committed by increasing the
+                    commited sequence number in the journal heder. A
+                    transaction in this state may not be changed. When opening
+                    an image with a dirty image, committed transactions should

s/dirty image/dirty journal/

+                    be replayed.
+
+    Synced:         A transaction is synced if all of its operations have been
+                    performed, all data written is stable on disk, and the
+                    synced sequence number is increased in the journal header.
+                    Synced transactions are no longer needed in the journal and
+                    can be overwritten. They are ignored during replay.
+
+The use of a sequence number implies that transactions are processed
+sequentially and an earlier transaction can never be unsynced/uncommitted if a
+later one is synced/committed.
+
+
+A journal is organised in journal blocks, all of which have a reference count
+of exactly 1. It starts with a block containing the following journal header:
+
+    Byte  0 -  7:   Magic ("qjournal" ASCII string)

Why exactly is there a magic string? To recover the journal if the corresponding header field got corrupted?

+
+          8 - 11:   Journal size in bytes, including the header
+
+         12 - 15:   Journal block size order (block size in bytes = 1 << order)
+                    The block size must be at least 512 bytes and must not
+                    exceed the cluster size.
+
+         16 - 19:   Journal block index of the descriptor for the last
+                    transaction that has been synced, starting with 1 for the
+                    journal block after the header. 0 is used for empty
+                    journals.
+
+         20 - 23:   Sequence number of the last transaction that has been
+                    synced. 0 is recommended as the initial value.
+
+         24 - 27:   Sequence number of the last transaction that has been
+                    committed. When replaying a journal, all transactions
+                    after the last synced one up to the last commit one must be
+                    synced. Note that this may include a wraparound of sequence
+                    numbers.
+
+         28 -  31:  Checksum (one's complement of the sum of all bytes in the
+                    header journal block except those of the checksum field)
+
+         32 - 511:  Reserved (set to 0)
+
+
+The header is followed by journal blocks that are either descriptor or data
+blocks. The block index at byte 16 points to the first valid descriptor, except
+for completely empty journals, where it can be 0. The next descriptor can be
+found by skipping a descriptor and its associated data blocks. When the journal
+size is exceeded, a wraparound occurs, essentially forming a ring buffer.
+
+A wraparound may not occur in the middle of a single transaction, but only
+between two transactions. For the necessary padding an empty descriptor with
+any number of data blocks can be used as the last entry of the ring.
+
+The chain of valid descriptors ends if a descriptor is reached whose sequence
+number isn't the successor of the previous sequence number. This means in
+particular that the journal must be ordered chronologically and has ascending
+sequence numbers (except in the case of a sequence number wraparound).
+All blocks from the end of the descriptor chain until the starting point are
+unused.

So the journal is contiguous on disk including all the descriptors? Is the journal size in this header fixed (and thus the journal will be preallocated) or how will that be achieved?

+
+
+Descriptor blocks describe one transaction each and have the following
+structure:
+
+    Byte  0 -  3:   Magic ("qjbk" ASCII string)
+
+          4 -  7:   Checksum (one's complement of the sum of all bytes in the
+                    descriptor block except those of the checksum field, and
+                    all bytes in the associated data blocks)
+
+          8 - 11:   Sequence number of the transaction
+
+         12 - 15:   Byte offset into the descriptor block at which descriptors
+                    start
+
+         16 - 19:   Total length of descriptors in this block in bytes
+
+         20 - 23:   Number of following data blocks that are associated with
+                    this transaction.
+
+         24 -  n:   (Future extensions)
+
+          n -  m:   Array of descriptors as described below. The exact values
+                    of n and m are determined by the above fields.
+
+All descriptors start with a common part:
+
+    Byte  0 -  1:   Descriptor type
+                        0 - No-op descriptor
+                        1 - Write data block
+                        2 - Copy data
+                        3 - Revoke
+                        4 - Deduplication hash insertion
+                        5 - Deduplication hash deletion
+
+          2 -  3:   Size of the descriptor in bytes
+
+          4 -  n:   Type-specific data
+
+The following section specifies the purpose (i.e. the action that is to be
+performed when syncing) and type-specific data layout of each descriptor type:
+
+  * No-op descriptor: No action is to be performed when syncing this descriptor
+
+          4 -  n:   Ignored
+
+  * Write data block: Write literal data associated with this transaction from
+    the journal to a given offset.
+
+          4 -  7:   Length of the data to write in bytes
+
+          8 - 15:   Offset in the image file to write the data to
+
+         16 - 19:   Index of the journal block at which the data to write
+                    starts. The data must be stored sequentially and be fully
+                    contained in the data blocks associated with the
+                    transaction.
+
+    The type-specific data can be repeated, specifying multiple chunks of data
+    to be written in one operation. This means the size of the descriptor must
+    be 4 + 16 * n.
+
+  * Copy data: Copy data from one offset in the image to another one. This can
+    be used for journalling copy-on-write operations.
+
+          4 -  7:   Length of the data to write in bytes
+
+          8 - 15:   Target offset in the image file
+
+         16 - 23:   Source offset in the image file
+
+    The type-specific data can be repeated, specifying multiple chunks of data
+    to be copied in one operation. This means the size of the descriptor must
+    be 4 + 20 * n.
+
+  * Revoke: Marks operations on a given range in the imag file invalid for all
+    earlier transactions (this does not include the transaction containing the
+    revoke). They must not be executed on a sync operation (e.g. because the
+    range in question has been freed and may have been reused for other, not
+    journalled data structures that must not be overwritten with stale data).
+    Note that this may mean that operations are to be executed partially.
+
+          4 -  7:   Length of the range in bytes
+
+          8 - 15:   Offset of the range in the image file
+
+    The type-specific data can be repeated, specifying multiple ranges for
+    which operations should be revoked. This means the size of the descriptor
+    must be 4 + 12 * n.
+
+  * Deduplication hash insertion: Associates a hash value with a cluster.
+
+    TODO
+
+  * Deduplication hash deletion: Marks a hash value invalid (e.g. because the
+    hashed data has changed)
+
+    TODO

Max

[Prev in Thread]

Current Thread

[Next in Thread]

Re: [Qemu-devel] [RFC] qcow2 journalling draft, (continued)
- Re: [Qemu-devel] [RFC] qcow2 journalling draft, Max Reitz <=
  - Re: [Qemu-devel] [RFC] qcow2 journalling draft, Kevin Wolf, 2013/09/04
- Re: [Qemu-devel] [RFC] qcow2 journalling draft, Stefan Hajnoczi, 2013/09/05
  - Re: [Qemu-devel] [RFC] qcow2 journalling draft, Kevin Wolf, 2013/09/05
    - Re: [Qemu-devel] [RFC] qcow2 journalling draft, Benoît Canet, 2013/09/05
- Re: [Qemu-devel] [RFC] qcow2 journalling draft, Fam Zheng, 2013/09/06

Prev by Date: Re: [Qemu-devel] [PATCH v2 05/10] raven: set a correct PCI I/O memory region
Next by Date: Re: [Qemu-devel] [PATCH] ne2000: mark I/O as LITTLE_ENDIAN
Previous by thread: Re: [Qemu-devel] [RFC] qcow2 journalling draft
Next by thread: Re: [Qemu-devel] [RFC] qcow2 journalling draft
Index(es):
- Date
- Thread