qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 13/14] raw-posix: Implement image locking


From: Fam Zheng
Subject: [Qemu-devel] [PATCH 13/14] raw-posix: Implement image locking
Date: Mon, 31 Oct 2016 23:38:33 +0800

This implements image locking that is sensitive to the open flags, for the
local file and host device protocols.

virtlockd in libvirt locks the first byte, so we start looking at the
file bytes from 1.

Quoting what was proposed by Kevin Wolf <address@hidden>, there are
four locking modes by combining two bits (BDRV_O_RDWR and
BDRV_O_SHARE_RW), and implemented by taking two locks:

Lock bytes:

* byte 1: I can't allow other processes to write to the image
* byte 2: I am writing to the image

Lock modes:

* shared writer (BDRV_O_RDWR | BDRV_O_SHARE_RW): Take shared lock on
  byte 2. Test whether byte 1 is locked using an exclusive lock, and
  fail if so.

* exclusive writer (BDRV_O_RDWR only): Take shared lock on byte 2. Test
  whether byte 1 is locked using an exclusive lock, and fail if so. Then
  take shared lock on byte 1. I suppose this is racy, but we can
  probably tolerate that.

* reader that can tolerate writers (BDRV_O_SHARE_RW only): Don't do anything

* reader that can't tolerate writers (neither bit is set): Take shared
  lock on byte 1. Test whether byte 2 is locked, and fail if so.

The complication is in the transactional reopen.  To make the reopen
logic manageable, and allow better reuse, the code is internally
organized with a table mapping each old mode to the new one.

Signed-off-by: Fam Zheng <address@hidden>
---
 block/raw-posix.c | 710 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 660 insertions(+), 50 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 7c62fc3..07ab117 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -131,8 +131,44 @@ do { \
 
 #define MAX_BLOCKSIZE  4096
 
+/* Posix file locking bytes. Libvirt takes byte 0, so start from byte 1. */
+#define RAW_LOCK_BYTE_MIN 1
+/* Locked by an opener that cannot allow other processes to write. */
+#define RAW_LOCK_BYTE_NO_OTHER_WRITER 1
+/* Locked by an opener that is writing to the image. */
+#define RAW_LOCK_BYTE_WRITE     2
+#define RAW_LOCK_BYTE_MAX 2
+
+/*
+ ** shared writer: Take shared lock on byte 2. Test whether byte 1 is
+ *  locked using an exclusive lock, and fail if so.
+ *
+ ** exclusive writer: Take shared lock on byte 2. Test whether byte 1 is
+ *  locked using an exclusive lock, and fail if so. Then take shared lock
+ *  on byte 1. I suppose this is racy, but we can probably tolerate that.
+ *
+ ** reader that can tolerate writers: Don't do anything
+ *
+ ** reader that can't tolerate writers: Take shared lock on byte 1. Test
+ *  whether byte 2 is locked, and fail if so.
+ */
+
+/* Locking mode of an image, selected from the BDRV_O_RDWR and
+ * BDRV_O_SHARE_RW open flags by raw_get_lock_mode(). */
+typedef enum {
+    /* Read only and accept other writers. */
+    RAW_L_READ_SHARE_RW,
+    /* Read only and try to forbid other writers. */
+    RAW_L_READ,
+    /* Read write and accept other writers. */
+    RAW_L_WRITE_SHARE_RW,
+    /* Read write and try to forbid other writers. */
+    RAW_L_WRITE,
+} BDRVRawLockMode;
+
 typedef struct BDRVRawState {
     int fd;
+    /* A dup of @fd to make manipulating lock easier, especially during reopen,
+     * where this will accept BDRVRawReopenState.lock_fd. */
+    int lock_fd;
+    bool disable_lock;
+    bool lock_on_invalidate;
     int type;
     int open_flags;
     size_t buf_align;
@@ -146,10 +182,13 @@ typedef struct BDRVRawState {
     bool use_linux_aio:1;
     bool has_fallocate;
     bool needs_alignment;
+    BDRVRawLockMode cur_lock_mode;
 } BDRVRawState;
 
 typedef struct BDRVRawReopenState {
+    /* Descriptor obtained for the reopened image in raw_reopen_prepare(). */
     int fd;
+    /* A dup of @fd used for acquiring lock. */
+    int lock_fd;
+    /* Open flags computed by raw_parse_flags() for the reopen. */
     int open_flags;
 } BDRVRawReopenState;
 
@@ -368,6 +407,77 @@ static void raw_parse_flags(int bdrv_flags, int 
*open_flags)
     }
 }
 
+/**
+ * Bring the lock bytes on @fd into the state required by @mode.
+ *
+ * RAW_L_READ_SHARE_RW: release both lock bytes.
+ * RAW_L_READ: take a shared lock on the "no other writer" byte, then probe
+ *     the write byte with an exclusive lock and release it again.
+ * RAW_L_WRITE_SHARE_RW: take a shared lock on the write byte, then probe the
+ *     "no other writer" byte with an exclusive lock and release it again.
+ * RAW_L_WRITE: take a shared lock on the write byte, probe the "no other
+ *     writer" byte with an exclusive lock and downgrade it to a shared lock.
+ *
+ * Returns: 0 on success. On failure @errp is set, an attempt is made to
+ * release both lock bytes, and the negated errno of the failing step is
+ * returned.
+ */
+static int raw_lock_fd(int fd, BDRVRawLockMode mode, Error **errp)
+{
+    int ret;
+    int saved_errno;
+
+    assert(fd >= 0);
+    /* Locking byte 1 avoids interfering with virtlockd. */
+    switch (mode) {
+    case RAW_L_READ_SHARE_RW:
+        ret = qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN,
+                             RAW_LOCK_BYTE_MAX - RAW_LOCK_BYTE_MIN + 1);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to unlock fd");
+            goto fail;
+        }
+        break;
+    case RAW_L_READ:
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock share byte");
+            goto fail;
+        }
+        /* Probe only: succeeds iff nobody holds the write byte. */
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock write byte exclusively");
+            goto fail;
+        }
+        qemu_unlock_fd(fd, RAW_LOCK_BYTE_WRITE, 1);
+        break;
+    case RAW_L_WRITE_SHARE_RW:
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock write byte");
+            goto fail;
+        }
+        /* Probe only: succeeds iff nobody forbids other writers. */
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock share byte exclusively");
+            goto fail;
+        }
+        ret = qemu_unlock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to unlock share byte");
+            goto fail;
+        }
+        break;
+    case RAW_L_WRITE:
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock write byte");
+            goto fail;
+        }
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock share byte exclusively");
+            goto fail;
+        }
+        /* Keep the byte locked, but only in shared mode. */
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to downgrade share byte");
+            goto fail;
+        }
+        break;
+    default:
+        abort();
+    }
+    return 0;
+fail:
+    /* The cleanup calls below may overwrite errno; remember the real cause
+     * of the failure before making them. */
+    saved_errno = errno;
+    qemu_unlock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1);
+    qemu_unlock_fd(fd, RAW_LOCK_BYTE_WRITE, 1);
+    return -saved_errno;
+}
+
 static void raw_parse_filename(const char *filename, QDict *options,
                                Error **errp)
 {
@@ -393,10 +503,115 @@ static QemuOptsList raw_runtime_opts = {
             .type = QEMU_OPT_STRING,
             .help = "host AIO implementation (threads, native)",
         },
+        {
+            .name = "disable-lock",
+            .type = QEMU_OPT_BOOL,
+            .help = "don't lock the file",
+        },
         { /* end of list */ }
     },
 };
 
+/* Map the BDRV_O_RDWR / BDRV_O_SHARE_RW bits of @flags to a lock mode. */
+static BDRVRawLockMode raw_get_lock_mode(int flags)
+{
+    bool writable = (flags & BDRV_O_RDWR) != 0;
+    bool share_rw = (flags & BDRV_O_SHARE_RW) != 0;
+
+    if (writable) {
+        return share_rw ? RAW_L_WRITE_SHARE_RW : RAW_L_WRITE;
+    }
+    return share_rw ? RAW_L_READ_SHARE_RW : RAW_L_READ;
+}
+
+/**
+ * Obtain a new descriptor for the file behind @fd that carries @new_flags.
+ *
+ * When @old_flags and @new_flags differ only in bits that can be changed on
+ * an open descriptor, @fd is duplicated and the flags are adjusted on the
+ * copy. Otherwise, or when that attempt fails, @filename is opened again.
+ *
+ * Returns: the new descriptor, or a negative value with @errp set when the
+ * file had to be reopened and that failed.
+ */
+static int raw_dup_flags(int fd, int old_flags, int new_flags,
+                         const char *filename, Error **errp)
+{
+    const char *path = filename;
+    int new_fd;
+    int rc;
+    int settable = O_APPEND | O_NONBLOCK;
+#ifdef O_NOATIME
+    settable |= O_NOATIME;
+#endif
+
+#ifdef O_ASYNC
+    /* Not all operating systems have O_ASYNC, and those that don't
+     * will not let us track the state into rs->open_flags (typically
+     * you achieve the same effect with an ioctl, for example I_SETSIG
+     * on Solaris). But we do not use O_ASYNC, so that's fine.
+     */
+    assert((old_flags & O_ASYNC) == 0);
+#endif
+
+    /* Cheap path: only fcntl-settable bits changed, so a dup will do. */
+    if (((new_flags ^ old_flags) & ~settable) == 0) {
+        new_fd = qemu_dup(fd);
+        if (new_fd >= 0) {
+            if (!fcntl_setfl(new_fd, new_flags)) {
+                return new_fd;
+            }
+            qemu_close(new_fd);
+        }
+    }
+
+    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
+    rc = raw_normalize_devicepath(&path);
+    if (rc < 0) {
+        error_setg_errno(errp, -rc, "Could not normalize device path");
+        return rc;
+    }
+
+    assert(!(new_flags & O_CREAT));
+    new_fd = qemu_open(path, new_flags);
+    if (new_fd == -1) {
+        error_setg_errno(errp, errno, "Could not open file with new flags");
+        return -errno;
+    }
+    return new_fd;
+}
+
+/**
+ * Acquire the image lock that matches @bdrv_flags for @bs.
+ *
+ * Locking is skipped (and 0 is returned) when it is disabled, when the image
+ * is opened with BDRV_O_INACTIVE (locking is then deferred by setting
+ * lock_on_invalidate), when a read-only image is not writable by this
+ * process, or when the resulting mode is RAW_L_READ_SHARE_RW.
+ *
+ * On success the lock descriptor is stored in s->lock_fd and the mode in
+ * s->cur_lock_mode.
+ *
+ * Returns: 0 on success, a negative value with @errp set on failure. On
+ * failure no lock descriptor is left open.
+ */
+static int raw_lock_image(BlockDriverState *bs, int bdrv_flags, Error **errp)
+{
+    int ret;
+    int lock_flags;
+    BDRVRawState *s = bs->opaque;
+    BDRVRawLockMode lock_mode;
+
+    if (bdrv_flags & BDRV_O_INACTIVE) {
+        s->disable_lock = true;
+        s->lock_on_invalidate = true;
+    }
+    if (s->disable_lock) {
+        return 0;
+    }
+
+    lock_mode = raw_get_lock_mode(bdrv_flags);
+    if (!(bdrv_flags & BDRV_O_RDWR) && access(bs->filename, W_OK) != 0) {
+        /* The lock descriptor below is opened O_RDWR for this mode, which
+         * cannot work without write permission; give up on locking. */
+        s->disable_lock = true;
+        return 0;
+    }
+    if (lock_mode == RAW_L_READ_SHARE_RW) {
+        return 0;
+    }
+
+    lock_flags = s->open_flags;
+    if (!(bdrv_flags & BDRV_O_SHARE_RW)) {
+        /* Presumably needed because these modes take an exclusive lock,
+         * which requires a descriptor opened for writing - TODO confirm. */
+        lock_flags |= O_RDWR;
+    }
+    ret = raw_dup_flags(s->fd, s->open_flags, lock_flags, bs->filename,
+                        errp);
+    if (ret < 0) {
+        return ret;
+    }
+    s->lock_fd = ret;
+    ret = raw_lock_fd(s->lock_fd, lock_mode, errp);
+    if (ret) {
+        /* Do not leak the lock descriptor when the lock cannot be taken. */
+        qemu_close(s->lock_fd);
+        s->lock_fd = -1;
+        return ret;
+    }
+    s->cur_lock_mode = lock_mode;
+    return 0;
+}
+
 static int raw_open_common(BlockDriverState *bs, QDict *options,
                            int bdrv_flags, int open_flags, Error **errp)
 {
@@ -440,6 +655,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
     raw_parse_flags(bdrv_flags, &s->open_flags);
 
     s->fd = -1;
+    s->lock_fd = -1;
     fd = qemu_open(filename, s->open_flags, 0644);
     if (fd < 0) {
         ret = -errno;
@@ -451,6 +667,15 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
     }
     s->fd = fd;
 
+    s->disable_lock = qemu_opt_get_bool(opts, "disable-lock", false);
+
+    if (!s->disable_lock) {
+        ret = raw_lock_image(bs, bdrv_flags, errp);
+        if (ret) {
+            goto fail;
+        }
+    }
+
 #ifdef CONFIG_LINUX_AIO
      /* Currently Linux does AIO only for files opened with O_DIRECT */
     if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
@@ -538,6 +763,398 @@ static int raw_open(BlockDriverState *bs, QDict *options, 
int flags,
     return raw_open_common(bs, options, flags, 0, errp);
 }
 
+/* Phase of a reopen transaction that a lock transition handler is asked to
+ * carry out. */
+typedef enum {
+    RAW_LT_PREPARE,
+    RAW_LT_COMMIT,
+    RAW_LT_ABORT
+} RawLockTransOp;
+
+/* Lock transition handler: performs phase @op of moving from @old_lock
+ * (held through @old_lock_fd) to @new_lock (to be held through
+ * @new_lock_fd). Returns 0 on success or a negative value on failure. */
+typedef int (*RawReopenFunc)(RawLockTransOp op,
+                             int old_lock_fd, int new_lock_fd,
+                             BDRVRawLockMode old_lock,
+                             BDRVRawLockMode new_lock,
+                             Error **errp);
+
+/* Transition handler for table entries that need no lock operation in any
+ * phase: the mode either stays the same or becomes RAW_L_READ_SHARE_RW. */
+static int raw_lt_nop(RawLockTransOp op,
+                            int old_lock_fd, int new_lock_fd,
+                            BDRVRawLockMode old_lock,
+                            BDRVRawLockMode new_lock,
+                            Error **errp)
+{
+    assert(new_lock == RAW_L_READ_SHARE_RW || new_lock == old_lock);
+    return 0;
+}
+
+/* Transition handler for leaving RAW_L_READ_SHARE_RW: the whole lock for
+ * @new_lock is acquired on @new_lock_fd during "prepare"; "commit" and
+ * "abort" need no lock operation here. */
+static int raw_lt_from_unlock(RawLockTransOp op,
+                              int old_lock_fd, int new_lock_fd,
+                              BDRVRawLockMode old_lock,
+                              BDRVRawLockMode new_lock,
+                              Error **errp)
+{
+    assert(old_lock != new_lock);
+    assert(old_lock == RAW_L_READ_SHARE_RW);
+
+    if (op == RAW_LT_PREPARE) {
+        return raw_lock_fd(new_lock_fd, new_lock, errp);
+    }
+    return 0;
+}
+
+/* Transition handler for RAW_L_READ -> RAW_L_WRITE_SHARE_RW.
+ *
+ * "prepare" acquires the new locks on @new_lock_fd and drops the share byte
+ * on @old_lock_fd; "abort" re-takes the share byte on @old_lock_fd;
+ * "commit" does nothing. Returns 0 on success, -errno on failure.
+ *
+ * NOTE(review): after "prepare" the new fd still holds a shared lock on the
+ * share byte, whereas raw_lock_fd() releases that byte for
+ * RAW_L_WRITE_SHARE_RW - confirm this difference is intended.
+ */
+static int raw_lt_read_to_write_share_rw(RawLockTransOp op,
+                                         int old_lock_fd, int new_lock_fd,
+                                         BDRVRawLockMode old_lock,
+                                         BDRVRawLockMode new_lock,
+                                         Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_READ);
+    assert(new_lock == RAW_L_WRITE_SHARE_RW);
+    switch (op) {
+    case RAW_LT_PREPARE:
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock new fd (write 
byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, 
false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock new fd (share 
byte)");
+            break;
+        }
+        /* Release our own old shared lock before the exclusive probe. */
+        ret = qemu_unlock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to unlock old fd (share 
byte)");
+            break;
+        }
+        /* Exclusive probe of the share byte, followed by a downgrade. */
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, 
true);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to upgrade new fd (share 
byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, 
false);
+        if (ret) {
+            /* This is very unlikely, but catch it anyway. */
+            error_setg_errno(errp, errno, "Failed to downgrade new fd (share 
byte)");
+        }
+        break;
+    case RAW_LT_COMMIT:
+        break;
+    case RAW_LT_ABORT:
+        /* Undo the unlock of the old fd done in "prepare". */
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, 
false);
+        if (ret) {
+            error_report("Failed to restore lock on old fd (share byte)");
+        }
+        break;
+    }
+    return ret ? -errno : 0;
+}
+
+/* Transition handler for RAW_L_READ -> RAW_L_WRITE.
+ *
+ * "prepare" takes the write byte and the share byte on @new_lock_fd in
+ * shared mode, drops the share byte on @old_lock_fd, probes the share byte
+ * exclusively on @new_lock_fd and downgrades it to a shared lock again.
+ * "abort" re-takes the share byte on @old_lock_fd; "commit" does nothing.
+ *
+ * Returns 0 on success, -errno on failure (with @errp set in "prepare").
+ */
+static int raw_lt_read_to_write(RawLockTransOp op,
+                                int old_lock_fd, int new_lock_fd,
+                                BDRVRawLockMode old_lock,
+                                BDRVRawLockMode new_lock,
+                                Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_READ);
+    assert(new_lock == RAW_L_WRITE);
+    switch (op) {
+    case RAW_LT_PREPARE:
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock new fd (write byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock new fd (share byte)");
+            break;
+        }
+        /* Release our own old shared lock before the exclusive probe. */
+        ret = qemu_unlock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to unlock old fd (share byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to upgrade new fd (share byte)");
+            break;
+        }
+        /* Downgrade once. This step used to be issued twice on the same fd,
+         * the first time with an error message that named the old fd. */
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to downgrade new fd (share byte)");
+            break;
+        }
+        break;
+    case RAW_LT_COMMIT:
+        break;
+    case RAW_LT_ABORT:
+        /* Undo the unlock of the old fd done in "prepare". */
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_report("Failed to restore lock on old fd (share byte)");
+        }
+        break;
+    }
+    return ret ? -errno : 0;
+}
+
+/* Transition handler for RAW_L_WRITE_SHARE_RW -> RAW_L_READ.
+ *
+ * "prepare" upgrades the write byte on @old_lock_fd to exclusive and takes
+ * the share byte on @new_lock_fd; "commit" releases the write byte on
+ * @old_lock_fd. Returns 0 on success, -errno on failure.
+ *
+ * NOTE(review): "abort" does nothing, so the exclusive lock taken on the old
+ * fd's write byte in "prepare" appears to stay in place after an aborted
+ * reopen - confirm whether it should be downgraded again.
+ */
+static int raw_lt_write_share_rw_to_read(RawLockTransOp op,
+                                         int old_lock_fd, int new_lock_fd,
+                                         BDRVRawLockMode old_lock,
+                                         BDRVRawLockMode new_lock,
+                                         Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_WRITE_SHARE_RW);
+    assert(new_lock == RAW_L_READ);
+    switch (op) {
+    case RAW_LT_PREPARE:
+        /* Make sure there are no other writers. */
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock old fd (write 
byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, 
false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock new fd (share 
byte)");
+            break;
+        }
+        break;
+    case RAW_LT_COMMIT:
+        /* The release is deferred to "commit", the point of no return. */
+        ret = qemu_unlock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1);
+        if (ret) {
+            error_report("Failed to unlock old fd (write byte)");
+            break;
+        }
+        break;
+    case RAW_LT_ABORT:
+        break;
+    }
+    return ret ? -errno : 0;
+}
+
+/* Transition handler for RAW_L_WRITE_SHARE_RW -> RAW_L_WRITE.
+ *
+ * All work happens in "prepare": the write byte on @old_lock_fd is upgraded
+ * to exclusive to check for other writers, the share byte is taken on
+ * @new_lock_fd, the old write byte is downgraded to shared again and the
+ * write byte is taken on @new_lock_fd. "commit" and "abort" do nothing.
+ * Returns 0 on success, -errno on failure.
+ */
+static int raw_lt_write_share_rw_to_write(RawLockTransOp op,
+                                          int old_lock_fd, int new_lock_fd,
+                                          BDRVRawLockMode old_lock,
+                                          BDRVRawLockMode new_lock,
+                                          Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_WRITE_SHARE_RW);
+    assert(new_lock == RAW_L_WRITE);
+    switch (op) {
+    case RAW_LT_PREPARE:
+        /* Make sure there are no other writers. */
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock old fd (write 
byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, 
false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock new fd (share 
byte)");
+            break;
+        }
+        /* Back to a shared lock on the old fd before locking the new one. */
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to downgrade old fd (write 
byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to lock new fd (write 
byte)");
+            break;
+        }
+        break;
+    case RAW_LT_COMMIT:
+        break;
+    case RAW_LT_ABORT:
+        break;
+    }
+    return ret ? -errno : 0;
+}
+
+/* Transition handler for RAW_L_WRITE -> RAW_L_READ.
+ *
+ * "prepare" takes the share byte on @new_lock_fd in shared mode; "commit"
+ * releases the write byte on @old_lock_fd; "abort" does nothing.
+ * Returns 0 on success, -errno on failure.
+ */
+static int raw_lt_write_to_read(RawLockTransOp op,
+                                int old_lock_fd, int new_lock_fd,
+                                BDRVRawLockMode old_lock,
+                                BDRVRawLockMode new_lock,
+                                Error **errp)
+{
+    int rc = 0;
+
+    assert(old_lock == RAW_L_WRITE);
+    assert(new_lock == RAW_L_READ);
+
+    if (op == RAW_LT_PREPARE) {
+        rc = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (rc) {
+            error_setg_errno(errp, errno, "Failed to lock new fd (share byte)");
+        }
+    } else if (op == RAW_LT_COMMIT) {
+        rc = qemu_unlock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1);
+        if (rc) {
+            error_report("Failed to unlock old fd (write byte)");
+        }
+    }
+
+    if (rc) {
+        return -errno;
+    }
+    return 0;
+}
+
+/* Transition handler for RAW_L_WRITE -> RAW_L_WRITE_SHARE_RW.
+ *
+ * Only "commit" acts: it releases the share byte on @old_lock_fd.
+ * Returns 0 on success, -errno on failure.
+ */
+static int raw_lt_write_to_write_share_rw(RawLockTransOp op,
+                                          int old_lock_fd, int new_lock_fd,
+                                          BDRVRawLockMode old_lock,
+                                          BDRVRawLockMode new_lock,
+                                          Error **errp)
+{
+    int rc = 0;
+
+    assert(old_lock == RAW_L_WRITE);
+    assert(new_lock == RAW_L_WRITE_SHARE_RW);
+
+    if (op == RAW_LT_COMMIT) {
+        rc = qemu_unlock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1);
+        if (rc) {
+            error_report("Failed to unlock old fd (share byte)");
+        }
+    }
+
+    if (rc) {
+        return -errno;
+    }
+    return 0;
+}
+
+/**
+ * Transactionally moving between possible locking states is tricky and must be
+ * done carefully. That is mostly because downgrading an exclusive lock to
+ * shared or unlocked is not guaranteed to be revertible. As a result, in such
+ * cases we have to defer the downgrading to "commit", given that no revert will
+ * happen after that point, and that downgrading a lock should never fail.
+ *
+ * On the other hand, upgrading a lock (e.g. from unlocked or shared to
+ * exclusive lock) must happen in "prepare" because it may fail.
+ *
+ * Manage the operation matrix with this state transition table to make
+ * fulfilling the above conditions easier.
+ */
+static const struct RawReopenFuncRecord {
+    /* Lock mode in effect before the reopen. */
+    BDRVRawLockMode old_lock;
+    /* Lock mode wanted after the reopen. */
+    BDRVRawLockMode new_lock;
+    /* Handler called for the prepare, commit and abort phases. */
+    RawReopenFunc func;
+    /* A new lock fd is created in "prepare" and replaces the current one in
+     * "commit". */
+    bool need_lock_fd;
+    /* The current lock fd is closed in "commit" although no new one is
+     * created. Entries that omit this field leave it false. */
+    bool close_old_lock_fd;
+} reopen_functions[] = {
+
+    {RAW_L_READ_SHARE_RW, RAW_L_READ_SHARE_RW, raw_lt_nop, false, false},
+    {RAW_L_READ_SHARE_RW, RAW_L_READ, raw_lt_from_unlock, true},
+    {RAW_L_READ_SHARE_RW, RAW_L_WRITE_SHARE_RW, raw_lt_from_unlock, true},
+    {RAW_L_READ_SHARE_RW, RAW_L_WRITE, raw_lt_from_unlock, true},
+
+    {RAW_L_READ, RAW_L_READ_SHARE_RW, raw_lt_nop, false, true},
+    {RAW_L_READ, RAW_L_READ, raw_lt_nop, false, false},
+    {RAW_L_READ, RAW_L_WRITE_SHARE_RW, raw_lt_read_to_write_share_rw, true},
+    {RAW_L_READ, RAW_L_WRITE, raw_lt_read_to_write, true},
+
+    {RAW_L_WRITE_SHARE_RW, RAW_L_READ_SHARE_RW, raw_lt_nop, false, true},
+    {RAW_L_WRITE_SHARE_RW, RAW_L_READ, raw_lt_write_share_rw_to_read, true},
+    {RAW_L_WRITE_SHARE_RW, RAW_L_WRITE_SHARE_RW, raw_lt_nop, false, false},
+    {RAW_L_WRITE_SHARE_RW, RAW_L_WRITE, raw_lt_write_share_rw_to_write, true},
+
+    {RAW_L_WRITE, RAW_L_READ_SHARE_RW, raw_lt_nop, false, true},
+    {RAW_L_WRITE, RAW_L_READ, raw_lt_write_to_read, true},
+    {RAW_L_WRITE, RAW_L_WRITE_SHARE_RW, raw_lt_write_to_write_share_rw, true},
+    {RAW_L_WRITE, RAW_L_WRITE, raw_lt_nop, false, false},
+};
+
+/**
+ * Run phase @op of the lock transition that belongs to the reopen @state.
+ *
+ * The transition is looked up in reopen_functions[] from the current lock
+ * mode and the mode requested by the reopen ("disable-lock" maps to
+ * RAW_L_READ_SHARE_RW).
+ *
+ * RAW_LT_PREPARE: create the new lock fd if the transition needs one and run
+ *     the handler; the handler's result is returned, and the new lock fd is
+ *     closed again if the handler fails.
+ * RAW_LT_COMMIT: run the handler, close the old lock fd where required,
+ *     install the new one and record the new lock mode.
+ * RAW_LT_ABORT: run the handler and close the new lock fd.
+ *
+ * Returns: 0 on success; a negative value with @errp set if "prepare" fails.
+ */
+static int raw_reopen_handle_lock(BDRVReopenState *state,
+                                  RawLockTransOp op,
+                                  Error **errp)
+{
+    BDRVRawReopenState *rs = state->opaque;
+    BDRVRawState *s = state->bs->opaque;
+    BDRVRawLockMode old_lock, new_lock;
+    const struct RawReopenFuncRecord *rec;
+    int ret;
+
+    old_lock = s->cur_lock_mode;
+    if (qdict_get_try_bool(state->options, "disable-lock", false)) {
+        new_lock = RAW_L_READ_SHARE_RW;
+    } else {
+        new_lock = raw_get_lock_mode(state->flags);
+    }
+    /* NOTE(review): the option is removed on every call, so a commit/abort
+     * call that follows "prepare" appears to no longer see "disable-lock"
+     * and may select a different table record than "prepare" did - confirm
+     * and, if so, remember the chosen mode in the reopen state. */
+    qdict_del(state->options, "disable-lock");
+
+    for (rec = &reopen_functions[0];
+         rec < &reopen_functions[ARRAY_SIZE(reopen_functions)];
+         rec++) {
+        if (rec->old_lock == old_lock && rec->new_lock == new_lock) {
+            break;
+        }
+    }
+    assert(rec != &reopen_functions[ARRAY_SIZE(reopen_functions)]);
+
+    switch (op) {
+    case RAW_LT_PREPARE:
+        if (rec->need_lock_fd) {
+            int lock_flags = rs->open_flags;
+            if (!(state->flags & BDRV_O_SHARE_RW)) {
+                lock_flags |= O_RDWR;
+            }
+            ret = raw_dup_flags(rs->fd, s->open_flags, lock_flags,
+                                state->bs->filename, errp);
+            if (ret < 0) {
+                return ret;
+            }
+            rs->lock_fd = ret;
+        } else {
+            rs->lock_fd = -1;
+        }
+        ret = rec->func(op, s->lock_fd, rs->lock_fd, old_lock, new_lock, errp);
+        if (ret && rs->lock_fd >= 0) {
+            /* Do not leak the new lock fd when the transition fails. */
+            qemu_close(rs->lock_fd);
+            rs->lock_fd = -1;
+        }
+        return ret;
+    case RAW_LT_COMMIT:
+        rec->func(op, s->lock_fd, rs->lock_fd, old_lock, new_lock, errp);
+        if ((rec->need_lock_fd || rec->close_old_lock_fd) && s->lock_fd >= 0) {
+            qemu_close(s->lock_fd);
+            /* Forget the closed descriptor so that neither raw_close() nor a
+             * later reopen closes it a second time. */
+            s->lock_fd = -1;
+        }
+        if (rec->need_lock_fd) {
+            s->lock_fd = rs->lock_fd;
+        }
+        s->cur_lock_mode = new_lock;
+        break;
+    case RAW_LT_ABORT:
+        rec->func(op, s->lock_fd, rs->lock_fd, old_lock, new_lock, errp);
+        if (rec->need_lock_fd) {
+            if (rs->lock_fd >= 0) {
+                qemu_close(rs->lock_fd);
+                rs->lock_fd = -1;
+            }
+        }
+        break;
+    }
+    return 0;
+}
+
 static int raw_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
 {
@@ -560,61 +1177,24 @@ static int raw_reopen_prepare(BDRVReopenState *state,
 
     raw_parse_flags(state->flags, &rs->open_flags);
 
-    rs->fd = -1;
-
-    int fcntl_flags = O_APPEND | O_NONBLOCK;
-#ifdef O_NOATIME
-    fcntl_flags |= O_NOATIME;
-#endif
-
-#ifdef O_ASYNC
-    /* Not all operating systems have O_ASYNC, and those that don't
-     * will not let us track the state into rs->open_flags (typically
-     * you achieve the same effect with an ioctl, for example I_SETSIG
-     * on Solaris). But we do not use O_ASYNC, so that's fine.
-     */
-    assert((s->open_flags & O_ASYNC) == 0);
-#endif
-
-    if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
-        /* dup the original fd */
-        rs->fd = qemu_dup(s->fd);
-        if (rs->fd >= 0) {
-            ret = fcntl_setfl(rs->fd, rs->open_flags);
-            if (ret) {
-                qemu_close(rs->fd);
-                rs->fd = -1;
-            }
-        }
+    ret = raw_dup_flags(s->fd, s->open_flags, rs->open_flags,
+                        state->bs->filename, errp);
+    if (ret < 0) {
+        return ret;
     }
 
-    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
-    if (rs->fd == -1) {
-        const char *normalized_filename = state->bs->filename;
-        ret = raw_normalize_devicepath(&normalized_filename);
-        if (ret < 0) {
-            error_setg_errno(errp, -ret, "Could not normalize device path");
-        } else {
-            assert(!(rs->open_flags & O_CREAT));
-            rs->fd = qemu_open(normalized_filename, rs->open_flags);
-            if (rs->fd == -1) {
-                error_setg_errno(errp, errno, "Could not reopen file");
-                ret = -1;
-            }
-        }
-    }
+    rs->fd = ret;
 
     /* Fail already reopen_prepare() if we can't get a working O_DIRECT
      * alignment with the new fd. */
-    if (rs->fd != -1) {
-        raw_probe_alignment(state->bs, rs->fd, &local_err);
-        if (local_err) {
-            qemu_close(rs->fd);
-            rs->fd = -1;
-            error_propagate(errp, local_err);
-            ret = -EINVAL;
-        }
+    raw_probe_alignment(state->bs, rs->fd, &local_err);
+    if (local_err) {
+        qemu_close(rs->fd);
+        rs->fd = -1;
+        error_propagate(errp, local_err);
+        return -EINVAL;
     }
+    ret = raw_reopen_handle_lock(state, RAW_LT_PREPARE, errp);
 
     return ret;
 }
@@ -626,6 +1206,8 @@ static void raw_reopen_commit(BDRVReopenState *state)
 
     s->open_flags = rs->open_flags;
 
+    raw_reopen_handle_lock(state, RAW_LT_COMMIT, &error_abort);
+
     qemu_close(s->fd);
     s->fd = rs->fd;
 
@@ -643,6 +1225,8 @@ static void raw_reopen_abort(BDRVReopenState *state)
         return;
     }
 
+    raw_reopen_handle_lock(state, RAW_LT_ABORT, &error_abort);
+
     if (rs->fd >= 0) {
         qemu_close(rs->fd);
         rs->fd = -1;
@@ -1332,6 +1916,10 @@ static void raw_close(BlockDriverState *bs)
         qemu_close(s->fd);
         s->fd = -1;
     }
+    if (s->lock_fd >= 0) {
+        qemu_close(s->lock_fd);
+        s->lock_fd = -1;
+    }
 }
 
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -1832,6 +2420,27 @@ static int raw_get_info(BlockDriverState *bs, 
BlockDriverInfo *bdi)
     return 0;
 }
 
+/* Release the image lock bytes when @bs is inactivated. Nothing is done if
+ * the current mode is already RAW_L_READ_SHARE_RW. Returns the result of
+ * raw_lock_fd(), or 0 when there was nothing to do. */
+static int raw_inactivate(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->cur_lock_mode == RAW_L_READ_SHARE_RW) {
+        return 0;
+    }
+    return raw_lock_fd(s->lock_fd, RAW_L_READ_SHARE_RW, NULL);
+}
+
+/* Take the image lock that was deferred because the image was opened with
+ * BDRV_O_INACTIVE. A failure is reported through @errp only. */
+static void raw_invalidate_cache(BlockDriverState *bs, Error **errp)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (!s->lock_on_invalidate) {
+        return;
+    }
+    s->disable_lock = false;
+    raw_lock_image(bs, bdrv_get_flags(bs), errp);
+}
+
 static QemuOptsList raw_create_opts = {
     .name = "raw-create-opts",
     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
@@ -1885,7 +2494,8 @@ BlockDriver bdrv_file = {
     .bdrv_get_info = raw_get_info,
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
-
+    .bdrv_inactivate = raw_inactivate,
+    .bdrv_invalidate_cache = raw_invalidate_cache,
     .create_opts = &raw_create_opts,
 };
 
-- 
2.7.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]