From: ashish mittal
Subject: Re: [Qemu-devel] [PATCH v7 RFC] block/vxhs: Initial commit to add Veritas HyperScale VxHS block device support
Date: Tue, 15 Nov 2016 14:38:27 -0800

Hi Stefan


On Wed, Sep 28, 2016 at 2:45 PM, Stefan Hajnoczi <address@hidden> wrote:
> On Tue, Sep 27, 2016 at 09:09:49PM -0700, Ashish Mittal wrote:
>
> Review of .bdrv_open() and .bdrv_aio_writev() code paths.
>
> The big issues I see in this driver and libqnio:
>
> 1. Showstoppers like broken .bdrv_open() and leaking memory on every
>    reply message.
> 2. Insecure due to missing input validation (network packets and
>    configuration) and incorrect string handling.
> 3. Not fully asynchronous so QEMU and the guest may hang.
>
> Please think about the whole codebase and not just the lines I've
> pointed out in this review when fixing these sorts of issues.  There may
> be similar instances of these bugs elsewhere and it's important that
> they are fixed so that this can be merged.
>
>> +/*
>> + * Structure per vDisk maintained for state
>> + */
>> +typedef struct BDRVVXHSState {
>> +    int                     fds[2];
>> +    int64_t                 vdisk_size;
>> +    int64_t                 vdisk_blocks;
>> +    int64_t                 vdisk_flags;
>> +    int                     vdisk_aio_count;
>> +    int                     event_reader_pos;
>> +    VXHSAIOCB               *qnio_event_acb;
>> +    void                    *qnio_ctx;
>> +    QemuSpin                vdisk_lock; /* Lock to protect BDRVVXHSState */
>> +    QemuSpin                vdisk_acb_lock;  /* Protects ACB */
>
> These comments are insufficient for documenting locking.  Not all fields
> are actually protected by these locks.  Please order fields according to
> lock coverage:
>
> typedef struct VXHSAIOCB {
>     ...
>
>     /* Protected by BDRVVXHSState->vdisk_acb_lock */
>     int                 segments;
>     ...
> };
>
> typedef struct BDRVVXHSState {
>     ...
>
>     /* Protected by vdisk_lock */
>     QemuSpin                vdisk_lock;
>     int                     vdisk_aio_count;
>     QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
>     ...
> }
>
>> +static void vxhs_qnio_iio_close(BDRVVXHSState *s, int idx)
>> +{
>> +    /*
>> +     * Close vDisk device
>> +     */
>> +    if (s->vdisk_hostinfo[idx].vdisk_rfd >= 0) {
>> +        iio_devclose(s->qnio_ctx, 0, s->vdisk_hostinfo[idx].vdisk_rfd);
>
> libqnio comment:
> Why does iio_devclose() take an unused cfd argument?  Perhaps it can be
> dropped.
>
>> +        s->vdisk_hostinfo[idx].vdisk_rfd = -1;
>> +    }
>> +
>> +    /*
>> +     * Close QNIO channel against cached channel-fd
>> +     */
>> +    if (s->vdisk_hostinfo[idx].qnio_cfd >= 0) {
>> +        iio_close(s->qnio_ctx, s->vdisk_hostinfo[idx].qnio_cfd);
>
> libqnio comment:
> Why does iio_devclose() take an int32_t cfd argument but iio_close()
> takes a uint32_t cfd argument?
>
>> +        s->vdisk_hostinfo[idx].qnio_cfd = -1;
>> +    }
>> +}
>> +
>> +static int vxhs_qnio_iio_open(int *cfd, const char *of_vsa_addr,
>> +                              int *rfd, const char *file_name)
>> +{
>> +    /*
>> +     * Open qnio channel to storage agent if not opened before.
>> +     */
>> +    if (*cfd < 0) {
>> +        *cfd = iio_open(global_qnio_ctx, of_vsa_addr, 0);
>
> libqnio comments:
>
> 1.
> There is a buffer overflow in qnio_create_channel().  strncpy() is used
> incorrectly so long hostname or port (both can be 99 characters long)
> will overflow channel->name[] (64 characters) or channel->port[] (8
> characters).
>
>     strncpy(channel->name, hostname, strlen(hostname) + 1);
>     strncpy(channel->port, port, strlen(port) + 1);
>
> The third argument must be the size of the *destination* buffer, not the
> source buffer.  Also note that strncpy() doesn't NUL-terminate the
> destination string so you must do that manually to ensure there is a NUL
> byte at the end of the buffer (a corrected sketch follows these comments).
>
> 2.
> channel is leaked in the "Failed to open single connection" error case
> in qnio_create_channel().
>
> 3.
> If host is longer than 63 characters then the ioapi_ctx->channels and
> qnio_ctx->channels maps will use different keys due to string truncation
> in qnio_create_channel().  This means "Channel already exists" in
> qnio_create_channel() and possibly other things will not work as
> expected.
>
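> Regarding point 1: for illustration only, an untested sketch of the
> safer pattern (keeping the existing 64- and 8-byte buffers) would be:
>
>     strncpy(channel->name, hostname, sizeof(channel->name) - 1);
>     channel->name[sizeof(channel->name) - 1] = '\0';
>     strncpy(channel->port, port, sizeof(channel->port) - 1);
>     channel->port[sizeof(channel->port) - 1] = '\0';
>
> or, better, reject hostnames/ports that do not fit instead of silently
> truncating them.
>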
>> +        if (*cfd < 0) {
>> +            trace_vxhs_qnio_iio_open(of_vsa_addr);
>> +            return -ENODEV;
>> +        }
>> +    }
>> +
>> +    /*
>> +     * Open vdisk device
>> +     */
>> +    *rfd = iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
>
> libqnio comment:
> Buffer overflow in iio_devopen() since chandev[128] is not large enough
> to hold channel[100] + " " + devpath[arbitrary length] chars:
>
>     sprintf(chandev, "%s %s", channel, devpath);
>
>> +
>> +    if (*rfd < 0) {
>> +        if (*cfd >= 0) {
>
> This check is always true.  Otherwise the return -ENODEV would have been
> taken above.  The if statement isn't necessary.
>
>> +static void vxhs_check_failover_status(int res, void *ctx)
>> +{
>> +    BDRVVXHSState *s = ctx;
>> +
>> +    if (res == 0) {
>> +        /* found failover target */
>> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
>> +        s->vdisk_ask_failover_idx = 0;
>> +        trace_vxhs_check_failover_status(
>> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> +                   s->vdisk_guid);
>> +        qemu_spin_lock(&s->vdisk_lock);
>> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        vxhs_handle_queued_ios(s);
>> +    } else {
>> +        /* keep looking */
>> +        trace_vxhs_check_failover_status_retry(s->vdisk_guid);
>> +        s->vdisk_ask_failover_idx++;
>> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
>> +            /* pause and cycle through list again */
>> +            sleep(QNIO_CONNECT_RETRY_SECS);
>
> This code is called from a QEMU thread via vxhs_aio_rw().  It is not
> permitted to call sleep() since it will freeze QEMU and probably the
> guest.
>
> If you need a timer you can use QEMU's timer APIs.  See aio_timer_new(),
> timer_new_ns(), timer_mod(), timer_del(), timer_free().
>
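> A very rough, untested sketch (the retry_timer field and the callback
> name are invented) of how a timer could replace the sleep():
>
>     static void vxhs_failover_timer_cb(void *opaque)
>     {
>         BDRVVXHSState *s = opaque;
>
>         s->vdisk_ask_failover_idx = 0;
>         vxhs_switch_storage_agent(s);
>     }
>
>     /* instead of sleep(QNIO_CONNECT_RETRY_SECS), arm the timer and return */
>     timer_mod(s->retry_timer,
>               qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
>               QNIO_CONNECT_RETRY_SECS * NANOSECONDS_PER_SECOND);
>
> where s->retry_timer would be created once with aio_timer_new(aio_context,
> QEMU_CLOCK_REALTIME, SCALE_NS, vxhs_failover_timer_cb, s) and cleaned up
> with timer_del()/timer_free() in .bdrv_close().
>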
>> +            s->vdisk_ask_failover_idx = 0;
>> +        }
>> +        res = vxhs_switch_storage_agent(s);
>> +    }
>> +}
>> +
>> +static int vxhs_failover_io(BDRVVXHSState *s)
>> +{
>> +    int res = 0;
>> +
>> +    trace_vxhs_failover_io(s->vdisk_guid);
>> +
>> +    s->vdisk_ask_failover_idx = 0;
>> +    res = vxhs_switch_storage_agent(s);
>> +
>> +    return res;
>> +}
>> +
>> +static void vxhs_iio_callback(int32_t rfd, uint32_t reason, void *ctx,
>> +                       uint32_t error, uint32_t opcode)
>
> This function is doing too much.  Especially the failover code should
> run in the AioContext since it's complex.  Don't do failover here
> because this function is outside the AioContext lock.  Do it from
> AioContext using a QEMUBH like block/rbd.c.
>
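> Roughly (untested; the failover_bh and aio_context fields are invented
> names that would need to be stored in BDRVVXHSState at open time), the
> callback could just record the failure and kick a bottom half that runs
> the failover under the AioContext:
>
>     static void vxhs_failover_bh_cb(void *opaque)
>     {
>         BDRVVXHSState *s = opaque;
>
>         vxhs_failover_io(s);
>         qemu_bh_delete(s->failover_bh);
>     }
>
>     /* in vxhs_iio_callback(), instead of doing the failover inline: */
>     s->failover_bh = aio_bh_new(s->aio_context, vxhs_failover_bh_cb, s);
>     qemu_bh_schedule(s->failover_bh);
>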
>> +static int32_t
>> +vxhs_qnio_iio_writev(void *qnio_ctx, uint32_t rfd, QEMUIOVector *qiov,
>> +                     uint64_t offset, void *ctx, uint32_t flags)
>> +{
>> +    struct iovec cur;
>> +    uint64_t cur_offset = 0;
>> +    uint64_t cur_write_len = 0;
>> +    int segcount = 0;
>> +    int ret = 0;
>> +    int i, nsio = 0;
>> +    int iovcnt = qiov->niov;
>> +    struct iovec *iov = qiov->iov;
>> +
>> +    errno = 0;
>> +    cur.iov_base = 0;
>> +    cur.iov_len = 0;
>> +
>> +    ret = iio_writev(qnio_ctx, rfd, iov, iovcnt, offset, ctx, flags);
>
> libqnio comments:
>
> 1.
> There are blocking connect(2) and getaddrinfo(3) calls in iio_writev()
> so this may hang for arbitrary amounts of time.  This is not permitted
> in .bdrv_aio_readv()/.bdrv_aio_writev().  Please make qnio actually
> asynchronous.
>
> 2.
> Where does client_callback() free reply?  It looks like every reply
> message causes a memory leak!
>
> 3.
> Buffer overflow in iio_writev() since device[128] cannot fit the device
> string generated from the vdisk_guid.
>
> 4.
> Buffer overflow in iio_writev() due to
> strncpy(msg->hinfo.target,device,strlen(device)) where device[128] is
> larger than target[64].  Also note the previous comments about strncpy()
> usage.
>
> 5.
> I don't see any endianness handling or portable alignment of struct
> fields in the network protocol code.  Binary network protocols need to
> take care of these issues for portability.  This means libqnio compiled
> for different architectures will not work.  Do you plan to support any
> other architectures besides x86?
>

No, we support only x86 and do not plan to support any other arch.
Please let me know if this necessitates any changes to the configure
script.

> 6.
> The networking code doesn't look robust: kvset uses assert() on input
> from the network so the other side of the connection could cause SIGABRT
> (coredump), the client uses the msg pointer as the cookie for the
> response packet so the server can easily crash the client by sending a
> bogus cookie value, etc.  Even on the client side these things are
> troublesome but on a server they are guaranteed security issues.  I
> didn't look into it deeply.  Please audit the code.
>

By design, our solution on the OpenStack platform uses a closed set of
nodes communicating on dedicated networks. VxHS servers on all the
nodes are on a dedicated network. Clients (qemu) connect to these
only after reading the server IP from the XML (read by libvirt). The
XML cannot be modified without proper access. Therefore, IMO this
problem would be relevant only if someone were to use qnio as a
generic mode of communication/data transfer, but for our use-case we
will not run into this problem. Is this explanation acceptable?

I will reply to the other comments in this email that have not yet been addressed. Thanks!


>> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
>> +                              int *cfd, int *rfd, Error **errp)
>> +{
>> +    QDict *backing_options = NULL;
>> +    QemuOpts *opts, *tcp_opts;
>> +    const char *vxhs_filename;
>> +    char *of_vsa_addr = NULL;
>> +    Error *local_err = NULL;
>> +    const char *vdisk_id_opt;
>> +    char *file_name = NULL;
>> +    size_t num_servers = 0;
>> +    char *str = NULL;
>> +    int ret = 0;
>> +    int i;
>> +
>> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
>> +    qemu_opts_absorb_qdict(opts, options, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
>> +    if (vxhs_filename) {
>> +        trace_vxhs_qemu_init_filename(vxhs_filename);
>> +    }
>> +
>> +    vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
>> +    if (!vdisk_id_opt) {
>> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +    s->vdisk_guid = g_strdup(vdisk_id_opt);
>> +    trace_vxhs_qemu_init_vdisk(vdisk_id_opt);
>> +
>> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
>> +    if (num_servers < 1) {
>> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
>> +        ret = -EINVAL;
>> +        goto out;
>> +    } else if (num_servers > VXHS_MAX_HOSTS) {
>> +        error_setg(&local_err, QERR_INVALID_PARAMETER, "server");
>> +        error_append_hint(errp, "Maximum %d servers allowed.\n",
>> +                          VXHS_MAX_HOSTS);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +    trace_vxhs_qemu_init_numservers(num_servers);
>> +
>> +    for (i = 0; i < num_servers; i++) {
>> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
>> +        qdict_extract_subqdict(options, &backing_options, str);
>> +
>> +        /* Create opts info from runtime_tcp_opts list */
>> +        tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
>> +        qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err);
>> +        if (local_err) {
>> +            qdict_del(backing_options, str);
>
> backing_options is leaked and there's no need to delete the str key.
>
>> +            qemu_opts_del(tcp_opts);
>> +            g_free(str);
>> +            ret = -EINVAL;
>> +            goto out;
>> +        }
>> +
>> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(tcp_opts,
>> +                                                            VXHS_OPT_HOST));
>> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(tcp_opts,
>> +                                                                 VXHS_OPT_PORT),
>> +                                                    NULL, 0);
>
> This will segfault if the port option was missing.
>
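> An untested sketch of a guard (mirroring the existing error path in this
> loop) before the conversion:
>
>     const char *port = qemu_opt_get(tcp_opts, VXHS_OPT_PORT);
>
>     if (!port) {
>         error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_PORT);
>         qemu_opts_del(tcp_opts);
>         g_free(str);
>         ret = -EINVAL;
>         goto out;
>     }
>     s->vdisk_hostinfo[i].port = g_ascii_strtoll(port, NULL, 0);
>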
>> +
>> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
>> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
>> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
>> +                             s->vdisk_hostinfo[i].port);
>
> It's not safe to use the %s format specifier for a trace event with a
> NULL value.  In the case where hostip is NULL this could crash on some
> systems.
>
>> +
>> +        qdict_del(backing_options, str);
>> +        qemu_opts_del(tcp_opts);
>> +        g_free(str);
>> +    }
>
> backing_options is leaked.
>
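> If backing_options is only needed inside the loop body, something like
> the following (untested) at the end of each iteration, and on the error
> path above, should plug the leak:
>
>     QDECREF(backing_options);
>     backing_options = NULL;
>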
>> +
>> +    s->vdisk_nhosts = i;
>> +    s->vdisk_cur_host_idx = 0;
>> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>> +                                  s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> +                                  s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
>
> Can we get here with num_servers == 0?  In that case this would access
> uninitialized memory.  I guess num_servers == 0 does not make sense and
> there should be an error case for it.
>
>> +
>> +    /*
>> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
>> +     */
>> +    if (global_qnio_ctx == NULL) {
>> +        global_qnio_ctx = vxhs_setup_qnio();
>
> libqnio comment:
> The client epoll thread should mask all signals (like
> qemu_thread_create()).  Otherwise it may receive signals that it cannot
> deal with.
>
>> +        if (global_qnio_ctx == NULL) {
>> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
>> +            ret = -EINVAL;
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name);
>> +    if (!ret) {
>> +        error_setg(&local_err, "Failed qnio_iio_open");
>> +        ret = -EIO;
>> +    }
>
> The return value of vxhs_qnio_iio_open() is 0 for success or -errno for
> error.
>
> I guess you never ran this code!  The block driver won't even open
> successfully.
>
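> In other words, something along the lines of (untested):
>
>     ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name);
>     if (ret < 0) {
>         error_setg(&local_err, "Failed qnio_iio_open");
>         ret = -EIO;
>     }
>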
>> +
>> +out:
>> +    g_free(file_name);
>> +    g_free(of_vsa_addr);
>> +    qemu_opts_del(opts);
>> +
>> +    if (ret < 0) {
>> +        for (i = 0; i < num_servers; i++) {
>> +            g_free(s->vdisk_hostinfo[i].hostip);
>> +        }
>> +        g_free(s->vdisk_guid);
>> +        s->vdisk_guid = NULL;
>> +        errno = -ret;
>
> There is no need to set errno here.  The return value already contains
> the error and the caller doesn't look at errno.
>
>> +    }
>> +    error_propagate(errp, local_err);
>> +
>> +    return ret;
>> +}
>> +
>> +static int vxhs_open(BlockDriverState *bs, QDict *options,
>> +              int bdrv_flags, Error **errp)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    AioContext *aio_context;
>> +    int qemu_qnio_cfd = -1;
>> +    int device_opened = 0;
>> +    int qemu_rfd = -1;
>> +    int ret = 0;
>> +    int i;
>> +
>> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
>> +    if (ret < 0) {
>> +        trace_vxhs_open_fail(ret);
>> +        return ret;
>> +    }
>> +
>> +    device_opened = 1;
>> +    s->qnio_ctx = global_qnio_ctx;
>> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
>> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
>> +    s->vdisk_size = 0;
>> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
>> +
>> +    /*
>> +     * Create a pipe for communicating between two threads in different
>> +     * context. Set handler for read event, which gets triggered when
>> +     * IO completion is done by non-QEMU context.
>> +     */
>> +    ret = qemu_pipe(s->fds);
>> +    if (ret < 0) {
>> +        trace_vxhs_open_epipe('.');
>> +        ret = -errno;
>> +        goto errout;
>
> This leaks s->vdisk_guid, s->vdisk_hostinfo[i].hostip, etc.
> bdrv_close() will not be called so this function must do cleanup itself.
>
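> For example (untested), the errout path could also free what
> vxhs_qemu_init() allocated, in addition to closing the device:
>
>     for (i = 0; i < s->vdisk_nhosts; i++) {
>         g_free(s->vdisk_hostinfo[i].hostip);
>     }
>     g_free(s->vdisk_guid);
>     s->vdisk_guid = NULL;
>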
>> +    }
>> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
>> +
>> +    aio_context = bdrv_get_aio_context(bs);
>> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
>> +                       false, vxhs_aio_event_reader, NULL, s);
>> +
>> +    /*
>> +     * Initialize the spin-locks.
>> +     */
>> +    qemu_spin_init(&s->vdisk_lock);
>> +    qemu_spin_init(&s->vdisk_acb_lock);
>> +
>> +    return 0;
>> +
>> +errout:
>> +    /*
>> +     * Close remote vDisk device if it was opened earlier
>> +     */
>> +    if (device_opened) {
>
> This is always true.  The device_opened variable can be removed.
>
>> +/*
>> + * This allocates QEMU-VXHS callback for each IO
>> + * and is passed to QNIO. When QNIO completes the work,
>> + * it will be passed back through the callback.
>> + */
>> +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
>> +                                int64_t sector_num, QEMUIOVector *qiov,
>> +                                int nb_sectors,
>> +                                BlockCompletionFunc *cb,
>> +                                void *opaque, int iodir)
>> +{
>> +    VXHSAIOCB *acb = NULL;
>> +    BDRVVXHSState *s = bs->opaque;
>> +    size_t size;
>> +    uint64_t offset;
>> +    int iio_flags = 0;
>> +    int ret = 0;
>> +    void *qnio_ctx = s->qnio_ctx;
>> +    uint32_t rfd = s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd;
>> +
>> +    offset = sector_num * BDRV_SECTOR_SIZE;
>> +    size = nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
>> +    /*
>> +     * Setup or initialize VXHSAIOCB.
>> +     * Every single field should be initialized since
>> +     * acb will be picked up from the slab without
>> +     * initializing with zero.
>> +     */
>> +    acb->io_offset = offset;
>> +    acb->size = size;
>> +    acb->ret = 0;
>> +    acb->flags = 0;
>> +    acb->aio_done = VXHS_IO_INPROGRESS;
>> +    acb->segments = 0;
>> +    acb->buffer = 0;
>> +    acb->qiov = qiov;
>> +    acb->direction = iodir;
>> +
>> +    qemu_spin_lock(&s->vdisk_lock);
>> +    if (OF_VDISK_FAILED(s)) {
>> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        goto errout;
>> +    }
>> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> +        s->vdisk_aio_retry_qd++;
>> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
>> +        goto out;
>> +    }
>> +    s->vdisk_aio_count++;
>> +    qemu_spin_unlock(&s->vdisk_lock);
>> +
>> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> +
>> +    switch (iodir) {
>> +    case VDISK_AIO_WRITE:
>> +            vxhs_inc_acb_segment_count(acb, 1);
>> +            ret = vxhs_qnio_iio_writev(qnio_ctx, rfd, qiov,
>> +                                       offset, (void *)acb, iio_flags);
>> +            break;
>> +    case VDISK_AIO_READ:
>> +            vxhs_inc_acb_segment_count(acb, 1);
>> +            ret = vxhs_qnio_iio_readv(qnio_ctx, rfd, qiov,
>> +                                       offset, (void *)acb, iio_flags);
>> +            break;
>> +    default:
>> +            trace_vxhs_aio_rw_invalid(iodir);
>> +            goto errout;
>
> s->vdisk_aio_count must be decremented before returning.
>
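> i.e. something along the lines of (untested):
>
>     default:
>             trace_vxhs_aio_rw_invalid(iodir);
>             qemu_spin_lock(&s->vdisk_lock);
>             s->vdisk_aio_count--;
>             qemu_spin_unlock(&s->vdisk_lock);
>             goto errout;
>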
>> +static void vxhs_close(BlockDriverState *bs)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    int i;
>> +
>> +    trace_vxhs_close(s->vdisk_guid);
>> +    close(s->fds[VDISK_FD_READ]);
>> +    close(s->fds[VDISK_FD_WRITE]);
>> +
>> +    /*
>> +     * Clearing all the event handlers for oflame registered to QEMU
>> +     */
>> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
>> +                       false, NULL, NULL, NULL);
>
> Please remove the event handler before closing the fd.  I don't think it
> matters in this case but in other scenarios there could be race
> conditions if another thread opens an fd and the file descriptor number
> is reused.
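>
> A sketch of the reordered close path:
>
>     aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
>                        false, NULL, NULL, NULL);
>     close(s->fds[VDISK_FD_READ]);
>     close(s->fds[VDISK_FD_WRITE]);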


