[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 06/11] scsi-disk: correctly implement WRITE SAME
From: |
Peter Lieven |
Subject: |
Re: [Qemu-devel] [PATCH 06/11] scsi-disk: correctly implement WRITE SAME |
Date: |
Wed, 13 Nov 2013 07:18:13 +0100 |
Am 12.11.2013 um 16:49 schrieb Paolo Bonzini <address@hidden>:
> The WRITE SAME command is implemented incorrectly. WRITE SAME with the
> UNMAP bit set should _not_ unmap the sectors unless the written data
> matches the payload of the WRITE SAME command; currently, QEMU is not
> looking at the payload at all.
>
> Thus, fetch the data to be written from the input buffer. If it is
> all zeroes, we can use the write_zeroes call (possibly with the new
> MAY_UNMAP flag). Otherwise, do as many write cycles as needed, covering
> 512k at a time to avoid allocating lots of memory for the bounce
> buffer.
Would it make sense to add a bdrv_write_same or is the use case for
WRITE SAME with non-zero payload too rare?
And secondly would it make sense to add an optimal request size field
to the BlockLimits?
>
> Strictly speaking, this is still incorrect because a zero cluster should
> only be written if the MAY_UNMAP flag is set. But this is a bug in the
> block layer, not here.
Can you explain what exactly you mean?
>
> Signed-off-by: Paolo Bonzini <address@hidden>
> ---
> hw/scsi/scsi-disk.c | 140 +++++++++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 116 insertions(+), 24 deletions(-)
>
> diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
> index 7e29760..cd5116c 100644
> --- a/hw/scsi/scsi-disk.c
> +++ b/hw/scsi/scsi-disk.c
> @@ -41,6 +41,7 @@ do { printf("scsi-disk: " fmt , ## __VA_ARGS__); } while (0)
> #include <scsi/sg.h>
> #endif
>
> +#define SCSI_WRITE_SAME_MAX 524288
I would call this SCSI_MAX_WRITE_SAME_LEN (like max_ws_len)
> #define SCSI_DMA_BUF_SIZE 131072
> #define SCSI_MAX_INQUIRY_LEN 256
> #define SCSI_MAX_MODE_LEN 256
> @@ -634,6 +635,8 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req,
> uint8_t *outbuf)
> buflen = 0x40;
> memset(outbuf + 4, 0, buflen - 4);
>
> + outbuf[4] = 0x1; /* wsnz */
> +
> /* optimal transfer length granularity */
> outbuf[6] = (min_io_size >> 8) & 0xff;
> outbuf[7] = min_io_size & 0xff;
> @@ -1589,6 +1592,111 @@ invalid_field:
> scsi_check_condition(r, SENSE_CODE(INVALID_FIELD));
> }
>
> +typedef struct WriteSameCBData {
> + SCSIDiskReq *r;
> + int64_t sector;
> + int nb_sectors;
> + QEMUIOVector qiov;
> + struct iovec iov;
> +} WriteSameCBData;
> +
> +static void scsi_write_same_complete(void *opaque, int ret)
> +{
> + WriteSameCBData *data = opaque;
> + SCSIDiskReq *r = data->r;
> + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
> +
> + assert(r->req.aiocb != NULL);
> + r->req.aiocb = NULL;
> + bdrv_acct_done(s->qdev.conf.bs, &r->acct);
> + if (r->req.io_canceled) {
> + goto done;
> + }
> +
> + if (ret < 0) {
> + if (scsi_handle_rw_error(r, -ret)) {
> + goto done;
> + }
> + }
> +
> + data->nb_sectors -= data->iov.iov_len / 512;
> + data->sector += data->iov.iov_len / 512;
> + data->iov.iov_len = MIN(data->nb_sectors * 512, data->iov.iov_len);
> + if (data->iov.iov_len) {
> + bdrv_acct_start(s->qdev.conf.bs, &r->acct, data->iov.iov_len,
> BDRV_ACCT_WRITE);
> + r->req.aiocb = bdrv_aio_writev(s->qdev.conf.bs, data->sector,
> + &data->qiov, data->iov.iov_len / 512,
> + scsi_write_same_complete, r);
> + return;
> + }
> +
> + scsi_req_complete(&r->req, GOOD);
> +
> +done:
> + if (!r->req.io_canceled) {
> + scsi_req_unref(&r->req);
> + }
> + g_free (data->iov.iov_base);
> + g_free (data);
> +}
> +
> +static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
> +{
> + SCSIRequest *req = &r->req;
> + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
> + uint32_t nb_sectors = scsi_data_cdb_length(r->req.cmd.buf);
> + WriteSameCBData *data;
> + uint8_t *buf;
> + int i;
> +
> + /* Fail if PBDATA=1 or LBDATA=1 or ANCHOR=1. */
> + if (nb_sectors == 0 || (req->cmd.buf[1] & 0x16)) {
> + scsi_check_condition(r, SENSE_CODE(INVALID_FIELD));
> + return;
> + }
> +
> + if (bdrv_is_read_only(s->qdev.conf.bs)) {
> + scsi_check_condition(r, SENSE_CODE(WRITE_PROTECTED));
> + return;
> + }
> + if (!check_lba_range(s, r->req.cmd.lba, nb_sectors)) {
> + scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
> + return;
> + }
> +
> + if (buffer_is_zero(inbuf, s->qdev.blocksize)) {
> + int flags = (req->cmd.buf[1] & 0x8) ? BDRV_REQ_MAY_UNMAP : 0;
> +
> + /* The request is used as the AIO opaque value, so add a ref. */
> + scsi_req_ref(&r->req);
> + bdrv_acct_start(s->qdev.conf.bs, &r->acct, nb_sectors *
> s->qdev.blocksize,
> + BDRV_ACCT_WRITE);
> + r->req.aiocb = bdrv_aio_write_zeroes(s->qdev.conf.bs,
> + r->req.cmd.lba *
> (s->qdev.blocksize / 512),
> + nb_sectors * (s->qdev.blocksize
> / 512),
> + flags, scsi_aio_complete, r);
> + return;
> + }
> +
> + data = g_new0(WriteSameCBData, 1);
> + data->r = r;
> + data->sector = r->req.cmd.lba * (s->qdev.blocksize / 512);
> + data->nb_sectors = nb_sectors * (s->qdev.blocksize / 512);
> + data->iov.iov_len = MIN(data->nb_sectors * 512, SCSI_WRITE_SAME_MAX);
> + data->iov.iov_base = buf = g_malloc(data->iov.iov_len);
> + qemu_iovec_init_external(&data->qiov, &data->iov, 1);
> +
> + for (i = 0; i < data->iov.iov_len; i += s->qdev.blocksize) {
> + memcpy(&buf[i], inbuf, s->qdev.blocksize);
> + }
> +
> + scsi_req_ref(&r->req);
> + bdrv_acct_start(s->qdev.conf.bs, &r->acct, data->iov.iov_len,
> BDRV_ACCT_WRITE);
> + r->req.aiocb = bdrv_aio_writev(s->qdev.conf.bs, data->sector,
> + &data->qiov, data->iov.iov_len / 512,
> + scsi_write_same_complete, data);
> +}
I asked myself many time if to use 512 or BDRV_SECTOR_SIZE. Both is used
heavily in the whole block code.
> +
> static void scsi_disk_emulate_write_data(SCSIRequest *req)
> {
> SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
> @@ -1612,6 +1720,10 @@ static void scsi_disk_emulate_write_data(SCSIRequest
> *req)
> scsi_disk_emulate_unmap(r, r->iov.iov_base);
> break;
>
> + case WRITE_SAME_10:
> + case WRITE_SAME_16:
> + scsi_disk_emulate_write_same(r, r->iov.iov_base);
> + break;
> default:
> abort();
> }
> @@ -1854,30 +1966,10 @@ static int32_t scsi_disk_emulate_command(SCSIRequest
> *req, uint8_t *buf)
> break;
> case WRITE_SAME_10:
> case WRITE_SAME_16:
> - nb_sectors = scsi_data_cdb_length(r->req.cmd.buf);
> - if (bdrv_is_read_only(s->qdev.conf.bs)) {
> - scsi_check_condition(r, SENSE_CODE(WRITE_PROTECTED));
> - return 0;
> - }
> - if (!check_lba_range(s, r->req.cmd.lba, nb_sectors)) {
> - goto illegal_lba;
> - }
> -
> - /*
> - * We only support WRITE SAME with the unmap bit set for now.
> - * Reject UNMAP=0 or ANCHOR=1.
> - */
> - if (!(req->cmd.buf[1] & 0x8) || (req->cmd.buf[1] & 0x10)) {
> - goto illegal_request;
> - }
> -
> - /* The request is used as the AIO opaque value, so add a ref. */
> - scsi_req_ref(&r->req);
> - r->req.aiocb = bdrv_aio_discard(s->qdev.conf.bs,
> - r->req.cmd.lba * (s->qdev.blocksize
> / 512),
> - nb_sectors * (s->qdev.blocksize /
> 512),
> - scsi_aio_complete, r);
> - return 0;
> + DPRINTF("WRITE SAME %d (len %lu)\n",
> + req->cmd.buf[0] == WRITE_SAME_10 ? 10 : 16,
> + (long)r->req.cmd.xfer);
> + break;
> default:
> DPRINTF("Unknown SCSI command (%2.2x)\n", buf[0]);
> scsi_check_condition(r, SENSE_CODE(INVALID_OPCODE));
> --
> 1.8.4.2
>
>
Peter
- [Qemu-devel] [PATCH 00/11] block & scsi: write_zeroes support through the whole stack, Paolo Bonzini, 2013/11/12
- [Qemu-devel] [PATCH 03/11] block: add bdrv_aio_write_zeroes, Paolo Bonzini, 2013/11/12
- [Qemu-devel] [PATCH 04/11] scsi-disk: catch write protection errors in UNMAP, Paolo Bonzini, 2013/11/12
- [Qemu-devel] [PATCH 02/11] block: add flags to BlockRequest, Paolo Bonzini, 2013/11/12
- [Qemu-devel] [PATCH 01/11] block: generalize BlockLimits handling to cover bdrv_aio_discard too, Paolo Bonzini, 2013/11/12
- [Qemu-devel] [PATCH 06/11] scsi-disk: correctly implement WRITE SAME, Paolo Bonzini, 2013/11/12
- Re: [Qemu-devel] [PATCH 06/11] scsi-disk: correctly implement WRITE SAME,
Peter Lieven <=
- [Qemu-devel] [PATCH 07/11] block: handle ENOTSUP from discard in generic code, Paolo Bonzini, 2013/11/12
- [Qemu-devel] [PATCH 08/11] raw-posix: implement write_zeroes with MAY_UNMAP for files, Paolo Bonzini, 2013/11/12
- [Qemu-devel] [PATCH 10/11] raw-posix: add support for write_zeroes on XFS and block devices, Paolo Bonzini, 2013/11/12
- [Qemu-devel] [PATCH 09/11] raw-posix: implement write_zeroes with MAY_UNMAP for block devices, Paolo Bonzini, 2013/11/12