[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH v8 3/5] hw/block/nvme: add dulbe support
From: |
Minwoo Im |
Subject: |
Re: [PATCH v8 3/5] hw/block/nvme: add dulbe support |
Date: |
Mon, 16 Nov 2020 20:43:48 +0900 |
User-agent: |
Mutt/1.11.4 (2019-03-13) |
On 11/12 20:59, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
>
> Add support for reporting the Deallocated or Unwritten Logical Block
> Error (DULBE).
>
> Rely on the block status flags reported by the block layer and consider
> any block with the BDRV_BLOCK_ZERO flag to be deallocated.
>
> Multiple factors affect when a Write Zeroes command results in
> deallocation of blocks.
>
> * the underlying file system block size
> * the blockdev format
> * the 'discard' and 'logical_block_size' parameters
>
> format | discard | wz (512B) wz (4KiB) wz (64KiB)
> -----------------------------------------------------
> qcow2 ignore n n y
> qcow2 unmap n n y
> raw ignore n y y
> raw unmap n y y
>
> So, this works best with an image in raw format and 4KiB LBAs, since
> holes can then be punched on a per-block basis (this assumes a file
> system with a 4KiB block size, YMMV). A qcow2 image uses a cluster size
> of 64KiB by default and blocks will only be marked deallocated if a full
> cluster is zeroed or discarded. However, this *is* consistent with the
> spec since Write Zeroes "should" deallocate the block if the Deallocate
> attribute is set and "may" deallocate if the Deallocate attribute is not
> set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
> always set).
>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
> hw/block/nvme-ns.h | 4 ++
> include/block/nvme.h | 5 +++
> hw/block/nvme-ns.c | 8 ++--
> hw/block/nvme.c | 91 ++++++++++++++++++++++++++++++++++++++++++-
> hw/block/trace-events | 4 ++
> 5 files changed, 107 insertions(+), 5 deletions(-)
>
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 83734f4606e1..44bf6271b744 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -31,6 +31,10 @@ typedef struct NvmeNamespace {
> NvmeIdNs id_ns;
>
> NvmeNamespaceParams params;
> +
> + struct {
> + uint32_t err_rec;
> + } features;
> } NvmeNamespace;
>
> static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 8a46d9cf015f..966c3bb304bd 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -687,6 +687,7 @@ enum NvmeStatusCodes {
> NVME_E2E_REF_ERROR = 0x0284,
> NVME_CMP_FAILURE = 0x0285,
> NVME_ACCESS_DENIED = 0x0286,
> + NVME_DULB = 0x0287,
> NVME_MORE = 0x2000,
> NVME_DNR = 0x4000,
> NVME_NO_COMPLETE = 0xffff,
> @@ -903,6 +904,9 @@ enum NvmeIdCtrlLpa {
> #define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1)
> #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
>
> +#define NVME_ERR_REC_TLER(err_rec) (err_rec & 0xffff)
> +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000)
> +
> enum NvmeFeatureIds {
> NVME_ARBITRATION = 0x1,
> NVME_POWER_MANAGEMENT = 0x2,
> @@ -1023,6 +1027,7 @@ enum NvmeNsIdentifierType {
>
>
> #define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
> +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
> #define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
> #define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
> #define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index 31c80cdf5b5f..f1cc734c60f5 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
> NvmeIdNs *id_ns = &ns->id_ns;
> int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
>
> - if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
> - ns->id_ns.dlfeat = 0x9;
> - }
> + ns->id_ns.dlfeat = 0x9;
>
> id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
>
> @@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
> /* no thin provisioning */
> id_ns->ncap = id_ns->nsze;
> id_ns->nuse = id_ns->ncap;
> +
> + /* support DULBE */
> + id_ns->nsfeat |= 0x4;
> }
>
> static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> @@ -92,6 +93,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error
> **errp)
> }
>
> nvme_ns_init(ns);
> +
> if (nvme_register_namespace(n, ns, errp)) {
> return -1;
> }
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index a96a996ddc0d..8d75a89fd872 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
>
> static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
> [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
> + [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE |
> NVME_FEAT_CAP_NS,
> [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
> [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
> [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
> @@ -878,6 +879,49 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace
> *ns, uint64_t slba,
> return NVME_SUCCESS;
> }
>
> +static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
> + uint32_t nlb)
> +{
> + BlockDriverState *bs = blk_bs(ns->blkconf.blk);
> +
> + int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
> + int64_t offset = nvme_l2b(ns, slba);
> + bool zeroed;
> + int ret;
> +
> + Error *local_err = NULL;
> +
> + /*
> + * `pnum` holds the number of bytes after offset that shares the same
> + * allocation status as the byte at offset. If `pnum` is different from
> + * `bytes`, we should check the allocation status of the next range and
> + * continue this until all bytes have been checked.
> + */
> + do {
> + bytes -= pnum;
> +
> + ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
> + if (ret < 0) {
> + error_setg_errno(&local_err, -ret, "unable to get block status");
> + error_report_err(local_err);
> +
> + return NVME_INTERNAL_DEV_ERROR;
> + }
> +
> + zeroed = !!(ret & BDRV_BLOCK_ZERO);
> +
> + trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
> +
> + if (zeroed) {
> + return NVME_DULB;
> + }
> +
> + offset += pnum;
> + } while (pnum != bytes);
> +
> + return NVME_SUCCESS;
> +}
> +
> static void nvme_aio_err(NvmeRequest *req, int ret)
> {
> uint16_t status = NVME_SUCCESS;
> @@ -996,6 +1040,15 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
> goto invalid;
> }
>
> + if (acct == BLOCK_ACCT_READ) {
> + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
> + status = nvme_check_dulbe(ns, slba, nlb);
> + if (status) {
> + goto invalid;
> + }
> + }
> + }
> +
> status = nvme_map_dptr(n, data_size, req);
> if (status) {
> goto invalid;
> @@ -1643,6 +1696,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n,
> NvmeRequest *req)
> uint8_t fid = NVME_GETSETFEAT_FID(dw10);
> NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
> uint16_t iv;
> + NvmeNamespace *ns;
>
> static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
> [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
> @@ -1705,6 +1759,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n,
> NvmeRequest *req)
> }
>
> return NVME_INVALID_FIELD | NVME_DNR;
> + case NVME_ERROR_RECOVERY:
> + if (!nvme_nsid_valid(n, nsid)) {
> + return NVME_INVALID_NSID | NVME_DNR;
> + }
> +
> + ns = nvme_ns(n, nsid);
> + if (unlikely(!ns)) {
> + return NVME_INVALID_FIELD | NVME_DNR;
> + }
> +
> + result = ns->features.err_rec;
> + goto out;
> case NVME_VOLATILE_WRITE_CACHE:
> result = n->features.vwc;
> trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
> @@ -1777,7 +1843,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n,
> NvmeRequest *req)
>
> static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
> {
> - NvmeNamespace *ns;
> + NvmeNamespace *ns = NULL;
>
> NvmeCmd *cmd = &req->cmd;
> uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> @@ -1785,6 +1851,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n,
> NvmeRequest *req)
> uint32_t nsid = le32_to_cpu(cmd->nsid);
> uint8_t fid = NVME_GETSETFEAT_FID(dw10);
> uint8_t save = NVME_SETFEAT_SAVE(dw10);
> + int i;
>
> trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
>
> @@ -1844,11 +1911,31 @@ static uint16_t nvme_set_feature(NvmeCtrl *n,
> NvmeRequest *req)
> NVME_LOG_SMART_INFO);
> }
>
> + break;
> + case NVME_ERROR_RECOVERY:
> + if (nsid == NVME_NSID_BROADCAST) {
> + for (i = 1; i <= n->num_namespaces; i++) {
> + ns = nvme_ns(n, i);
> +
> + if (!ns) {
> + continue;
> + }
> +
> + if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
> + ns->features.err_rec = dw11;
> + }
> + }
> +
> + break;
> + }
> +
> + assert(ns);
Klaus,
Sorry, but should we really assert on a failure that can happen due to
user misbehavior? Why don't we return NVME_INVALID_NSID in this case?
- [PATCH v8 0/5] hw/block/nvme: dulbe and dsm support, Klaus Jensen, 2020/11/12
- [PATCH v8 3/5] hw/block/nvme: add dulbe support, Klaus Jensen, 2020/11/12
- [PATCH v8 5/5] hw/block/nvme: add the dataset management command, Klaus Jensen, 2020/11/12
- [PATCH v8 4/5] nvme: add namespace I/O optimization fields to shared header, Klaus Jensen, 2020/11/12
- Re: [PATCH v8 0/5] hw/block/nvme: dulbe and dsm support, Klaus Jensen, 2020/11/23