qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH RFC v5 12/12] hw/block/nvme: add persistence for zone info


From: Klaus Jensen
Subject: [PATCH RFC v5 12/12] hw/block/nvme: add persistence for zone info
Date: Fri, 27 Nov 2020 00:46:01 +0100

From: Klaus Jensen <k.jensen@samsung.com>

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 docs/specs/nvme.txt   |  15 +++
 hw/block/nvme-ns.h    |  16 ++++
 hw/block/nvme-ns.c    | 212 +++++++++++++++++++++++++++++++++++++++++-
 hw/block/nvme.c       |  87 +++++++++++++++++
 hw/block/trace-events |   2 +
 5 files changed, 331 insertions(+), 1 deletion(-)

diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
index 03bb4d9516b4..05d81c88ad4e 100644
--- a/docs/specs/nvme.txt
+++ b/docs/specs/nvme.txt
@@ -20,6 +20,21 @@ The nvme device (-device nvme) emulates an NVM Express 
Controller.
   `zns.mor`; Specifies the number of open resources available. This is a 0s
      based value.
 
+  `zns.pstate`; This parameter specifies another blockdev to be used for
+     storing zone state persistently.
+
+       -drive id=zns-pstate,file=zns-pstate.img,format=raw
+       -device nvme-ns,zns.pstate=zns-pstate,...
+
+     To reset (or initialize) state, the blockdev image should be of zero size:
+
+       qemu-img create -f raw zns-pstate.img 0
+
+     The image will be initialized with a file format header and truncated to
+     the required size. If the pstate given is of non-zero size, it will be
+     assumed to already contain zone state information and the header will be
+     checked.
+
 
 Reference Specifications
 ------------------------
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 05a79a214605..5cb4c1da59ce 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,6 +19,15 @@
 #define NVME_NS(obj) \
     OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+#define NVME_ZONE_PSTATE_MAGIC ((0x00 << 24) | ('S' << 16) | ('N' << 8) | 'Z')
+#define NVME_ZONE_PSTATE_V1 1
+
+typedef struct NvmeZonePStateHeader {
+    uint32_t magic;
+    uint32_t version;
+    uint8_t  rsvd8[4088];
+} QEMU_PACKED NvmeZonePStateHeader;
+
 typedef struct NvmeNamespaceParams {
     uint32_t nsid;
     uint8_t  iocs;
@@ -74,6 +83,8 @@ typedef struct NvmeNamespace {
             QTAILQ_HEAD(, NvmeZone) lru_open;
             QTAILQ_HEAD(, NvmeZone) lru_active;
         } resources;
+
+        BlockBackend *pstate;
     } zns;
 } NvmeNamespace;
 
@@ -186,4 +197,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_flush(NvmeNamespace *ns);
 
+static inline void _nvme_ns_check_size(void)
+{
+    QEMU_BUILD_BUG_ON(sizeof(NvmeZonePStateHeader) != 4096);
+}
+
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index cd0f075dd281..4f311dd704c0 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -50,6 +50,31 @@ const char *nvme_zs_to_str(NvmeZoneState zs)
     return "UNKNOWN";
 }
 
+static int nvme_blk_truncate(BlockBackend *blk, size_t len, Error **errp)
+{
+    int ret;
+    uint64_t perm, shared_perm;
+
+    blk_get_perm(blk, &perm, &shared_perm);
+
+    ret = blk_set_perm(blk, perm | BLK_PERM_RESIZE, shared_perm, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = blk_truncate(blk, len, false, PREALLOC_MODE_OFF, 0, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = blk_set_perm(blk, perm, shared_perm, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
 static void nvme_ns_zns_init_zones(NvmeNamespace *ns)
 {
     NvmeZone *zone;
@@ -153,6 +178,176 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
     return 0;
 }
 
+static int nvme_ns_zns_restore_zone_state(NvmeNamespace *ns, Error **errp)
+{
+    for (int i = 0; i < ns->zns.num_zones; i++) {
+        NvmeZone *zone = &ns->zns.zones[i];
+        zone->zd = &ns->zns.zd[i];
+        if (ns->params.zns.zdes) {
+            zone->zde = &ns->zns.zde[i];
+        }
+
+        switch (nvme_zs(zone)) {
+        case NVME_ZS_ZSE:
+        case NVME_ZS_ZSF:
+        case NVME_ZS_ZSRO:
+        case NVME_ZS_ZSO:
+            break;
+
+        case NVME_ZS_ZSC:
+            if (nvme_wp(zone) == nvme_zslba(zone) &&
+                !(zone->zd->za & NVME_ZA_ZDEV)) {
+                nvme_zs_set(zone, NVME_ZS_ZSE);
+                break;
+            }
+
+            if (ns->zns.resources.active) {
+                ns->zns.resources.active--;
+                QTAILQ_INSERT_TAIL(&ns->zns.resources.lru_active, zone,
+                                   lru_entry);
+                break;
+            }
+
+            /* fallthrough */
+
+        case NVME_ZS_ZSIO:
+        case NVME_ZS_ZSEO:
+            zone->zd->wp = zone->zd->zslba;
+            nvme_zs_set(zone, NVME_ZS_ZSF);
+            break;
+
+        default:
+            error_setg(errp, "invalid zone state");
+            return -1;
+        }
+
+        zone->wp_staging = nvme_wp(zone);
+    }
+
+    return 0;
+}
+
+static int nvme_ns_zns_init_pstate(NvmeNamespace *ns, Error **errp)
+{
+    BlockBackend *blk = ns->zns.pstate;
+    NvmeZonePStateHeader header;
+    size_t zd_len, zde_len;
+    int ret;
+
+    zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    zde_len = ns->zns.num_zones * nvme_ns_zdes_bytes(ns);
+
+    ret = nvme_blk_truncate(blk, zd_len + zde_len + sizeof(header), errp);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not truncate zone pstate");
+        return ret;
+    }
+
+    nvme_ns_zns_init_zones(ns);
+
+    ret = blk_pwrite(blk, 0, ns->zns.zd, zd_len, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone descriptors to "
+                         "zone pstate");
+        return ret;
+    }
+
+    header = (NvmeZonePStateHeader) {
+        .magic = cpu_to_le32(NVME_ZONE_PSTATE_MAGIC),
+        .version = cpu_to_le32(NVME_ZONE_PSTATE_V1),
+    };
+
+    ret = blk_pwrite(blk, zd_len + zde_len, &header, sizeof(header), 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone pstate header");
+        return ret;
+    }
+
+    return 0;
+}
+
+static int nvme_ns_zns_load_pstate(NvmeNamespace *ns, size_t len, Error **errp)
+{
+    BlockBackend *blk = ns->zns.pstate;
+    NvmeZonePStateHeader header;
+    size_t zd_len, zde_len;
+    int ret;
+
+    ret = blk_pread(blk, len - sizeof(header), &header, sizeof(header));
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not read zone pstate header");
+        return ret;
+    }
+
+    if (le32_to_cpu(header.magic) != NVME_ZONE_PSTATE_MAGIC) {
+        error_setg(errp, "invalid zone pstate header");
+        return -1;
+    } else if (le32_to_cpu(header.version) > NVME_ZONE_PSTATE_V1) {
+        error_setg(errp, "unsupported zone pstate version");
+        return -1;
+    }
+
+    zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    zde_len = ns->zns.num_zones * nvme_ns_zdes_bytes(ns);
+
+    ret = blk_pread(blk, 0, ns->zns.zd, zd_len);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not read zone descriptors from "
+                         "zone pstate");
+        return ret;
+    }
+
+    if (zde_len) {
+        ret = blk_pread(blk, zd_len, ns->zns.zde, zde_len);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "could not read zone descriptor "
+                             "extensions from zone pstate");
+            return ret;
+        }
+    }
+
+    if (nvme_ns_zns_restore_zone_state(ns, errp)) {
+        return -1;
+    }
+
+    ret = blk_pwrite(blk, 0, ns->zns.zd, zd_len, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone descriptors to "
+                         "zone pstate");
+        return ret;
+    }
+
+    return 0;
+}
+
+static int nvme_ns_zns_setup_pstate(NvmeNamespace *ns, Error **errp)
+{
+    BlockBackend *blk = ns->zns.pstate;
+    uint64_t perm, shared_perm;
+    ssize_t len;
+    int ret;
+
+    perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+    shared_perm = BLK_PERM_ALL;
+
+    ret = blk_set_perm(blk, perm, shared_perm, errp);
+    if (ret) {
+        return ret;
+    }
+
+    len = blk_getlength(blk);
+    if (len < 0) {
+        error_setg_errno(errp, -len, "could not determine zone pstate size");
+        return len;
+    }
+
+    if (!len) {
+        return nvme_ns_zns_init_pstate(ns, errp);
+    }
+
+    return nvme_ns_zns_load_pstate(ns, len, errp);
+}
+
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
@@ -236,7 +431,13 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp)
     }
 
     if (nvme_ns_zoned(ns)) {
-        nvme_ns_zns_init_zones(ns);
+        if (ns->zns.pstate) {
+            if (nvme_ns_zns_setup_pstate(ns, errp)) {
+                return -1;
+            }
+        } else {
+            nvme_ns_zns_init_zones(ns);
+        }
     }
 
     if (nvme_register_namespace(n, ns, errp)) {
@@ -249,11 +450,19 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp)
 void nvme_ns_drain(NvmeNamespace *ns)
 {
     blk_drain(ns->blkconf.blk);
+
+    if (ns->zns.pstate) {
+        blk_drain(ns->zns.pstate);
+    }
 }
 
 void nvme_ns_flush(NvmeNamespace *ns)
 {
     blk_flush(ns->blkconf.blk);
+
+    if (ns->zns.pstate) {
+        blk_flush(ns->zns.pstate);
+    }
 }
 
 static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -283,6 +492,7 @@ static Property nvme_ns_props[] = {
     DEFINE_PROP_UINT8("zns.zdes", NvmeNamespace, params.zns.zdes, 0),
     DEFINE_PROP_UINT32("zns.mar", NvmeNamespace, params.zns.mar, 0xffffffff),
     DEFINE_PROP_UINT32("zns.mor", NvmeNamespace, params.zns.mor, 0xffffffff),
+    DEFINE_PROP_DRIVE("zns.pstate", NvmeNamespace, zns.pstate),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e62efd7cf0c4..04ad9f20223d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1023,6 +1023,46 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, 
uint64_t slba,
     return NVME_SUCCESS;
 }
 
+static int nvme_zns_commit_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+    uint64_t zslba;
+    int64_t offset;
+
+    if (!ns->zns.pstate) {
+        return 0;
+    }
+
+    trace_pci_nvme_zns_commit_zone(nvme_nsid(ns), nvme_zslba(zone));
+
+    zslba = nvme_zslba(zone);
+    offset = nvme_ns_zone_idx(ns, zslba) * sizeof(NvmeZoneDescriptor);
+
+    return blk_pwrite(ns->zns.pstate, offset, zone->zd,
+                      sizeof(NvmeZoneDescriptor), 0);
+}
+
+static int nvme_zns_commit_zde(NvmeNamespace *ns, NvmeZone *zone)
+{
+    uint64_t zslba;
+    int zidx;
+    size_t zd_len, zdes_bytes;
+    int64_t offset;
+
+    if (!ns->zns.pstate) {
+        return 0;
+    }
+
+    trace_pci_nvme_zns_commit_zde(nvme_nsid(ns), nvme_zslba(zone));
+
+    zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    zslba = nvme_zslba(zone);
+    zidx = nvme_ns_zone_idx(ns, zslba);
+    zdes_bytes = nvme_ns_zdes_bytes(ns);
+    offset = zd_len + zidx * zdes_bytes;
+
+    return blk_pwrite(ns->zns.pstate, offset, zone->zde, zdes_bytes, 0);
+}
+
 static inline void nvme_zone_reset_wp(NvmeZone *zone)
 {
     zone->zd->wp = zone->zd->zslba;
@@ -1058,6 +1098,10 @@ static uint16_t nvme_zrm_release_open(NvmeNamespace *ns)
             return status;
         }
 
+        if (nvme_zns_commit_zone(ns, candidate) < 0) {
+            return NVME_INTERNAL_DEV_ERROR;
+        }
+
         return NVME_SUCCESS;
     }
 
@@ -1252,6 +1296,10 @@ static uint16_t __nvme_zns_advance_wp(NvmeNamespace *ns, 
NvmeZone *zone,
         if (status) {
             return status;
         }
+
+        if (nvme_zns_commit_zone(ns, zone) < 0) {
+            return NVME_INTERNAL_DEV_ERROR;
+        }
     }
 
     return NVME_SUCCESS;
@@ -1307,6 +1355,10 @@ static void nvme_aio_err(NvmeRequest *req, int ret, 
NvmeZone *zone)
             NVME_ZS_ZSRO : NVME_ZS_ZSO;
 
         nvme_zrm_transition(ns, zone, zs);
+
+        if (nvme_zns_commit_zone(req->ns, zone) < 0) {
+            req->status = NVME_INTERNAL_DEV_ERROR;
+        }
     }
 
     /*
@@ -1618,6 +1670,10 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret)
         nvme_aio_err(req, ret, zone);
     }
 
+    if (nvme_zns_commit_zone(req->ns, zone) < 0) {
+        req->status = NVME_INTERNAL_DEV_ERROR;
+    }
+
     (*resets)--;
 
     if (*resets) {
@@ -1657,6 +1713,10 @@ static uint16_t nvme_zone_mgmt_send_close(NvmeCtrl *n, 
NvmeRequest *req,
         return status;
     }
 
+    if (nvme_zns_commit_zone(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -1678,6 +1738,10 @@ static uint16_t nvme_zone_mgmt_send_finish(NvmeCtrl *n, 
NvmeRequest *req,
         return status;
     }
 
+    if (nvme_zns_commit_zone(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -1699,6 +1763,10 @@ static uint16_t nvme_zone_mgmt_send_open(NvmeCtrl *n, 
NvmeRequest *req,
         return status;
     }
 
+    if (nvme_zns_commit_zone(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -1754,6 +1822,10 @@ static uint16_t nvme_zone_mgmt_send_offline(NvmeCtrl *n, 
NvmeRequest *req,
     case NVME_ZS_ZSRO:
         nvme_zrm_transition(ns, zone, NVME_ZS_ZSO);
 
+        if (nvme_zns_commit_zone(ns, zone) < 0) {
+            return NVME_INTERNAL_DEV_ERROR;
+        }
+
         /* fallthrough */
 
     case NVME_ZS_ZSO:
@@ -1793,6 +1865,10 @@ static uint16_t nvme_zone_mgmt_send_set_zde(NvmeCtrl *n, 
NvmeRequest *req,
         return status;
     }
 
+    if (nvme_zns_commit_zde(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     status = nvme_zrm_transition(ns, zone, NVME_ZS_ZSC);
     if (status) {
         return status;
@@ -1800,6 +1876,10 @@ static uint16_t nvme_zone_mgmt_send_set_zde(NvmeCtrl *n, 
NvmeRequest *req,
 
     NVME_ZA_SET(zone->zd->za, NVME_ZA_ZDEV);
 
+    if (nvme_zns_commit_zone(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -2502,6 +2582,11 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
                 goto invalid;
             }
 
+            if (nvme_zns_commit_zone(ns, zone) < 0) {
+                status = NVME_INTERNAL_DEV_ERROR;
+                goto invalid;
+            }
+
             break;
         }
 
@@ -3778,6 +3863,8 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
                         nvme_zrm_transition(ns, zone, NVME_ZS_ZSE);
                     }
 
+                    nvme_zns_commit_zone(ns, zone);
+
                     /* fallthrough */
 
                 default:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 31482bfba1fe..aa5491c398b9 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -96,6 +96,8 @@ pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, 
uint16_t status) "c
 pci_nvme_zrm_transition(uint32_t nsid, uint64_t zslba, const char *s_from, 
uint8_t from, const char *s_to, uint8_t to) "nsid %"PRIu32" zslba 0x%"PRIx64" 
from '%s' (%"PRIu8") to '%s' (%"PRIu8")"
 pci_nvme_zrm_release_open(uint32_t nsid) "nsid %"PRIu32""
 pci_nvme_zns_advance_wp(uint32_t nsid, uint64_t zslba, uint64_t wp_orig, 
uint32_t nlb) "nsid 0x%"PRIx32" zslba 0x%"PRIx64" wp_orig 0x%"PRIx64" nlb 
%"PRIu32""
+pci_nvme_zns_commit_zone(uint32_t nsid, uint64_t zslba) "nsid 0x%"PRIx32" 
zslba 0x%"PRIx64""
+pci_nvme_zns_commit_zde(uint32_t nsid, uint64_t zslba) "nsid 0x%"PRIx32" zslba 
0x%"PRIx64""
 pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 
0x%"PRIx64""
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" 
new_head %"PRIu16""
-- 
2.29.2




reply via email to

[Prev in Thread] Current Thread [Next in Thread]