qemu-devel

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support


From: Jinhao Fan
Subject: Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support
Date: Thu, 9 Jun 2022 09:49:47 +0800



On Jun 9, 2022, at 4:55 AM, Klaus Jensen <its@irrelevant.dk> wrote:

On Jun  8 09:36, Jinhao Fan wrote:
Implement the Doorbell Buffer Config command (Section 5.7 in NVMe Spec
1.3) and the Shadow Doorbell buffer & EventIdx buffer handling logic
(Section 7.13 in NVMe Spec 1.3). For queues created before the Doorbell
Buffer Config command, the nvme_dbbuf_config function tries to
associate each existing SQ and CQ with its Shadow Doorbell buffer and
EventIdx buffer address. Queues created after the Doorbell Buffer
Config command will have the doorbell buffers associated with them when
they are initialized.

In nvme_process_sq and nvme_post_cqes, proactively check for Shadow
Doorbell buffer changes instead of waiting for doorbell register
changes. This reduces the number of MMIOs.
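
As a rough picture of the layout this implements (the offsets follow
the formulas used in nvme_dbbuf_config() below; stride = 4 <<
CAP.DSTRD, cf. NVMe 1.3 Section 7.13):

    prp1 (shadow doorbells)       prp2 (eventidx)
    +---------------------+       +---------------------+
    | SQ0 tail            |       | SQ0 eventidx        |
    | CQ0 head            |       | CQ0 eventidx        |
    | SQ1 tail            |       | SQ1 eventidx        |
    | CQ1 head            |       | CQ1 eventidx        |
    | ...                 |       | ...                 |
    +---------------------+       +---------------------+

SQ n's tail slot lives at prp1 + (2n) * stride and CQ n's head slot at
prp1 + (2n + 1) * stride.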

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
hw/nvme/ctrl.c       | 95 ++++++++++++++++++++++++++++++++++++++++++--
hw/nvme/nvme.h       |  8 ++++
include/block/nvme.h |  2 +
3 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 03760ddeae..d3f6c432df 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -223,6 +223,7 @@ static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
+    [NVME_ADM_CMD_DBBUF_CONFIG]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

@@ -1304,6 +1305,12 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
    }
}

+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+    pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
+            sizeof(cq->head));
+}
+
static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
@@ -1316,6 +1323,10 @@ static void nvme_post_cqes(void *opaque)
        NvmeSQueue *sq;
        hwaddr addr;

+        if (cq->cqid && n->dbbuf_enabled) {
+            nvme_update_cq_head(cq);

Shouldn't we update the cq head eventidx here (prior to reading the
doorbell buffer), like we do for the sq tail?

I’m not sure whether updating the cq head eventidx is necessary. My
understanding is that the purpose of updating the eventidx is for the
guest to notify the host about cq head or sq tail changes via MMIO. For
the sq tail this is necessary because, other than MMIO, there is no way
to trigger the host to check the shadow doorbell for sq tail updates.
For the cq head it is different: the host reads the cq head shadow
doorbell every time it wants to post a cqe, so having the guest notify
the host about cq head changes seems unnecessary.

Please correct me if I’m missing something.
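
For context, the guest side decides whether to ring the MMIO doorbell
at all with a check along the lines of nvme_dbbuf_need_event() in the
Linux driver (drivers/nvme/host/pci.c); a minimal sketch:

    #include <stdint.h>
    #include <stdbool.h>

    /* Ring the MMIO doorbell only if the host's advertised eventidx
     * falls within the window of entries consumed/produced since the
     * guest's last doorbell write (all arithmetic is mod 2^16). */
    static bool nvme_dbbuf_need_event(uint16_t event_idx, uint16_t new_idx,
                                      uint16_t old_idx)
    {
        return (uint16_t)(new_idx - event_idx - 1) <
               (uint16_t)(new_idx - old_idx);
    }

With this check, an eventidx the host never advances eventually falls
outside the (old, new] window and the guest stops ringing that doorbell
altogether, which is fine exactly as long as the host never depends on
a cq doorbell MMIO to make progress.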


+        }
+
        if (nvme_cq_full(cq)) {
            break;
        }
@@ -4237,6 +4248,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t sqid, uint16_t cqid, uint16_t size)
{
+    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
    int i;
    NvmeCQueue *cq;

@@ -4256,6 +4268,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
    }
    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);

+    if (sqid && n->dbbuf_dbs && n->dbbuf_eis) {
+        sq->db_addr = n->dbbuf_dbs + 2 * sqid * stride;
+        sq->ei_addr = n->dbbuf_eis + 2 * sqid * stride;
+    }
+
    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
@@ -4599,6 +4616,7 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t cqid, uint16_t vector, uint16_t size,
                         uint16_t irq_enabled)
{
+    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
    int ret;

    if (msix_enabled(&n->parent_obj)) {
@@ -4615,6 +4633,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
+    if (cqid && n->dbbuf_dbs && n->dbbuf_eis) {
+        cq->db_addr = n->dbbuf_dbs + (2 * cqid + 1) * stride;
+        cq->ei_addr = n->dbbuf_eis + (2 * cqid + 1) * stride;
+    }
    n->cq[cqid] = cq;
    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}
@@ -5767,6 +5789,43 @@ out:
    return status;
}

+static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
+{
+    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
+    uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
+    uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+    int i;
+
+    /* Address should be page aligned */
+    if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    /* Save shadow buffer base addr for use during queue creation */
+    n->dbbuf_dbs = dbs_addr;
+    n->dbbuf_eis = eis_addr;
+    n->dbbuf_enabled = true;
+
+    for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
+        NvmeSQueue *sq = n->sq[i];
+        NvmeCQueue *cq = n->cq[i];
+
+        if (sq) {
+            /* Submission queue tail pointer location, 2 * QID * stride */
+            sq->db_addr = dbs_addr + 2 * i * stride;
+            sq->ei_addr = eis_addr + 2 * i * stride;
+        }
+
+        if (cq) {
+            /* Completion queue head pointer location, (2 * QID + 1) * stride */
+            cq->db_addr = dbs_addr + (2 * i + 1) * stride;
+            cq->ei_addr = eis_addr + (2 * i + 1) * stride;
+        }
+    }

Why no love for the admin queue? :)

You are special-casing the admin queue below in process_sq() and
process_db(), as well as above in post_cqes(). As I'm reading the spec,
I do not see why the Admin queue should be treated differently wrt.
doorbell buffer configuration. Could this be a left-over from the
behavior in the original Google extension (prior to going into NVMe)?

Yes, this is “inherited” from the Google extension code. I also checked
version 1.3d of the spec, and Table 64 even lists the addresses for the
admin queue’s shadow doorbell buffer and eventidx buffer.
It seems we do need to take the admin queue into account.
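
Concretely, applying the patch’s own formulas to qid == 0 (a
hypothetical sketch, not what the patch currently does) would put the
admin queue slots at the very start of the shadow buffers, matching
Table 64:

    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);

    /* Admin SQ tail slot: 2 * 0 * stride == offset 0 */
    n->sq[0]->db_addr = n->dbbuf_dbs;
    n->sq[0]->ei_addr = n->dbbuf_eis;

    /* Admin CQ head slot: (2 * 0 + 1) * stride == one stride in */
    n->cq[0]->db_addr = n->dbbuf_dbs + stride;
    n->cq[0]->ei_addr = n->dbbuf_eis + stride;

i.e. starting the loop in nvme_dbbuf_config() at i = 0 and dropping the
sqid/cqid special cases elsewhere would presumably cover it.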


I peeked into the kernel and it looks like it doesn't enable doorbell
buffers for the admin queue, only for subsequently created I/O queues.

Keith, is this a bug in the kernel? If the code here expected the
doorbell buffer to be updated for the admin queue as well, would we
break?

+
+    return NVME_SUCCESS;
+}
+
static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -5809,6 +5868,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
        return nvme_aer(n, req);
    case NVME_ADM_CMD_NS_ATTACHMENT:
        return nvme_ns_attachment(n, req);
+    case NVME_ADM_CMD_DBBUF_CONFIG:
+        return nvme_dbbuf_config(n, req);
    case NVME_ADM_CMD_FORMAT_NVM:
        return nvme_format(n, req);
    default:
@@ -5818,6 +5879,18 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
    return NVME_INVALID_OPCODE | NVME_DNR;
}

+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+    pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
+                  sizeof(sq->tail));
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+    pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
+                 sizeof(sq->tail));
+}
+
static void nvme_process_sq(void *opaque)
{
    NvmeSQueue *sq = opaque;
@@ -5829,6 +5902,10 @@ static void nvme_process_sq(void *opaque)
    NvmeCmd cmd;
    NvmeRequest *req;

+    if (sq->sqid && n->dbbuf_enabled) {
+        nvme_update_sq_tail(sq);
+    }
+
    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
        addr = sq->dma_addr + sq->head * n->sqe_size;
        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
@@ -5852,6 +5929,11 @@ static void nvme_process_sq(void *opaque)
            req->status = status;
            nvme_enqueue_req_completion(cq, req);
        }
+
+        if (sq->sqid && n->dbbuf_enabled) {
+            nvme_update_sq_eventidx(sq);
+            nvme_update_sq_tail(sq);
+        }
    }
}

@@ -5889,6 +5971,9 @@ static void nvme_ctrl_reset(NvmeCtrl *n)
    n->aer_queued = 0;
    n->outstanding_aers = 0;
    n->qs_created = false;
+    n->dbbuf_dbs = 0;
+    n->dbbuf_eis = 0;
+    n->dbbuf_enabled = false;
}

static void nvme_ctrl_shutdown(NvmeCtrl *n)
@@ -6397,7 +6482,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
        trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);

        start_sqs = nvme_cq_full(cq) ? 1 : 0;
-        cq->head = new_head;
+        if (!cq->cqid || !n->dbbuf_enabled) {
+            cq->head = new_head;
+        }
        if (start_sqs) {
            NvmeSQueue *sq;
            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -6454,7 +6541,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)

        trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);

-        sq->tail = new_tail;
+        if (!sq->sqid || !n->dbbuf_enabled) {
+            sq->tail = new_tail;
+        }
        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
    }
}
@@ -6733,7 +6822,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)

    id->mdts = n->params.mdts;
    id->ver = cpu_to_le32(NVME_SPEC_VER);
-    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
+    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
    id->cntrltype = 0x1;

    /*
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 6773819325..4452e4b1bf 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -334,6 +334,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
    case NVME_ADM_CMD_GET_FEATURES:     return "NVME_ADM_CMD_GET_FEATURES";
    case NVME_ADM_CMD_ASYNC_EV_REQ:     return "NVME_ADM_CMD_ASYNC_EV_REQ";
    case NVME_ADM_CMD_NS_ATTACHMENT:    return "NVME_ADM_CMD_NS_ATTACHMENT";
+    case NVME_ADM_CMD_DBBUF_CONFIG:     return "NVME_ADM_CMD_DBBUF_CONFIG";
    case NVME_ADM_CMD_FORMAT_NVM:       return "NVME_ADM_CMD_FORMAT_NVM";
    default:                            return "NVME_ADM_CMD_UNKNOWN";
    }
@@ -365,6 +366,8 @@ typedef struct NvmeSQueue {
    uint32_t    tail;
    uint32_t    size;
    uint64_t    dma_addr;
+    uint64_t    db_addr;
+    uint64_t    ei_addr;
    QEMUTimer   *timer;
    NvmeRequest *io_req;
    QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -382,6 +385,8 @@ typedef struct NvmeCQueue {
    uint32_t    vector;
    uint32_t    size;
    uint64_t    dma_addr;
+    uint64_t    db_addr;
+    uint64_t    ei_addr;
    QEMUTimer   *timer;
    QTAILQ_HEAD(, NvmeSQueue) sq_list;
    QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -432,6 +437,9 @@ typedef struct NvmeCtrl {
    uint64_t    starttime_ms;
    uint16_t    temperature;
    uint8_t     smart_critical_warning;
+    uint64_t    dbbuf_dbs;
+    uint64_t    dbbuf_eis;
+    bool        dbbuf_enabled;

    struct {
        MemoryRegion mem;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3737351cc8..5b522d7b0e 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -595,6 +595,7 @@ enum NvmeAdminCommands {
    NVME_ADM_CMD_ACTIVATE_FW    = 0x10,
    NVME_ADM_CMD_DOWNLOAD_FW    = 0x11,
    NVME_ADM_CMD_NS_ATTACHMENT  = 0x15,
+    NVME_ADM_CMD_DBBUF_CONFIG   = 0x7c,
    NVME_ADM_CMD_FORMAT_NVM     = 0x80,
    NVME_ADM_CMD_SECURITY_SEND  = 0x81,
    NVME_ADM_CMD_SECURITY_RECV  = 0x82,
@@ -1134,6 +1135,7 @@ enum NvmeIdCtrlOacs {
    NVME_OACS_FORMAT    = 1 << 1,
    NVME_OACS_FW        = 1 << 2,
    NVME_OACS_NS_MGMT   = 1 << 3,
+    NVME_OACS_DBBUF     = 1 << 8,
};

enum NvmeIdCtrlOncs {
-- 
2.25.1


-- 
One of us - No more doubt, silence or taboo about mental illness.

