qemu-devel
Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support


From: Klaus Jensen
Subject: Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support
Date: Wed, 8 Jun 2022 22:55:30 +0200

On Jun  8 09:36, Jinhao Fan wrote:
> Implement Doorbell Buffer Config command (Section 5.7 in NVMe Spec 1.3)
> and Shadow Doorbell buffer & EventIdx buffer handling logic (Section 7.13
> in NVMe Spec 1.3). For queues created before the Doorbell Buffer Config
> command, the nvme_dbbuf_config function tries to associate each existing
> SQ and CQ with its Shadow Doorbell buffer and EventIdx buffer address.
> Queues created after the Doorbell Buffer Config command will have the
> doorbell buffers associated with them when they are initialized.
> 
> In nvme_process_sq and nvme_post_cqes, proactively check for Shadow
> Doorbell buffer changes instead of waiting for doorbell register changes.
> This reduces the number of MMIOs.
> 
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
>  hw/nvme/ctrl.c       | 95 ++++++++++++++++++++++++++++++++++++++++++--
>  hw/nvme/nvme.h       |  8 ++++
>  include/block/nvme.h |  2 +
>  3 files changed, 102 insertions(+), 3 deletions(-)
> 
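For reference, the per-queue slot layout the patch sets up boils down to the
address math below (a sketch for illustration only; the helper names are made
up and not part of the patch, and "stride" is the 4 << CAP.DSTRD value the
patch derives):

    #include <stdint.h>

    /* Each queue owns one slot in the shadow doorbell page and one in the
     * EventIdx page: SQ tails live at slot 2*qid, CQ heads at 2*qid + 1. */
    static inline uint64_t shadow_sq_slot(uint64_t base, uint16_t qid,
                                          uint32_t stride)
    {
        return base + 2 * qid * stride;        /* SQ tail / its EventIdx */
    }

    static inline uint64_t shadow_cq_slot(uint64_t base, uint16_t qid,
                                          uint32_t stride)
    {
        return base + (2 * qid + 1) * stride;  /* CQ head / its EventIdx */
    }
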
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 03760ddeae..d3f6c432df 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -223,6 +223,7 @@ static const uint32_t nvme_cse_acs[256] = {
>      [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
>      [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
>      [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
> +    [NVME_ADM_CMD_DBBUF_CONFIG]     = NVME_CMD_EFF_CSUPP,
>      [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
>  };
>  
> @@ -1304,6 +1305,12 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
>      }
>  }
>  
> +static void nvme_update_cq_head(NvmeCQueue *cq)
> +{
> +    pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
> +            sizeof(cq->head));
> +}
> +
>  static void nvme_post_cqes(void *opaque)
>  {
>      NvmeCQueue *cq = opaque;
> @@ -1316,6 +1323,10 @@ static void nvme_post_cqes(void *opaque)
>          NvmeSQueue *sq;
>          hwaddr addr;
>  
> +        if (cq->cqid && n->dbbuf_enabled) {
> +            nvme_update_cq_head(cq);

Shouldn't we update the cq head eventidx here (prior to reading the
doorbell buffer)? Like we do for the sq tail?
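
For concreteness, a sketch of what that could look like (a hypothetical
helper mirroring nvme_update_sq_eventidx() further down in the patch; the
name is an assumption, not something the patch defines):

    static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
    {
        /* publish the current CQ head as the EventIdx before reading the
         * shadow doorbell, symmetric to the SQ tail handling */
        pci_dma_write(&cq->ctrl->parent_obj, cq->ei_addr, &cq->head,
                      sizeof(cq->head));
    }

so the hunk above would then read:

    if (cq->cqid && n->dbbuf_enabled) {
        nvme_update_cq_eventidx(cq);
        nvme_update_cq_head(cq);
    }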

> +        }
> +
>          if (nvme_cq_full(cq)) {
>              break;
>          }
> @@ -4237,6 +4248,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
>  static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>                           uint16_t sqid, uint16_t cqid, uint16_t size)
>  {
> +    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
>      int i;
>      NvmeCQueue *cq;
>  
> @@ -4256,6 +4268,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>      }
>      sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
>  
> +    if (sqid && n->dbbuf_dbs && n->dbbuf_eis) {
> +        sq->db_addr = n->dbbuf_dbs + 2 * sqid * stride;
> +        sq->ei_addr = n->dbbuf_eis + 2 * sqid * stride;
> +    }
> +
>      assert(n->cq[cqid]);
>      cq = n->cq[cqid];
>      QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
> @@ -4599,6 +4616,7 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>                           uint16_t cqid, uint16_t vector, uint16_t size,
>                           uint16_t irq_enabled)
>  {
> +    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
>      int ret;
>  
>      if (msix_enabled(&n->parent_obj)) {
> @@ -4615,6 +4633,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>      cq->head = cq->tail = 0;
>      QTAILQ_INIT(&cq->req_list);
>      QTAILQ_INIT(&cq->sq_list);
> +    if (cqid && n->dbbuf_dbs && n->dbbuf_eis) {
> +        cq->db_addr = n->dbbuf_dbs + (2 * cqid + 1) * stride;
> +        cq->ei_addr = n->dbbuf_eis + (2 * cqid + 1) * stride;
> +    }
>      n->cq[cqid] = cq;
>      cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
>  }
> @@ -5767,6 +5789,43 @@ out:
>      return status;
>  }
>  
> +static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
> +{
> +    uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> +    uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
> +    uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
> +    int i;
> +
> +    /* Address should be page aligned */
> +    if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
> +
> +    /* Save shadow buffer base addr for use during queue creation */
> +    n->dbbuf_dbs = dbs_addr;
> +    n->dbbuf_eis = eis_addr;
> +    n->dbbuf_enabled = true;
> +
> +    for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
> +        NvmeSQueue *sq = n->sq[i];
> +        NvmeCQueue *cq = n->cq[i];
> +
> +        if (sq) {
> +            /* Submission queue tail pointer location, 2 * QID * stride */
> +            sq->db_addr = dbs_addr + 2 * i * stride;
> +            sq->ei_addr = eis_addr + 2 * i * stride;
> +        }
> +
> +        if (cq) {
> +            /* Completion queue head pointer location, (2 * QID + 1) * stride */
> +            cq->db_addr = dbs_addr + (2 * i + 1) * stride;
> +            cq->ei_addr = eis_addr + (2 * i + 1) * stride;
> +        }
> +    }

Why no love for the admin queue? :)

You are special-casing the admin queue below in process_sq() and
process_db(), as well as above in post_cqes(). As I'm reading the spec,
I do not see why the Admin queue should be treated differently wrt.
doorbell buffer configuration. Could this be a left-over from the
behavior in the original Google extension (prior to going into NVMe)?

I peeked into the kernel and it looks like it doesn't enable doorbell
buffers for the admin queue, only for subsequently created I/O queues.

Keith, is this a bug in the kernel? If the code here expected the
doorbell buffer to be updated for the admin queue as well, would we
break?
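
For illustration, covering the admin queue would amount to roughly the
following (a sketch only, assuming the kernel behaviour above turns out to
be a bug rather than intentional): the loop in nvme_dbbuf_config() would
start at qid 0,

    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
        NvmeSQueue *sq = n->sq[i];
        NvmeCQueue *cq = n->cq[i];

        if (sq) {
            sq->db_addr = dbs_addr + 2 * i * stride;
            sq->ei_addr = eis_addr + 2 * i * stride;
        }

        if (cq) {
            cq->db_addr = dbs_addr + (2 * i + 1) * stride;
            cq->ei_addr = eis_addr + (2 * i + 1) * stride;
        }
    }

and the qid checks in nvme_process_sq(), nvme_post_cqes() and
nvme_process_db() would reduce to plain "if (n->dbbuf_enabled)" tests.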

> +
> +    return NVME_SUCCESS;
> +}
> +
>  static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
>  {
>      trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
> @@ -5809,6 +5868,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
>          return nvme_aer(n, req);
>      case NVME_ADM_CMD_NS_ATTACHMENT:
>          return nvme_ns_attachment(n, req);
> +    case NVME_ADM_CMD_DBBUF_CONFIG:
> +        return nvme_dbbuf_config(n, req);
>      case NVME_ADM_CMD_FORMAT_NVM:
>          return nvme_format(n, req);
>      default:
> @@ -5818,6 +5879,18 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
>      return NVME_INVALID_OPCODE | NVME_DNR;
>  }
>  
> +static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
> +{
> +    pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
> +                  sizeof(sq->tail));
> +}
> +
> +static void nvme_update_sq_tail(NvmeSQueue *sq)
> +{
> +    pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
> +                 sizeof(sq->tail));
> +}
> +
>  static void nvme_process_sq(void *opaque)
>  {
>      NvmeSQueue *sq = opaque;
> @@ -5829,6 +5902,10 @@ static void nvme_process_sq(void *opaque)
>      NvmeCmd cmd;
>      NvmeRequest *req;
>  
> +    if (sq->sqid && n->dbbuf_enabled) {
> +        nvme_update_sq_tail(sq);
> +    }
> +
>      while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
>          addr = sq->dma_addr + sq->head * n->sqe_size;
>          if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
> @@ -5852,6 +5929,11 @@ static void nvme_process_sq(void *opaque)
>              req->status = status;
>              nvme_enqueue_req_completion(cq, req);
>          }
> +
> +        if (sq->sqid && n->dbbuf_enabled) {
> +            nvme_update_sq_eventidx(sq);
> +            nvme_update_sq_tail(sq);
> +        }
>      }
>  }
>  
> @@ -5889,6 +5971,9 @@ static void nvme_ctrl_reset(NvmeCtrl *n)
>      n->aer_queued = 0;
>      n->outstanding_aers = 0;
>      n->qs_created = false;
> +    n->dbbuf_dbs = 0;
> +    n->dbbuf_eis = 0;
> +    n->dbbuf_enabled = false;
>  }
>  
>  static void nvme_ctrl_shutdown(NvmeCtrl *n)
> @@ -6397,7 +6482,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>          trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
>  
>          start_sqs = nvme_cq_full(cq) ? 1 : 0;
> -        cq->head = new_head;
> +        if (!cq->cqid || !n->dbbuf_enabled) {
> +            cq->head = new_head;
> +        }
>          if (start_sqs) {
>              NvmeSQueue *sq;
>              QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> @@ -6454,7 +6541,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>  
>          trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
>  
> -        sq->tail = new_tail;
> +        if (!sq->sqid || !n->dbbuf_enabled) {
> +            sq->tail = new_tail;
> +        }
>          timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>      }
>  }
> @@ -6733,7 +6822,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
>  
>      id->mdts = n->params.mdts;
>      id->ver = cpu_to_le32(NVME_SPEC_VER);
> -    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
> +    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
>      id->cntrltype = 0x1;
>  
>      /*
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 6773819325..4452e4b1bf 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -334,6 +334,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
>      case NVME_ADM_CMD_GET_FEATURES:     return "NVME_ADM_CMD_GET_FEATURES";
>      case NVME_ADM_CMD_ASYNC_EV_REQ:     return "NVME_ADM_CMD_ASYNC_EV_REQ";
>      case NVME_ADM_CMD_NS_ATTACHMENT:    return "NVME_ADM_CMD_NS_ATTACHMENT";
> +    case NVME_ADM_CMD_DBBUF_CONFIG:     return "NVME_ADM_CMD_DBBUF_CONFIG";
>      case NVME_ADM_CMD_FORMAT_NVM:       return "NVME_ADM_CMD_FORMAT_NVM";
>      default:                            return "NVME_ADM_CMD_UNKNOWN";
>      }
> @@ -365,6 +366,8 @@ typedef struct NvmeSQueue {
>      uint32_t    tail;
>      uint32_t    size;
>      uint64_t    dma_addr;
> +    uint64_t    db_addr;
> +    uint64_t    ei_addr;
>      QEMUTimer   *timer;
>      NvmeRequest *io_req;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
> @@ -382,6 +385,8 @@ typedef struct NvmeCQueue {
>      uint32_t    vector;
>      uint32_t    size;
>      uint64_t    dma_addr;
> +    uint64_t    db_addr;
> +    uint64_t    ei_addr;
>      QEMUTimer   *timer;
>      QTAILQ_HEAD(, NvmeSQueue) sq_list;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
> @@ -432,6 +437,9 @@ typedef struct NvmeCtrl {
>      uint64_t    starttime_ms;
>      uint16_t    temperature;
>      uint8_t     smart_critical_warning;
> +    uint64_t    dbbuf_dbs;
> +    uint64_t    dbbuf_eis;
> +    bool        dbbuf_enabled;
>  
>      struct {
>          MemoryRegion mem;
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 3737351cc8..5b522d7b0e 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -595,6 +595,7 @@ enum NvmeAdminCommands {
>      NVME_ADM_CMD_ACTIVATE_FW    = 0x10,
>      NVME_ADM_CMD_DOWNLOAD_FW    = 0x11,
>      NVME_ADM_CMD_NS_ATTACHMENT  = 0x15,
> +    NVME_ADM_CMD_DBBUF_CONFIG   = 0x7c,
>      NVME_ADM_CMD_FORMAT_NVM     = 0x80,
>      NVME_ADM_CMD_SECURITY_SEND  = 0x81,
>      NVME_ADM_CMD_SECURITY_RECV  = 0x82,
> @@ -1134,6 +1135,7 @@ enum NvmeIdCtrlOacs {
>      NVME_OACS_FORMAT    = 1 << 1,
>      NVME_OACS_FW        = 1 << 2,
>      NVME_OACS_NS_MGMT   = 1 << 3,
> +    NVME_OACS_DBBUF     = 1 << 8,
>  };
>  
>  enum NvmeIdCtrlOncs {
> -- 
> 2.25.1
> 

-- 
One of us - No more doubt, silence or taboo about mental illness.
