qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 4/4] hw/nvme: fix controller hot unplugging


From: Stefan Hajnoczi
Subject: Re: [PATCH 4/4] hw/nvme: fix controller hot unplugging
Date: Wed, 7 Jul 2021 17:14:25 +0100

On Wed, Jul 07, 2021 at 12:43:56PM +0200, Hannes Reinecke wrote:
> On 7/7/21 11:53 AM, Klaus Jensen wrote:
> > On Jul  7 09:49, Hannes Reinecke wrote:
> > > On 7/6/21 11:33 AM, Klaus Jensen wrote:
> > > > From: Klaus Jensen <k.jensen@samsung.com>
> > > > 
> > > > Prior to this patch the nvme-ns devices are always children of the
> > > > NvmeBus owned by the NvmeCtrl. This causes the namespaces to be
> > > > unrealized when the parent device is removed. However, when subsystems
> > > > are involved, this is not what we want since the namespaces may be
> > > > attached to other controllers as well.
> > > > 
> > > > This patch adds an additional NvmeBus on the subsystem device. When
> > > > nvme-ns devices are realized, if the parent controller device is linked
> > > > to a subsystem, the parent bus is set to the subsystem one instead. This
> > > > makes sure that namespaces are kept alive and not unrealized.
> > > > 
> > > > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > > > ---
> > > >  hw/nvme/nvme.h   | 18 ++++++++++--------
> > > >  hw/nvme/ctrl.c   |  8 +++++---
> > > >  hw/nvme/ns.c     | 32 +++++++++++++++++++++++++-------
> > > >  hw/nvme/subsys.c |  4 ++++
> > > >  4 files changed, 44 insertions(+), 18 deletions(-)
> > > > 
> > > > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> > > > index c4065467d877..9401e212f9f7 100644
> > > > --- a/hw/nvme/nvme.h
> > > > +++ b/hw/nvme/nvme.h
> > > > @@ -33,12 +33,21 @@ QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES >
> > > > NVME_NSID_BROADCAST - 1);
> > > >  typedef struct NvmeCtrl NvmeCtrl;
> > > >  typedef struct NvmeNamespace NvmeNamespace;
> > > > +#define TYPE_NVME_BUS "nvme-bus"
> > > > +OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)
> > > > +
> > > > +typedef struct NvmeBus {
> > > > +    BusState parent_bus;
> > > > +    bool     is_subsys;
> > > > +} NvmeBus;
> > > > +
> > > >  #define TYPE_NVME_SUBSYS "nvme-subsys"
> > > >  #define NVME_SUBSYS(obj) \
> > > >      OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
> > > >  typedef struct NvmeSubsystem {
> > > >      DeviceState parent_obj;
> > > > +    NvmeBus     bus;
> > > >      uint8_t     subnqn[256];
> > > >      NvmeCtrl      *ctrls[NVME_MAX_CONTROLLERS];
> > > > @@ -365,13 +374,6 @@ typedef struct NvmeCQueue {
> > > >      QTAILQ_HEAD(, NvmeRequest) req_list;
> > > >  } NvmeCQueue;
> > > > -#define TYPE_NVME_BUS "nvme-bus"
> > > > -#define NVME_BUS(obj) OBJECT_CHECK(NvmeBus, (obj), TYPE_NVME_BUS)
> > > > -
> > > > -typedef struct NvmeBus {
> > > > -    BusState parent_bus;
> > > > -} NvmeBus;
> > > > -
> > > >  #define TYPE_NVME "nvme"
> > > >  #define NVME(obj) \
> > > >          OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)
> > > > @@ -463,7 +465,7 @@ typedef struct NvmeCtrl {
> > > >  static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
> > > >  {
> > > > -    if (!nsid || nsid > NVME_MAX_NAMESPACES) {
> > > > +    if (!n || !nsid || nsid > NVME_MAX_NAMESPACES) {
> > > >          return NULL;
> > > >      }
> > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> > > > index 90e3ee2b70ee..7c8fca36d9a5 100644
> > > > --- a/hw/nvme/ctrl.c
> > > > +++ b/hw/nvme/ctrl.c
> > > > @@ -6516,11 +6516,13 @@ static void nvme_exit(PCIDevice *pci_dev)
> > > >      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> > > >          ns = nvme_ns(n, i);
> > > > -        if (!ns) {
> > > > -            continue;
> > > > +        if (ns) {
> > > > +            ns->attached--;
> > > >          }
> > > > +    }
> > > > -        nvme_ns_cleanup(ns);
> > > > +    if (n->subsys) {
> > > > +        nvme_subsys_unregister_ctrl(n->subsys, n);
> > > >      }
> > > >      if (n->subsys) {
> > > > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
> > > > index 3c4f5b8c714a..612a2786d75d 100644
> > > > --- a/hw/nvme/ns.c
> > > > +++ b/hw/nvme/ns.c
> > > > @@ -444,13 +444,29 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
> > > >  static void nvme_ns_realize(DeviceState *dev, Error **errp)
> > > >  {
> > > >      NvmeNamespace *ns = NVME_NS(dev);
> > > > -    BusState *s = qdev_get_parent_bus(dev);
> > > > -    NvmeCtrl *n = NVME(s->parent);
> > > > -    NvmeSubsystem *subsys = n->subsys;
> > > > +    BusState *qbus = qdev_get_parent_bus(dev);
> > > > +    NvmeBus *bus = NVME_BUS(qbus);
> > > > +    NvmeCtrl *n = NULL;
> > > > +    NvmeSubsystem *subsys = NULL;
> > > >      uint32_t nsid = ns->params.nsid;
> > > >      int i;
> > > > -    if (!n->subsys) {
> > > > +    if (bus->is_subsys) {
> > > > +        subsys = NVME_SUBSYS(qbus->parent);
> > > > +    } else {
> > > > +        n = NVME(qbus->parent);
> > > > +        subsys = n->subsys;
> > > > +    }
> > > > +
> > > > +    if (subsys) {
> > > > +        /*
> > > > +         * If this namespace belongs to a subsystem (through a
> > > > link on the
> > > > +         * controller device), reparent the device.
> > > > +         */
> > > > +        if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) {
> > > > +            return;
> > > > +        }
> > > > +    } else {
> > > >          if (ns->params.detached) {
> > > >              error_setg(errp, "detached requires that the nvme
> > > > device is "
> > > >                         "linked to an nvme-subsys device");
> > > > @@ -470,7 +486,7 @@ static void nvme_ns_realize(DeviceState
> > > > *dev, Error **errp)
> > > >      if (!nsid) {
> > > >          for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> > > > -            if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) {
> > > > +            if (nvme_subsys_ns(subsys, i) || nvme_ns(n, i)) {
> > > >                  continue;
> > > >              }
> > > > @@ -483,7 +499,7 @@ static void nvme_ns_realize(DeviceState
> > > > *dev, Error **errp)
> > > >              return;
> > > >          }
> > > >      } else {
> > > > -        if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) {
> > > > +        if (nvme_subsys_ns(subsys, nsid) || nvme_ns(n, nsid)) {
> > > >              error_setg(errp, "namespace id '%d' already
> > > > allocated", nsid);
> > > >              return;
> > > >          }
> > > > @@ -509,7 +525,9 @@ static void nvme_ns_realize(DeviceState
> > > > *dev, Error **errp)
> > > >          }
> > > >      }
> > > > -    nvme_attach_ns(n, ns);
> > > > +    if (n) {
> > > > +        nvme_attach_ns(n, ns);
> > > > +    }
> > > >  }
> > > >  static Property nvme_ns_props[] = {
> > > > diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
> > > > index 92caa604a280..fb7c3a7c55fc 100644
> > > > --- a/hw/nvme/subsys.c
> > > > +++ b/hw/nvme/subsys.c
> > > > @@ -50,6 +50,10 @@ static void nvme_subsys_realize(DeviceState
> > > > *dev, Error **errp)
> > > >  {
> > > >      NvmeSubsystem *subsys = NVME_SUBSYS(dev);
> > > > +    qbus_create_inplace(&subsys->bus, sizeof(NvmeBus),
> > > > TYPE_NVME_BUS, dev,
> > > > +                        dev->id);
> > > > +    subsys->bus.is_subsys = true;
> > > > +
> > > >      nvme_subsys_setup(subsys);
> > > >  }
> > > > 
> > > Why not always create a subsystem, and distinguish between 'shared'
> > > and 'non-shared' subsystems instead of the ->subsys pointer?
> > > That way all namespaces can be children of the subsystem, we won't
> > > need any reparenting, and the whole thing will be more in-line with
> > > qdev, no?
> > > 
> > 
> > Hi Hannes,
> > 
> > Thanks for your reviews and comments!
> > 
> > So, I have actually considered what you suggest.
> > 
> > The problem is that the nvme-ns device currently expects to plug into a
> > bus (an 'nvme-bus') and we cannot really get rid of that 'bus' parameter
> > without breaking compatibility. I experimented with removing the bus
> > from the nvme device and instead creating it in the nvme-subsys device.
> > My idea here was to implicitly always create an nvme-subsys if not
> > explicitly created by the user, but since nvme-subsys does not plug into
> > any bus itself, it is not exposed in the qtree and thus not reachable
> > (i.e., when realizing the nvme-ns device, it will complain that there
> > are no 'nvme-bus' available to plug into). To make this work, I believe
> > we'd have to have the nvme-subsys plug into the main system bus (or some
> > other bus rooted at main-system-bus), and I'd prefer not to do that
> > since this is already a flawed design and I think it would be more
> > intrusive than what my patch does.
> > 
> Sigh.
> _Again_.
> 
> I seem to trip over issues for which no patch can possibly be accepted as
> first the infrastructure has to be reworked.
> 
> ... there is a reason why I'm avoiding posting patches to qemu-devel :-(
> 
> > I raised these issues on the mailinglist some time ago[1]. We didn't
> > really find a good solution, but I think the conclusion is that the
> > bus/device design of subsystems and namespaces is flawed. That's why I
> > am working on an "objectification" of the two devices as suggested by
> > Stefan[2] as a proper fix for this mess.
> > 
> Actually, I _do_ think that it would be good to have an nvme-subsystem bus,
> providing a list of all namespaces for that subsystem.
> Creating a controller would then mean that one has to create a controller
> and a namespace _separately_, as then the namespace would _not_ be a child
> of the controller.
> But that's arguably a good thing, as the namespaces _do_ have a separate
> lifetime from controllers.
> And the lifetime of the subsystem could be completely self-contained; the
> controller would be looking up subsystems via the subsystem NQN if present;
> I guess we'll need to create ad-hoc subsystems for PCIe.
> But nothing insurmountable.
> And certainly nothing which require a large-scale rework of qemu internals,
> I would think.

I don't remember the reason to use --object instead of --device. Maybe
the --device approach is viable and it makes some sense because this is
a "device" (like a SCSI LUN) that exposed to the guest. It would be
necessary to have a nvme-subsys device on the system bus to act as the
bus for nvme-ns devices as Klaus described.

> But _if_ you work on the rework, please ensure that not only NVMe is
> represented here. SCSI multipathing has the same issue currently; there is a
> strict host->lun->block->OS device relationship currently in SCSI, making it
> impossible to represent a multipathed SCSI device in qemu.
> The only sneaky way is to allow for a shared OS device (ie specifying the
> filename more than once), but that completely sidesteps qemu internals, too.

Do you want one SCSIDevice to be accessible via multiple SCSIBus
instances for multipathing in QEMU?

Stefan

Attachment: signature.asc
Description: PGP signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]