qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v2 12/23] vfio-user: region read/write


From: Alex Williamson
Subject: Re: [PATCH v2 12/23] vfio-user: region read/write
Date: Mon, 6 Feb 2023 12:07:24 -0700

On Wed,  1 Feb 2023 21:55:48 -0800
John Johnson <john.g.johnson@oracle.com> wrote:

> Add support for posted writes on remote devices
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  hw/vfio/user-protocol.h       |  12 +++++
>  hw/vfio/user.h                |   1 +
>  include/hw/vfio/vfio-common.h |   3 +-
>  hw/vfio/common.c              |  23 ++++++---
>  hw/vfio/pci.c                 |   5 +-
>  hw/vfio/user-pci.c            |   5 ++
>  hw/vfio/user.c                | 112 
> ++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events          |   1 +
>  8 files changed, 154 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
> index 6f70a48..6987435 100644
> --- a/hw/vfio/user-protocol.h
> +++ b/hw/vfio/user-protocol.h
> @@ -139,4 +139,16 @@ typedef struct {
>      uint64_t offset;
>  } VFIOUserRegionInfo;
>  
> +/*
> + * VFIO_USER_REGION_READ
> + * VFIO_USER_REGION_WRITE
> + */
> +typedef struct {
> +    VFIOUserHdr hdr;
> +    uint64_t offset;
> +    uint32_t region;
> +    uint32_t count;
> +    char data[];
> +} VFIOUserRegionRW;
> +
>  #endif /* VFIO_USER_PROTOCOL_H */
> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
> index e6485dc..3012a86 100644
> --- a/hw/vfio/user.h
> +++ b/hw/vfio/user.h
> @@ -84,6 +84,7 @@ typedef struct VFIOUserProxy {
>  /* VFIOProxy flags */
>  #define VFIO_PROXY_CLIENT        0x1
>  #define VFIO_PROXY_FORCE_QUEUED  0x4
> +#define VFIO_PROXY_NO_POST       0x8
>  
>  VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp);
>  void vfio_user_disconnect(VFIOUserProxy *proxy);
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 9fb4c80..bbc4b15 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -57,6 +57,7 @@ typedef struct VFIORegion {
>      VFIOMmap *mmaps;
>      uint8_t nr; /* cache the region number for debug */
>      int fd; /* fd to mmap() region */
> +    bool post_wr; /* writes can be posted */
>  } VFIORegion;
>  
>  typedef struct VFIOMigration {
> @@ -180,7 +181,7 @@ struct VFIODeviceIO {
>      int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t 
> size,
>                         void *data);
>      int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t 
> size,
> -                        void *data);
> +                        void *data, bool post);
>  };
>  
>  struct VFIOContainerIO {
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index d26b325..de64e53 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          uint32_t dword;
>          uint64_t qword;
>      } buf;
> +    bool post = region->post_wr;
>      int ret;
>  
>      switch (size) {
> @@ -235,12 +236,19 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          break;
>      }
>  
> -    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf);
> +    /* read-after-write hazard if guest can directly access region */
> +    if (region->nr_mmaps) {
> +        post = false;
> +    }
> +    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf,
> +                                     post);
>      if (ret != size) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short write";
> +
>          error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
> -                     ",%d) failed: %m",
> +                     ",%d) failed: %s",
>                       __func__, vbasedev->name, region->nr,
> -                     addr, data, size);
> +                     addr, data, size, err);
>      }
>      trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
>  
> @@ -271,9 +279,11 @@ uint64_t vfio_region_read(void *opaque,
>  
>      ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf);
>      if (ret != size) {
> -        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> +        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
>                       __func__, vbasedev->name, region->nr,
> -                     addr, size);
> +                     addr, size, err);
>          return (uint64_t)-1;
>      }
>  
> @@ -1584,6 +1594,7 @@ int vfio_region_setup(Object *obj, VFIODevice 
> *vbasedev, VFIORegion *region,
>      region->size = info->size;
>      region->fd_offset = info->offset;
>      region->nr = index;
> +    region->post_wr = false;
>      if (vbasedev->regfds != NULL) {
>          region->fd = vbasedev->regfds[index];
>      } else {
> @@ -2711,7 +2722,7 @@ static int vfio_io_region_read(VFIODevice *vbasedev, 
> uint8_t index, off_t off,
>  }
>  
>  static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t 
> off,
> -                                uint32_t size, void *data)
> +                                uint32_t size, void *data, bool post)
>  {

This is all a bit confusing as a non-posted write on bare metal would
require a follow-up read to flush the write, but in the kernel case we
rely on both the bus protocols and the userspace driver to perform such
actions to enforce coherency.  The read-after-write hazard comment above
suggests the issue is split access between mmap and read/write across
the region, where the mmap access bypasses the socket protocol.  But
isn't this actually across the whole device?  For example, PCI doesn't
care which BAR a write targets, reads to another BAR cannot bypass the
write, aiui.  IOW, couldn't a write to a BAR that doesn't support mmap
affect the behavior of a BAR that does support mmap, and therefore
there should be no posted writes for the entire device if any region
supports mmap access?

I wonder if a better approach wouldn't be to make all writes operations
synchronous and avoid read-after-write hazard by performing writes
through the mmap when available, ie. make use of the same bypass to
avoid the hazard.

In any case, I think this deserves more comments, both why vfio-user
needs it and why vfio-kernel doesn't, and ideally the initial
read/write implementation would be entirely synchronous with posted
write support added as a separate patch with full explanation and
justification.  Thanks,

Alex




reply via email to

[Prev in Thread] Current Thread [Next in Thread]