qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v6 5/6] Inter-VM shared memory PCI device


From: Blue Swirl
Subject: Re: [Qemu-devel] [PATCH v6 5/6] Inter-VM shared memory PCI device
Date: Sat, 5 Jun 2010 09:44:41 +0000

On Fri, Jun 4, 2010 at 9:45 PM, Cam Macdonell <address@hidden> wrote:
> Support an inter-vm shared memory device that maps a shared-memory object as a
> PCI device in the guest.  This patch also supports interrupts between guest by
> communicating over a unix domain socket.  This patch applies to the qemu-kvm
> repository.
>
>    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>
> Interrupts are supported between multiple VMs by using a shared memory server
> by using a chardev socket.
>
>    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>           [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
>    -chardev socket,path=<path>,id=<id>
>
> (shared memory server is qemu.git/contrib/ivshmem-server)
>
> Sample programs and init scripts are in a git repo here:
>
>    www.gitorious.org/nahanni
> ---
>  Makefile.target |    3 +
>  hw/ivshmem.c    |  852 
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  qemu-char.c     |    6 +
>  qemu-char.h     |    3 +
>  qemu-doc.texi   |   43 +++
>  5 files changed, 907 insertions(+), 0 deletions(-)
>  create mode 100644 hw/ivshmem.c
>
> diff --git a/Makefile.target b/Makefile.target
> index c4ba592..4888308 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -202,6 +202,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
>  obj-y += rtl8139.o
>  obj-y += e1000.o
>
> +# Inter-VM PCI shared memory
> +obj-y += ivshmem.o
> +

Can this be compiled once, simply by moving this to Makefile.objs
instead of Makefile.target? Also, because the code seems to be KVM
specific, it can't be compiled unconditionally but depending on at
least CONFIG_KVM and maybe CONFIG_EVENTFD.

Why is this KVM specific BTW, Posix SHM is available on many
platforms? What would happen if kvm_set_foobar functions were not
called when KVM is not being used? Is host eventfd support essential?

>  # Hardware support
>  obj-i386-y += vga.o
>  obj-i386-y += mc146818rtc.o i8259.o pc.o
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> new file mode 100644
> index 0000000..9057612
> --- /dev/null
> +++ b/hw/ivshmem.c
> @@ -0,0 +1,852 @@
> +/*
> + * Inter-VM Shared Memory PCI device.
> + *
> + * Author:
> + *      Cam Macdonell <address@hidden>
> + *
> + * Based On: cirrus_vga.c
> + *          Copyright (c) 2004 Fabrice Bellard
> + *          Copyright (c) 2004 Makoto Suzuki (suzu)
> + *
> + *      and rtl8139.c
> + *          Copyright (c) 2006 Igor Kovalenko
> + *
> + * This code is licensed under the GNU GPL v2.
> + */
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/io.h>
> +#include <sys/ioctl.h>
> +#include "hw.h"
> +#include "console.h"
> +#include "pc.h"
> +#include "pci.h"
> +#include "sysemu.h"
> +
> +#include "msix.h"
> +#include "qemu-kvm.h"
> +#include "libkvm.h"
> +
> +#include <sys/eventfd.h>
> +#include <sys/mman.h>
> +#include <sys/socket.h>
> +#include <sys/ioctl.h>
> +
> +#define IVSHMEM_IRQFD   0
> +#define IVSHMEM_MSI     1
> +
> +//#define DEBUG_IVSHMEM
> +#ifdef DEBUG_IVSHMEM
> +#define IVSHMEM_DPRINTF(fmt, args...)        \
> +    do {printf("IVSHMEM: " fmt, ##args); } while (0)

Please use __VA_ARGS__.

> +#else
> +#define IVSHMEM_DPRINTF(fmt, args...)
> +#endif
> +
> +typedef struct Peer {
> +    int nb_eventfds;
> +    int *eventfds;
> +} Peer;
> +
> +typedef struct EventfdEntry {
> +    PCIDevice *pdev;
> +    int vector;
> +} EventfdEntry;
> +
> +typedef struct IVShmemState {
> +    PCIDevice dev;
> +    uint32_t intrmask;
> +    uint32_t intrstatus;
> +    uint32_t doorbell;
> +
> +    CharDriverState ** eventfd_chr;

I'd remove the space between '**' and 'eventfd_chr', it's used inconsistently.

> +    CharDriverState * server_chr;
> +    int ivshmem_mmio_io_addr;
> +
> +    pcibus_t mmio_addr;
> +    pcibus_t shm_pci_addr;
> +    uint64_t ivshmem_offset;
> +    uint64_t ivshmem_size; /* size of shared memory region */
> +    int shm_fd; /* shared memory file descriptor */
> +
> +    Peer *peers;
> +    int nb_peers; /* how many guests we have space for */
> +    int max_peer; /* maximum numbered peer */
> +
> +    int vm_id;
> +    uint32_t vectors;
> +    uint32_t features;
> +    EventfdEntry *eventfd_table;
> +
> +    char * shmobj;
> +    char * sizearg;
> +    char * role;
> +} IVShmemState;
> +
> +/* registers for the Inter-VM shared memory device */
> +enum ivshmem_registers {
> +    IntrMask = 0,
> +    IntrStatus = 4,
> +    IVPosition = 8,
> +    Doorbell = 12,
> +};

IIRC these should be uppercase.

> +
> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
> +    return (ivs->features & (1 << feature));
> +}

Since this is the first version, do we need any features at this
point, can't we expect that all features are available now? Why does
the user need to specify the features?

To avoid a negative shift, I'd make 'feature' unsigned.

> +
> +static inline bool is_power_of_two(uint64_t x) {
> +    return (x & (x - 1)) == 0;
> +}
> +
> +static void ivshmem_map(PCIDevice *pci_dev, int region_num,
> +                    pcibus_t addr, pcibus_t size, int type)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> +    s->shm_pci_addr = addr;
> +
> +    if (s->ivshmem_offset > 0) {
> +        cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
> +                                                            
> s->ivshmem_offset);
> +        if (s->role && strncmp(s->role, "peer", 4) == 0) {
> +            IVSHMEM_DPRINTF("marking pages no migrate\n");
> +            cpu_mark_pages_no_migrate(s->ivshmem_offset, s->ivshmem_size);
> +        }
> +    }
> +
> +    IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = %u\n",
> +                (uint32_t)addr, (uint32_t)s->ivshmem_offset, (uint32_t)size);

Please use FMT_PCIBUS for addr and size and PRIu64 for s->ivshmem_offset.

> +
> +}
> +
> +/* accessing registers - based on rtl8139 */
> +static void ivshmem_update_irq(IVShmemState *s, int val)
> +{
> +    int isr;
> +    isr = (s->intrstatus & s->intrmask) & 0xffffffff;
> +
> +    /* don't print ISR resets */
> +    if (isr) {
> +        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
> +           isr ? 1 : 0, s->intrstatus, s->intrmask);
> +    }
> +
> +    qemu_set_irq(s->dev.irq[0], (isr != 0));
> +}
> +
> +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
> +
> +    s->intrmask = val;
> +
> +    ivshmem_update_irq(s, val);
> +}
> +
> +static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
> +{
> +    uint32_t ret = s->intrmask;
> +
> +    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
> +
> +    return ret;
> +}
> +
> +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
> +
> +    s->intrstatus = val;
> +
> +    ivshmem_update_irq(s, val);
> +    return;
> +}
> +
> +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
> +{
> +    uint32_t ret = s->intrstatus;
> +
> +    /* reading ISR clears all interrupts */
> +    s->intrstatus = 0;
> +
> +    ivshmem_update_irq(s, 0);
> +
> +    return ret;
> +}
> +
> +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
> +{
> +
> +    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
> +}
> +
> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
> +{
> +    IVShmemState *s = opaque;
> +
> +    u_int64_t write_one = 1;

Please use uintNN_t instead of u_intNN_t.

> +    u_int16_t dest = val >> 16;
> +    u_int16_t vector = val & 0xff;
> +
> +    addr &= 0xfc;

I'd add a debug printf here, likewise for exit of ivshmem_io_readl().
When you do the merge (see below), the correct printf format for the
addresses will be TARGET_FMT_plx.

> +
> +    switch (addr)
> +    {
> +        case IntrMask:
> +            ivshmem_IntrMask_write(s, val);
> +            break;
> +
> +        case IntrStatus:
> +            ivshmem_IntrStatus_write(s, val);
> +            break;
> +
> +        case Doorbell:
> +            /* check that dest VM ID is reasonable */
> +            if ((dest < 0) || (dest > s->max_peer)) {
> +                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
> +                break;
> +            }
> +
> +            /* check doorbell range */
> +            if ((vector >= 0) && (vector < s->peers[dest].nb_eventfds)) {
> +                IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n",
> +                                                    write_one, dest, vector);

PRId64 for write_one, %ld is not enough on ILP32.

> +                if (write(s->peers[dest].eventfds[vector],
> +                                                    &(write_one), 8) != 8) {
> +                    IVSHMEM_DPRINTF("error writing to eventfd\n");
> +                }
> +            }
> +            break;
> +        default:
> +            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
> +    }
> +}
> +
> +static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
> +{
> +    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
> +}
> +
> +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
> +{
> +
> +    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
> +    return 0;
> +}
> +
> +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
> +{
> +
> +    IVShmemState *s = opaque;
> +    uint32_t ret;
> +
> +    switch (addr)
> +    {
> +        case IntrMask:
> +            ret = ivshmem_IntrMask_read(s);
> +            break;
> +
> +        case IntrStatus:
> +            ret = ivshmem_IntrStatus_read(s);
> +            break;
> +
> +        case IVPosition:
> +            /* return my VM ID if the memory is mapped */
> +            if (s->shm_fd > 0) {
> +                ret = s->vm_id;
> +            } else {
> +                ret = -1;
> +            }
> +            break;
> +
> +        default:
> +            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
> +            ret = 0;
> +    }
> +
> +    return ret;
> +}
> +
> +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
> +{
> +    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
> +
> +    return 0;
> +}
> +
> +static void ivshmem_mmio_writeb(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writeb(opaque, addr & 0xFF, val);
> +}

This function and others below only performs a cast and useless
masking (the address passed is these days an offset from start of the
area). Please merge these to ivshmem_io_readl() etc.

> +
> +static void ivshmem_mmio_writew(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writew(opaque, addr & 0xFF, val);
> +}
> +
> +static void ivshmem_mmio_writel(void *opaque,
> +                                target_phys_addr_t addr, uint32_t val)
> +{
> +    ivshmem_io_writel(opaque, addr & 0xFF, val);
> +}
> +
> +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    return ivshmem_io_readb(opaque, addr & 0xFF);
> +}
> +
> +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF);
> +    return val;
> +}
> +
> +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF);
> +    return val;
> +}
> +
> +static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {

Please add 'const'.

> +    ivshmem_mmio_readb,
> +    ivshmem_mmio_readw,
> +    ivshmem_mmio_readl,
> +};
> +
> +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
> +    ivshmem_mmio_writeb,
> +    ivshmem_mmio_writew,
> +    ivshmem_mmio_writel,
> +};
> +
> +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
> +{
> +    IVShmemState *s = opaque;
> +
> +    ivshmem_IntrStatus_write(s, *buf);
> +
> +    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
> +}
> +
> +static int ivshmem_can_receive(void * opaque)
> +{
> +    return 8;
> +}
> +
> +static void ivshmem_event(void *opaque, int event)
> +{
> +    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
> +}
> +
> +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
> +
> +    EventfdEntry *entry = opaque;
> +    PCIDevice *pdev = entry->pdev;
> +
> +    IVSHMEM_DPRINTF("fake irqfd on vector %p %d\n", pdev, entry->vector);
> +    msix_notify(pdev, entry->vector);
> +}
> +
> +static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
> +                                                                    int 
> vector)
> +{
> +    /* create a event character device based on the passed eventfd */
> +    IVShmemState *s = opaque;
> +    CharDriverState * chr;
> +
> +    chr = qemu_chr_open_eventfd(eventfd);
> +
> +    if (chr == NULL) {
> +        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);

This should not be a DPRINTF.

> +        exit(-1);
> +    }
> +
> +    /* if MSI is supported we need multiple interrupts */
> +    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +        s->eventfd_table[vector].pdev = &s->dev;
> +        s->eventfd_table[vector].vector = vector;
> +
> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
> +                      ivshmem_event, &s->eventfd_table[vector]);
> +    } else {
> +        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
> +                      ivshmem_event, s);
> +    }
> +
> +    return chr;
> +
> +}
> +
> +static int check_shm_size(IVShmemState *s, int fd) {
> +    /* check that the guest isn't going to try and map more memory than the
> +     * the object has allocated return -1 to indicate error */
> +
> +    struct stat buf;
> +
> +    fstat(fd, &buf);
> +
> +    if (s->ivshmem_size > buf.st_size) {
> +        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
> +        fprintf(stderr, " than shared object size (%ld > %ld)\n",
> +                                          s->ivshmem_size, buf.st_size);

Please use PRIx64 for s->ivshmem_size, this will cause a warning on ILP32.

> +        return -1;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +/* create the shared memory BAR when we are not using the server, so we can
> + * create the BAR and map the memory immediately */
> +static void create_shared_memory_BAR(IVShmemState *s, int fd) {
> +
> +    void * ptr;
> +
> +    s->shm_fd = fd;
> +
> +    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
> +
> +    s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, ptr);

qemu_ram_map() does not exist in HEAD.

> +
> +    /* region for shared memory */
> +    pci_register_bar(&s->dev, 2, s->ivshmem_size,
> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, 
> ivshmem_map);
> +}
> +
> +static void close_guest_eventfds(IVShmemState *s, int posn)
> +{
> +    int i, guest_curr_max;
> +
> +    guest_curr_max = s->peers[posn].nb_eventfds;
> +
> +    for (i = 0; i < guest_curr_max; i++)
> +        close(s->peers[posn].eventfds[i]);

CODING_STYLE.

> +
> +    qemu_free(s->peers[posn].eventfds);
> +    s->peers[posn].nb_eventfds = 0;
> +}
> +
> +static void setup_ioeventfds(IVShmemState *s) {
> +
> +    int i, j;
> +
> +    for (i = 0; i <= s->max_peer; i++) {
> +        for (j = 0; j < s->peers[i].nb_eventfds; j++) {
> +            kvm_set_ioeventfd_mmio_long(s->peers[i].eventfds[j],
> +                    s->mmio_addr + Doorbell, (i << 16) | j, 1);
> +        }
> +    }
> +
> +    /* setup irqfd for this VM's eventfds */
> +    for (i = 0; i < s->vectors; i++) {
> +        kvm_set_irqfd(s->dev.msix_irq_entries[i].gsi,
> +                        s->peers[s->vm_id].eventfds[i], 1);

kvm_set_irqfd() does not exist in HEAD.

> +    }
> +}
> +
> +
> +/* this function increase the dynamic storage need to store data about other
> + * guests */
> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
> +
> +    int j, old_nb_alloc;
> +
> +    old_nb_alloc = s->nb_peers;
> +
> +    while (new_min_size >= s->nb_peers)
> +        s->nb_peers = s->nb_peers * 2;
> +
> +    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nb_peers);
> +    s->peers = qemu_realloc(s->peers, s->nb_peers * sizeof(Peer));
> +
> +    if (s->peers == NULL) {
> +        fprintf(stderr, "Allocation error - exiting\n");
> +        exit(1);
> +    }

qemu_realloc will never return zero.

> +
> +    /* zero out new pointers */
> +    for (j = old_nb_alloc; j < s->nb_peers; j++) {
> +        s->peers[j].eventfds = NULL;
> +        s->peers[j].nb_eventfds = 0;
> +    }
> +}
> +
> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
> +{
> +    IVShmemState *s = opaque;
> +    int incoming_fd, tmp_fd;
> +    int guest_curr_max;
> +    long incoming_posn;
> +
> +    memcpy(&incoming_posn, buf, sizeof(long));
> +    /* pick off s->server_chr->msgfd and store it, posn should accompany msg 
> */
> +    tmp_fd = qemu_chr_get_msgfd(s->server_chr);
> +    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
> +
> +    /* make sure we have enough space for this guest */
> +    if (incoming_posn >= s->nb_peers) {
> +        increase_dynamic_storage(s, incoming_posn);
> +    }
> +
> +    if (tmp_fd == -1) {
> +        /* if posn is positive and unseen before then this is our posn*/
> +        if ((incoming_posn >= 0) && (s->peers[incoming_posn].eventfds == 
> NULL)) {
> +            /* receive our posn */
> +            s->vm_id = incoming_posn;
> +            return;
> +        } else {
> +            /* otherwise an fd == -1 means an existing guest has gone away */
> +            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
> +            close_guest_eventfds(s, incoming_posn);
> +            return;
> +        }
> +    }
> +
> +    /* because of the implementation of get_msgfd, we need a dup */
> +    incoming_fd = dup(tmp_fd);
> +
> +    if (incoming_fd == -1) {
> +        fprintf(stderr, "could not allocate file descriptor %s\n",
> +                                                            strerror(errno));
> +        return;
> +    }
> +
> +    /* if the position is -1, then it's shared memory region fd */
> +    if (incoming_posn == -1) {
> +
> +        void * map_ptr;
> +
> +        s->max_peer = 0;
> +
> +        if (check_shm_size(s, incoming_fd) == -1) {
> +            exit(-1);
> +        }
> +
> +        /* mmap the region and map into the BAR2 */
> +        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
> +                                                                incoming_fd, 
> 0);
> +        s->ivshmem_offset = qemu_ram_map(s->ivshmem_size, map_ptr);
> +
> +        IVSHMEM_DPRINTF("guest pci addr = %u, guest h/w addr = %u, size = 
> %u\n",
> +                        (uint32_t)s->shm_pci_addr, 
> (uint32_t)s->ivshmem_offset,
> +                        (uint32_t)s->ivshmem_size);
> +
> +        if (s->shm_pci_addr > 0) {
> +            /* map memory into BAR2 */
> +            cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
> +                                                            
> s->ivshmem_offset);
> +            if (s->role && strncmp(s->role, "peer", 4) == 0) {
> +                IVSHMEM_DPRINTF("marking pages no migrate\n");
> +                cpu_mark_pages_no_migrate(s->ivshmem_offset, 
> s->ivshmem_size);
> +            }
> +
> +        }
> +
> +        /* only store the fd if it is successfully mapped */
> +        s->shm_fd = incoming_fd;
> +
> +        return;
> +    }
> +
> +    /* each guest has an array of eventfds, and we keep track of how many
> +     * guests for each VM */
> +    guest_curr_max = s->peers[incoming_posn].nb_eventfds;
> +    if (guest_curr_max == 0) {
> +        /* one eventfd per MSI vector */
> +        s->peers[incoming_posn].eventfds = (int *) qemu_malloc(s->vectors *
> +                                                                sizeof(int));

Useless cast in C.

> +    }
> +
> +    /* this is an eventfd for a particular guest VM */
> +    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, 
> guest_curr_max,
> +                                                                incoming_fd);
> +    s->peers[incoming_posn].eventfds[guest_curr_max] = incoming_fd;
> +
> +    /* increment count for particular guest */
> +    s->peers[incoming_posn].nb_eventfds++;
> +
> +    /* keep track of the maximum VM ID */
> +    if (incoming_posn > s->max_peer) {
> +        s->max_peer = incoming_posn;
> +    }
> +
> +    if (incoming_posn == s->vm_id) {
> +        int vector = guest_curr_max;

Why add a new variable?

> +        if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +            /* initialize char device for callback
> +             * if this is one of my eventfds */
> +            s->eventfd_chr[vector] = create_eventfd_chr_device(s,
> +                       s->peers[s->vm_id].eventfds[vector], vector);
> +        }
> +    }
> +
> +    return;
> +}
> +
> +static void ivshmem_reset(DeviceState *d)
> +{
> +    return;

Nothing to do?

> +}
> +
> +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
> +                       pcibus_t addr, pcibus_t size, int type)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> +    s->mmio_addr = addr;
> +    cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);

The 0x400 should be #defined earlier. Why 0x400 since you are only
interested in the 0x100 first bytes?

> +
> +    /* ioeventfd and irqfd are enabled together,
> +     * so the flag IRQFD refers to both */
> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +        setup_ioeventfds(s);
> +    }
> +}
> +
> +static uint64_t ivshmem_get_size(IVShmemState * s) {
> +
> +    uint64_t value;
> +    char *ptr;
> +
> +    value = strtoul(s->sizearg, &ptr, 10);

I'd use strtoull() but the whole function should be suppressed, see below.

> +    switch (*ptr) {
> +        case 0: case 'M': case 'm':
> +            value <<= 20;
> +            break;
> +        case 'G': case 'g':
> +            value <<= 30;
> +            break;
> +        default:
> +            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
> +            exit(1);
> +    }
> +
> +    /* BARs must be a power of 2 */
> +    if (!is_power_of_two(value)) {
> +        fprintf(stderr, "ivshmem: size must be power of 2\n");
> +        exit(1);
> +    }

Isn't the BAR check in pci.c enough?

> +
> +    return value;
> +
> +}
> +
> +static void ivshmem_setup_msi(IVShmemState * s) {
> +
> +    int i;
> +
> +    /* allocate the MSI-X vectors */
> +
> +    if (!msix_init(&s->dev, s->vectors, 1, 0)) {
> +        pci_register_bar(&s->dev, 1,
> +                         msix_bar_size(&s->dev),
> +                         PCI_BASE_ADDRESS_SPACE_MEMORY,
> +                         msix_mmio_map);
> +        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
> +    } else {
> +        IVSHMEM_DPRINTF("msix initialization failed\n");

Is this fatal considering the msix_vector_use() below?

> +    }
> +
> +    /* 'activate' the vectors */
> +    for (i = 0; i < s->vectors; i++) {
> +        msix_vector_use(&s->dev, i);
> +    }
> +
> +    /* if IRQFDs are not supported, we'll have to trigger the interrupts
> +     * via Qemu char devices */
> +    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> +        /* for handling interrupts when IRQFD is not available */
> +        s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
> +    }
> +}
> +
> +static void ivshmem_save(QEMUFile* f, void *opaque)
> +{
> +    IVShmemState *proxy = opaque;
> +
> +    IVSHMEM_DPRINTF("ivshmem_save\n");
> +    pci_device_save(&proxy->dev, f);
> +
> +    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
> +        msix_save(&proxy->dev, f);
> +    } else {
> +        qemu_put_be32(f, proxy->intrstatus);
> +        qemu_put_be32(f, proxy->intrmask);
> +    }
> +
> +}

There may be VMState magic to handle conditional structures (or just
make the structures unconditional), so VMState should be used instead.

> +
> +static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
> +{
> +    IVSHMEM_DPRINTF("ivshmem_load\n");
> +
> +    IVShmemState *proxy = opaque;
> +    int ret, i;
> +

Missing version check.

> +    ret = pci_device_load(&proxy->dev, f);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
> +        msix_load(&proxy->dev, f);
> +        for (i = 0; i < proxy->vectors; i++) {
> +            msix_vector_use(&proxy->dev, i);
> +        }
> +    } else {
> +        proxy->intrstatus = qemu_get_be32(f);
> +        proxy->intrmask = qemu_get_be32(f);
> +    }
> +
> +    return 0;
> +}
> +
> +static int pci_ivshmem_init(PCIDevice *dev)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +    uint8_t *pci_conf;
> +
> +    if (s->sizearg == NULL)
> +        s->ivshmem_size = 4 << 20; /* 4 MB default */
> +    else {
> +        s->ivshmem_size = ivshmem_get_size(s);
> +    }
> +
> +    register_savevm("ivshmem", 0, 0, ivshmem_save, ivshmem_load, dev);
> +
> +    /* IRQFD requires MSI */
> +    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) &&
> +        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
> +        exit(1);
> +    }
> +
> +    /* check that role is reasonable */
> +    if (s->role && !((strncmp(s->role, "peer", 5) == 0) ||
> +                        (strncmp(s->role, "master", 7) == 0))) {
> +        fprintf(stderr, "ivshmem: 'role' must be 'peer' or 'master'\n");
> +        exit(1);
> +    }

I'd add a scalar flag in IVShmemState for role so that further strcmps
are avoided.

> +
> +    pci_conf = s->dev.config;
> +    pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
> +    pci_conf[0x01] = 0x1a;
> +    pci_conf[0x02] = 0x10;
> +    pci_conf[0x03] = 0x11;

Please add the DID to hw/pci_ids.h and use pci_config_set_xyz() here.

> +    pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
> +    pci_conf[0x0a] = 0x00; /* RAM controller */
> +    pci_conf[0x0b] = 0x05;
> +    pci_conf[0x0e] = 0x00; /* header_type */
> +
> +    pci_conf[PCI_INTERRUPT_PIN] = 1;
> +
> +    s->shm_pci_addr = 0;
> +    s->ivshmem_offset = 0;
> +    s->shm_fd = 0;
> +
> +    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
> +                                    ivshmem_mmio_write, s);
> +    /* region for registers*/
> +    pci_register_bar(&s->dev, 0, 0x400,
> +                           PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
> +
> +    if ((s->server_chr != NULL) &&
> +                        (strncmp(s->server_chr->filename, "unix:", 5) == 0)) 
> {
> +        /* if we get a UNIX socket as the parameter we will talk
> +         * to the ivshmem server to receive the memory region */
> +
> +        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
> +                                                    s->server_chr->filename);
> +
> +        if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +            ivshmem_setup_msi(s);
> +        }
> +
> +        /* we allocate enough space for 16 guests and grow as needed */
> +        s->nb_peers = 16;
> +        s->vm_id = -1;
> +
> +        /* allocate/initialize space for interrupt handling */
> +        s->peers = qemu_mallocz(s->nb_peers * sizeof(Peer));
> +
> +        pci_register_bar(&s->dev, 2, s->ivshmem_size,
> +                                    PCI_BASE_ADDRESS_SPACE_MEMORY, 
> ivshmem_map);
> +
> +        s->eventfd_chr = (CharDriverState **) qemu_mallocz(s->vectors *
> +                                                sizeof(CharDriverState *));

Useless cast in C.

> +
> +        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, 
> ivshmem_read,
> +                     ivshmem_event, s);
> +    } else {
> +        /* just map the file immediately, we're not using a server */
> +        int fd;
> +
> +        if (s->shmobj == NULL) {
> +            fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
> +        }

I'd rather have separate 'chardev' and 'shm_file' parameters. Then
'info qtree' could return more useful information about the chardev.

> +
> +        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
> +
> +        /* try opening with O_EXCL and if it succeeds zero the memory
> +         * by truncating to 0 */
> +        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
> +                        S_IRWXU|S_IRWXG|S_IRWXO)) > 0) {
> +           /* truncate file to length PCI device's memory */
> +            if (ftruncate(fd, s->ivshmem_size) != 0) {
> +                fprintf(stderr, "kvm_ivshmem: could not truncate shared 
> file\n");

Why 'kvm_ivshmem'?

> +            }
> +
> +        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
> +                        S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
> +            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
> +            exit(-1);
> +
> +        }
> +
> +        if (check_shm_size(s, fd) == -1) {
> +            exit(-1);
> +        }
> +
> +        create_shared_memory_BAR(s, fd);
> +
> +    }
> +
> +    return 0;
> +}
> +
> +static int pci_ivshmem_uninit(PCIDevice *dev)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +
> +    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
> +
> +    return 0;
> +}
> +
> +static PCIDeviceInfo ivshmem_info = {
> +    .qdev.name  = "ivshmem",
> +    .qdev.size  = sizeof(IVShmemState),
> +    .qdev.reset = ivshmem_reset,
> +    .init       = pci_ivshmem_init,
> +    .exit       = pci_ivshmem_uninit,
> +    .qdev.props = (Property[]) {
> +        DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
> +        DEFINE_PROP_STRING("size", IVShmemState, sizearg),

This should be scalar type, not string.

> +        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
> +        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, 
> false),
> +        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
> +        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
> +        DEFINE_PROP_STRING("role", IVShmemState, role),
> +        DEFINE_PROP_END_OF_LIST(),
> +    }
> +};
> +
> +static void ivshmem_register_devices(void)
> +{
> +    pci_qdev_register(&ivshmem_info);
> +}
> +
> +device_init(ivshmem_register_devices)
> diff --git a/qemu-char.c b/qemu-char.c
> index ac65a1c..b2e50d0 100644
> --- a/qemu-char.c
> +++ b/qemu-char.c
> @@ -2093,6 +2093,12 @@ static void tcp_chr_read(void *opaque)
>     }
>  }
>
> +CharDriverState *qemu_chr_open_eventfd(int eventfd){
> +
> +    return qemu_chr_open_fd(eventfd, eventfd);
> +
> +}
> +
>  static void tcp_chr_connect(void *opaque)
>  {
>     CharDriverState *chr = opaque;
> diff --git a/qemu-char.h b/qemu-char.h
> index e3a0783..6ea01ba 100644
> --- a/qemu-char.h
> +++ b/qemu-char.h
> @@ -94,6 +94,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject 
> *ret_data);
>  void qemu_chr_info(Monitor *mon, QObject **ret_data);
>  CharDriverState *qemu_chr_find(const char *name);
>
> +/* add an eventfd to the qemu devices that are polled */
> +CharDriverState *qemu_chr_open_eventfd(int eventfd);

Maybe this should be removed and just open coded with qemu_chr_open_fd.

> +
>  extern int term_escape_char;
>
>  /* async I/O support */
> diff --git a/qemu-doc.texi b/qemu-doc.texi
> index 6647b7b..24f8748 100644
> --- a/qemu-doc.texi
> +++ b/qemu-doc.texi
> @@ -706,6 +706,49 @@ Using the @option{-net socket} option, it is possible to 
> make VLANs
>  that span several QEMU instances. See @ref{sec_invocation} to have a
>  basic example.
>
> address@hidden Other Devices
> +
> address@hidden Inter-VM Shared Memory device
> +
> +With KVM enabled on a Linux host, a shared memory device is available.  
> Guests
> +map a POSIX shared memory region into the guest as a PCI device that enables
> +zero-copy communication to the application level of the guests.  The basic
> +syntax is:
> +
> address@hidden
> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> address@hidden example
> +
> +If desired, interrupts can be sent between guest VMs accessing the same 
> shared
> +memory region.  Interrupt support requires using a shared memory server and
> +using a chardev socket to connect to it.  The code for the shared memory 
> server
> +is qemu.git/contrib/ivshmem-server.  An example syntax when using the shared
> +memory server is:
> +
> address@hidden
> +qemu -device ivshmem,size=<size in format accepted by -m>[,chardev=<id>]
> +                        [,msi=on][,irqfd=on][,vectors=n][,role=peer|master]
> +qemu -chardev socket,path=<path>,id=<id>
> address@hidden example
> +
> +When using the server, the guest will be assigned a VM ID (>=0) that allows 
> guests
> +using the same server to communicate via interrupts.  Guests can read their
> +VM ID from a device register (see example code).  Since receiving the shared
> +memory region from the server is asynchronous, there is a (small) chance the
> +guest may boot before the shared memory is attached.  To allow an application
> +to ensure shared memory is attached, the VM ID register will return -1 (an
> +invalid VM ID) until the memory is attached.  Once the shared memory is
> +attached, the VM ID will return the guest's valid VM ID.  With these 
> semantics,
> +the guest application can check to ensure the shared memory is attached to 
> the
> +guest before proceeding.
> +
> +The @option{role} argument can be set to either master or peer and will 
> affect
> +how the shared memory is migrated.  With @option{role=master}, the guest will
> +copy the shared memory on migration to the destination host.  With
> address@hidden, the shared memory will not be copied on migration.  Only
> +one guest should be specified as
> +the master.
> +
> address@hidden direct_linux_boot
> address@hidden Direct Linux Boot
>
> --
> 1.6.3.2.198.g6096d
>
>
>



reply via email to

[Prev in Thread] Current Thread [Next in Thread]