[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v2 3/4] vfio: vfio-pci device assignment driver
From: |
Jan Kiszka |
Subject: |
Re: [Qemu-devel] [PATCH v2 3/4] vfio: vfio-pci device assignment driver |
Date: |
Tue, 14 Aug 2012 19:40:59 +0200 |
User-agent: |
Mozilla/5.0 (X11; U; Linux i686 (x86_64); de; rv:1.8.1.12) Gecko/20080226 SUSE/2.0.0.12-1.1 Thunderbird/2.0.0.12 Mnenhy/0.7.5.666 |
Just some comments, didn't look at all details.
On 2012-08-02 21:17, Alex Williamson wrote:
> +
> +static int vfio_msix_vector_use(PCIDevice *pdev,
> + unsigned int vector, MSIMessage msg)
> +{
> + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> + int ret, fd;
> +
> + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, vector);
> +
> + if (vdev->interrupt != INT_MSIX) {
> + vfio_disable_interrupts(vdev);
> + }
> +
> + if (!vdev->msi_vectors) {
> + vdev->msi_vectors = g_malloc0(vdev->msix->entries *
> sizeof(MSIVector));
> + }
> +
> + vdev->msi_vectors[vector].vdev = vdev;
> + vdev->msi_vectors[vector].vector = vector;
> + vdev->msi_vectors[vector].use = true;
> +
> + msix_vector_use(pdev, vector);
> +
> + if (event_notifier_init(&vdev->msi_vectors[vector].interrupt, 0)) {
> + error_report("vfio: Error: event_notifier_init failed\n");
> + }
> +
> + fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> +
> + /*
> + * Attempt to enable route through KVM irqchip,
> + * default to userspace handling if unavailable.
> + */
> + vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state,
> msg);
> + if (vdev->msi_vectors[vector].virq < 0 ||
> + kvm_irqchip_add_irqfd(kvm_state, fd,
> + vdev->msi_vectors[vector].virq) < 0) {
If kvm_irqchip_add_irqfd fails, you have to drop the route and set virq
to -1. Otherwise, you won't match with the release logic below.
> + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> + &vdev->msi_vectors[vector]);
> + }
> +
> + /*
> + * We don't want to have the host allocate all possible MSI vectors
> + * for a device if they're not in use, so we shutdown and incrementally
> + * increase them as needed.
> + */
> + if (vdev->nr_vectors < vector + 1) {
> + int i;
> +
> + vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
> + vdev->nr_vectors = vector + 1;
> + ret = vfio_enable_vectors(vdev, true);
> + if (ret) {
> + error_report("vfio: failed to enable vectors, %d\n", ret);
> + }
> +
> + /* We don't know if we've missed interrupts in the interim... */
> + for (i = 0; i < vdev->msix->entries; i++) {
> + if (vdev->msi_vectors[i].use) {
> + msix_notify(&vdev->pdev, i);
> + }
> + }
And it wasn't possible to provide an interface with VFIO that allows
vector addition/removal on the fly? KVM has an aweful one in this
regard, but that is legacy, VFIO is new. The above logic is a bit ugly IMHO.
> + } else {
> + struct vfio_irq_set_fd irq_set_fd = {
> + .irq_set = {
> + .argsz = sizeof(irq_set_fd),
> + .flags = VFIO_IRQ_SET_DATA_EVENTFD |
> + VFIO_IRQ_SET_ACTION_TRIGGER,
> + .index = VFIO_PCI_MSIX_IRQ_INDEX,
> + .start = vector,
> + .count = 1,
> + },
> + .fd = fd,
> + };
> + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
> + if (ret) {
> + error_report("vfio: failed to modify vector, %d\n", ret);
> + }
> + msix_notify(&vdev->pdev, vector);
That injection should no longer be needed once we bounce and record in
the PBA, right? Maybe add a comment for now.
> + }
> +
> + return 0;
> +}
> +
> +static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int vector)
> +{
> + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> + struct vfio_irq_set_fd irq_set_fd = {
> + .irq_set = {
> + .argsz = sizeof(irq_set_fd),
> + .flags = VFIO_IRQ_SET_DATA_EVENTFD |
> + VFIO_IRQ_SET_ACTION_TRIGGER,
> + .index = VFIO_PCI_MSIX_IRQ_INDEX,
> + .start = vector,
> + .count = 1,
> + },
> + .fd = -1,
> + };
> + int fd;
> +
> + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, vector);
> +
> + /*
> + * XXX What's the right thing to do here? This turns off the interrupt
> + * completely, but do we really just want to switch the interrupt to
> + * bouncing through userspace and let msix.c drop it? Not sure.
> + */
> + msix_vector_unuse(pdev, vector);
> + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
> +
> + fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> +
> + if (vdev->msi_vectors[vector].virq < 0) {
> + qemu_set_fd_handler(fd, NULL, NULL, NULL);
> + } else {
> + kvm_irqchip_remove_irqfd(kvm_state, fd,
> vdev->msi_vectors[vector].virq);
> + kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[vector].virq);
> + vdev->msi_vectors[vector].virq = -1;
> + }
> +
> + event_notifier_cleanup(&vdev->msi_vectors[vector].interrupt);
> + vdev->msi_vectors[vector].use = false;
> +}
> +
> +/* XXX This should move to msi.c */
Nope. We rather need notifier support for MSI. I only have an outdated
patch at hand.
> +static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
> +{
> + uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap +
> PCI_MSI_FLAGS);
> + bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> + MSIMessage msg;
> +
> + if (msi64bit) {
> + msg.address = pci_get_quad(pdev->config +
> + pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> + } else {
> + msg.address = pci_get_long(pdev->config +
> + pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> + }
> +
> + msg.data = pci_get_word(pdev->config + pdev->msi_cap +
> + (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32));
> + msg.data += vector;
> +
> + return msg;
> +}
> +
> +/* So should this */
> +static void msi_set_qsize(PCIDevice *pdev, uint8_t size)
> +{
> + uint8_t *config = pdev->config + pdev->msi_cap;
> + uint16_t flags;
> +
> + flags = pci_get_word(config + PCI_MSI_FLAGS);
> + flags = le16_to_cpu(flags);
> + flags &= ~PCI_MSI_FLAGS_QSIZE;
> + flags |= (size & 0x7) << 4;
> + flags = cpu_to_le16(flags);
> + pci_set_word(config + PCI_MSI_FLAGS, flags);
Hmm...
> +}
> +
> +static void vfio_enable_msi(VFIODevice *vdev)
> +{
> + int ret, i;
> +
> + vfio_disable_interrupts(vdev);
> +
> + vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
> +retry:
> + vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(MSIVector));
> +
> + for (i = 0; i < vdev->nr_vectors; i++) {
> + MSIMessage msg;
> + int fd;
> +
> + vdev->msi_vectors[i].vdev = vdev;
> + vdev->msi_vectors[i].vector = i;
> + vdev->msi_vectors[i].use = true;
> +
> + if (event_notifier_init(&vdev->msi_vectors[i].interrupt, 0)) {
> + error_report("vfio: Error: event_notifier_init failed\n");
> + }
> +
> + fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> +
> + msg = msi_get_msg(&vdev->pdev, i);
> +
> + /*
> + * Attempt to enable route through KVM irqchip,
> + * default to userspace handling if unavailable.
> + */
> + vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state,
> msg);
> + if (vdev->msi_vectors[i].virq < 0 ||
> + kvm_irqchip_add_irqfd(kvm_state, fd,
> + vdev->msi_vectors[i].virq) < 0) {
> + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> + &vdev->msi_vectors[i]);
> + }
> + }
> +
> + ret = vfio_enable_vectors(vdev, false);
> + if (ret) {
> + if (ret < 0) {
> + error_report("vfio: Error: Failed to setup MSI fds: %s\n",
> + strerror(errno));
> + } else if (ret != vdev->nr_vectors) {
> + error_report("vfio: Error: Failed to enable %d "
> + "MSI vectors, retry with %d\n", vdev->nr_vectors,
> ret);
> + }
> +
> + for (i = 0; i < vdev->nr_vectors; i++) {
> + int fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> + if (vdev->msi_vectors[i].virq >= 0) {
> + kvm_irqchip_remove_irqfd(kvm_state, fd,
> + vdev->msi_vectors[i].virq);
> + kvm_irqchip_release_virq(kvm_state,
> vdev->msi_vectors[i].virq);
> + vdev->msi_vectors[i].virq = -1;
> + } else {
> + qemu_set_fd_handler(fd, NULL, NULL, NULL);
> + }
> + event_notifier_cleanup(&vdev->msi_vectors[i].interrupt);
> + }
> +
> + g_free(vdev->msi_vectors);
> +
> + if (ret > 0 && ret != vdev->nr_vectors) {
> + vdev->nr_vectors = ret;
> + goto retry;
> + }
> + vdev->nr_vectors = 0;
> +
> + return;
> + }
> +
> + msi_set_qsize(&vdev->pdev, vdev->nr_vectors);
Hmmm... Can we really patch qsize? While the guest is already running
and possibly evaluated this field before? IOW: Does the spec allow it to
be changed by the device and, if yes, in which states?
> +
> + DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, vdev->nr_vectors);
> +}
> +
...
> +
> +static int vfio_setup_msi(VFIODevice *vdev, int pos)
> +{
> + uint16_t ctrl;
> + bool msi_64bit, msi_maskbit;
> + int ret, entries;
> +
> + if (!msi_supported) {
Not critical, but I would prefer to keep this variable in the context of
the MSI core. msi_init will return ENOTSUP, and you could handle that
gracefully.
> + return 0;
> + }
> +
> + if (pread(vdev->fd, &ctrl, sizeof(ctrl),
> + vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> + return -1;
> + }
> + ctrl = le16_to_cpu(ctrl);
> +
> + msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
> + msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
> + entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
> +
> + DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
> + vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
> +
> + ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
> + if (ret < 0) {
> + error_report("vfio: msi_init failed\n");
> + return ret;
> + }
> + vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 :
> 0);
> +
> + return 0;
> +}
> +
> +/*
> + * We don't have any control over how pci_add_capability() inserts
> + * capabilities into the chain.
What control is missing precisely? Can pci_add_capability be improved to
simplify the early setup? I don't see it (msix_init requires the
parameters), but the comment suggests this somehow.
> In order to setup MSI-X we need a
> + * MemoryRegion for the BAR. In order to setup the BAR and not
> + * attempt to mmap the MSI-X table area, which VFIO won't allow, we
> + * need to first look for where the MSI-X table lives. So we
> + * unfortunately split MSI-X setup across two functions.
> + */
> +static int vfio_early_setup_msix(VFIODevice *vdev)
> +{
> + uint8_t pos;
> + uint16_t ctrl;
> + uint32_t table, pba;
> +
> + pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
> + if (!pos) {
> + return 0;
> + }
> +
> + if (pread(vdev->fd, &ctrl, sizeof(ctrl),
> + vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> + return -1;
> + }
> +
> + if (pread(vdev->fd, &table, sizeof(table),
> + vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
> + return -1;
> + }
> +
> + if (pread(vdev->fd, &pba, sizeof(pba),
> + vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
> + return -1;
> + }
> +
> + ctrl = le16_to_cpu(ctrl);
> + table = le32_to_cpu(table);
> + pba = le32_to_cpu(pba);
> +
> + vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
> + vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
> + vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
> + vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
> + vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
> + vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
> +
> + DPRINTF("%04x:%02x:%02x.%x "
> + "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, pos, vdev->msix->table_bar,
> + vdev->msix->table_offset, vdev->msix->entries);
> +
> + return 0;
> +}
> +
> +static int vfio_setup_msix(VFIODevice *vdev, int pos)
> +{
> + int ret;
> +
> + if (!msi_supported) {
See above.
> + return 0;
> + }
> +
> + ret = msix_init(&vdev->pdev, vdev->msix->entries,
> + &vdev->bars[vdev->msix->table_bar].mem,
> + vdev->msix->table_bar, vdev->msix->table_offset,
> + &vdev->bars[vdev->msix->pba_bar].mem,
> + vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
> + if (ret < 0) {
> + error_report("vfio: msix_init failed\n");
> + return ret;
> + }
> +
> + ret = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
> + vfio_msix_vector_release);
> + if (ret) {
> + error_report("vfio: msix_set_vector_notifiers failed %d\n", ret);
> + msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
> + &vdev->bars[vdev->msix->pba_bar].mem);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +static void vfio_teardown_msi(VFIODevice *vdev)
> +{
> + msi_uninit(&vdev->pdev);
> +
> + if (vdev->msix) {
> + /* FIXME: Why can't unset just silently do nothing?? */
Yep, that would be better.
> + if (vdev->pdev.msix_vector_use_notifier &&
> + vdev->pdev.msix_vector_release_notifier) {
> + msix_unset_vector_notifiers(&vdev->pdev);
> + }
> +
> + msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
> + &vdev->bars[vdev->msix->pba_bar].mem);
> + }
> +}
> +
> +/*
> + * Resource setup
> + */
> +static void vfio_unmap_bar(VFIODevice *vdev, int nr)
> +{
> + VFIOBAR *bar = &vdev->bars[nr];
> +
> + if (!bar->size) {
> + return;
> + }
> +
> + memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
> + munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
> +
> + if (vdev->msix && vdev->msix->table_bar == nr) {
> + memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
> + munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
> + }
> +
> + memory_region_destroy(&bar->mem);
> +}
> +
> +static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion
> *submem,
> + void **map, size_t size, off_t offset,
> + const char *name)
> +{
> + int ret = 0;
> +
> + if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
> + int prot = 0;
> +
> + if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
> + prot |= PROT_READ;
> + }
> +
> + if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
> + prot |= PROT_WRITE;
> + }
> +
> + *map = mmap(NULL, size, prot, MAP_SHARED,
> + bar->fd, bar->fd_offset + offset);
> + if (*map == MAP_FAILED) {
> + *map = NULL;
> + ret = -errno;
> + goto empty_region;
> + }
> +
> + memory_region_init_ram_ptr(submem, name, size, *map);
> + } else {
> +empty_region:
> + /* Create a zero sized sub-region to make cleanup easy. */
> + memory_region_init(submem, name, 0);
> + }
> +
> + memory_region_add_subregion(mem, offset, submem);
> +
> + return ret;
> +}
> +
> +static void vfio_map_bar(VFIODevice *vdev, int nr)
> +{
> + VFIOBAR *bar = &vdev->bars[nr];
> + unsigned size = bar->size;
> + char name[64];
> + uint32_t pci_bar;
> + uint8_t type;
> + int ret;
> +
> + /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
> + if (!size) {
> + return;
> + }
> +
> + snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, nr);
> +
> + /* Determine what type of BAR this is for registration */
> + ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
> + vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
> + if (ret != sizeof(pci_bar)) {
> + error_report("vfio: Failed to read BAR %d (%s)\n", nr,
> strerror(errno));
> + return;
> + }
> +
> + pci_bar = le32_to_cpu(pci_bar);
> + type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
> + ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);
> +
> + /* A "slow" read/write mapping underlies all BARs */
> + memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
> + pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
> +
> + /*
> + * We can't mmap areas overlapping the MSIX vector table, so we
> + * potentially insert a direct-mapped subregion before and after it.
> + */
> + if (vdev->msix && vdev->msix->table_bar == nr) {
> + size = vdev->msix->table_offset & TARGET_PAGE_MASK;
> + }
> +
> + strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
This could generate an unterminated name if we actually have to cut the
appended string. You could set name[sizeof(name)-1] = 0.
> + if (vfio_mmap_bar(bar, &bar->mem,
> + &bar->mmap_mem, &bar->mmap, size, 0, name)) {
> + error_report("%s unsupported. Performance may be slow\n", name);
> + }
> +
> + if (vdev->msix && vdev->msix->table_bar == nr) {
> + unsigned start;
> +
> + start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
> + (vdev->msix->entries *
> PCI_MSIX_ENTRY_SIZE));
> +
> + size = start < bar->size ? bar->size - start : 0;
> + strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
Same here.
> + /* MSIXInfo contains another MemoryRegion for this mapping */
> + if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
> + &vdev->msix->mmap, size, start, name)) {
> + error_report("%s unsupported. Performance may be slow\n", name);
> + }
> + }
> +
> + return;
Unneeded return.
> +}
> +
...
> +
> +static int vfio_initfn(struct PCIDevice *pdev)
> +{
> + VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> + VFIOGroup *group;
> + char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> + ssize_t len;
> + struct stat st;
> + int groupid;
> + int ret;
> +
> + /* Check that the host device exists */
> + snprintf(path, sizeof(path),
> + "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function);
> + if (stat(path, &st) < 0) {
> + error_report("vfio: error: no such host device: %s", path);
> + return -1;
> + }
> +
> + strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
See above for the termination problem.
> +
> + len = readlink(path, iommu_group_path, PATH_MAX);
> + if (len <= 0) {
> + error_report("vfio: error no iommu_group for device\n");
> + return -1;
> + }
> +
> + iommu_group_path[len] = 0;
> + group_name = basename(iommu_group_path);
> +
> + if (sscanf(group_name, "%d", &groupid) != 1) {
> + error_report("vfio: error reading %s: %s", path, strerror(errno));
> + return -1;
> + }
> +
> + DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
> + vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
> +
> + group = vfio_get_group(groupid);
> + if (!group) {
> + error_report("vfio: failed to get group %d", groupid);
> + return -1;
> + }
> +
> + snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function);
> +
> + QLIST_FOREACH(pvdev, &group->device_list, next) {
> + if (pvdev->host.domain == vdev->host.domain &&
> + pvdev->host.bus == vdev->host.bus &&
> + pvdev->host.slot == vdev->host.slot &&
> + pvdev->host.function == vdev->host.function) {
> +
> + error_report("vfio: error: device %s is already attached\n",
> path);
> + vfio_put_group(group);
> + return -1;
> + }
> + }
> +
> + ret = vfio_get_device(group, path, vdev);
> + if (ret) {
> + error_report("vfio: failed to get device %s", path);
> + vfio_put_group(group);
> + return -1;
> + }
> +
> + /* Get a copy of config space */
> + assert(pci_config_size(&vdev->pdev) <= vdev->config_size);
> + ret = pread(vdev->fd, vdev->pdev.config,
> + pci_config_size(&vdev->pdev), vdev->config_offset);
> + if (ret < (int)pci_config_size(&vdev->pdev)) {
> + error_report("vfio: Failed to read device config space\n");
> + goto out_put;
> + }
> +
> + /*
> + * Clear host resource mapping info. If we choose not to register a
> + * BAR, such as might be the case with the option ROM, we can get
> + * confusing, unwritable, residual addresses from the host here.
> + */
> + memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
> + memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
> +
> + vfio_load_rom(vdev);
> +
> + if (vfio_early_setup_msix(vdev)) {
> + goto out_put;
> + }
> +
> + vfio_map_bars(vdev);
> +
> + if (vfio_add_capabilities(vdev)) {
> + goto out_teardown;
> + }
> +
> + if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> + pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
> + }
> +
> + if (vfio_enable_intx(vdev)) {
Although vfio_enable_intx also check for PCI_INTERRUPT_PIN, I would move
this under the test above - more consistent when reading the code.
> + goto out_teardown;
> + }
> +
> + return 0;
> +
> +out_teardown:
> + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> + vfio_teardown_msi(vdev);
> + vfio_unmap_bars(vdev);
> +out_put:
> + vfio_put_device(vdev);
> + vfio_put_group(group);
> + return -1;
> +}
> +
> +static void vfio_exitfn(struct PCIDevice *pdev)
> +{
> + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> + VFIOGroup *group = vdev->group;
> +
> + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> + vfio_disable_interrupts(vdev);
> + vfio_teardown_msi(vdev);
> + vfio_unmap_bars(vdev);
> + vfio_put_device(vdev);
> + vfio_put_group(group);
> +}
> +
> +static void vfio_reset(DeviceState *dev)
> +{
> + PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +
> + if (!vdev->reset_works) {
> + return;
> + }
> +
> + if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
> + error_report("vfio: Error unable to reset physical device "
> + "(%04x:%02x:%02x.%x): %s\n", vdev->host.domain,
> + vdev->host.bus, vdev->host.slot, vdev->host.function,
> + strerror(errno));
> + }
> +}
> +
> +static Property vfio_pci_dev_properties[] = {
> + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
> + /*
> + * TODO - support passed fds... is this necessary?
> + * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
> + * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
> + */
> + DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +
> +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> +{
> + PCIDeviceClass *dc = PCI_DEVICE_CLASS(klass);
> +
> + dc->parent_class.reset = vfio_reset;
> + dc->init = vfio_initfn;
> + dc->exit = vfio_exitfn;
> + dc->config_read = vfio_pci_read_config;
> + dc->config_write = vfio_pci_write_config;
> + dc->parent_class.props = vfio_pci_dev_properties;
> +}
> +
> +static TypeInfo vfio_pci_dev_info = {
> + .name = "vfio-pci",
> + .parent = TYPE_PCI_DEVICE,
> + .instance_size = sizeof(VFIODevice),
> + .class_init = vfio_pci_dev_class_init,
> +};
> +
> +static void register_vfio_pci_dev_type(void)
> +{
> + type_register_static(&vfio_pci_dev_info);
> +}
> +
> +type_init(register_vfio_pci_dev_type)
> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> new file mode 100644
> index 0000000..0a71bce
> --- /dev/null
> +++ b/hw/vfio_pci.h
> @@ -0,0 +1,101 @@
> +#ifndef HW_VFIO_PCI_H
> +#define HW_VFIO_PCI_H
> +
> +#include "qemu-common.h"
> +#include "qemu-queue.h"
> +#include "pci.h"
> +#include "event_notifier.h"
> +
> +typedef struct VFIOBAR {
> + off_t fd_offset; /* offset of BAR within device fd */
> + int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
> + MemoryRegion mem; /* slow, read/write access */
> + MemoryRegion mmap_mem; /* direct mapped access */
> + void *mmap;
> + size_t size;
> + uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
> + uint8_t nr; /* cache the BAR number for debug */
> +} VFIOBAR;
> +
> +typedef struct INTx {
> + bool pending; /* interrupt pending */
> + bool kvm_accel; /* set when QEMU bypass through KVM enabled */
> + uint8_t pin; /* which pin to pull for qemu_set_irq */
> + EventNotifier interrupt; /* eventfd triggered on interrupt */
> + EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
> + PCIINTxRoute route; /* routing info for QEMU bypass */
> +} INTx;
Please add a VFIO prefix.
> +
> +struct VFIODevice;
> +
> +typedef struct MSIVector {
> + EventNotifier interrupt; /* eventfd triggered on interrupt */
> + struct VFIODevice *vdev; /* back pointer to device */
> + int vector; /* the vector number for this element */
Could also be calculated via vector - vector->vdev->msi_vectors. But I
don't mind.
> + int virq; /* KVM irqchip route for QEMU bypass */
> + bool use;
> +} MSIVector;
Also here. Just in case we ever decide to introduce a generic structure
with this name.
> +
> +enum {
> + INT_NONE = 0,
> + INT_INTx = 1,
> + INT_MSI = 2,
> + INT_MSIX = 3,
> +};
> +
> +struct VFIOGroup;
> +
> +typedef struct VFIOContainer {
> + int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> + struct {
> + /* enable abstraction to support various iommu backends */
> + union {
> + MemoryListener listener; /* Used by type1 iommu */
> + };
> + void (*release)(struct VFIOContainer *);
> + } iommu_data;
> + QLIST_HEAD(, VFIOGroup) group_list;
> + QLIST_ENTRY(VFIOContainer) next;
> +} VFIOContainer;
> +
> +/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map
> */
> +typedef struct MSIXInfo {
> + uint8_t table_bar;
> + uint8_t pba_bar;
> + uint16_t entries;
> + uint32_t table_offset;
> + uint32_t pba_offset;
> + MemoryRegion mmap_mem;
> + void *mmap;
> +} MSIXInfo;
Also a pretty generic name.
> +
> +typedef struct VFIODevice {
> + PCIDevice pdev;
> + int fd;
> + INTx intx;
> + unsigned int config_size;
> + off_t config_offset; /* Offset of config space region within device fd */
> + unsigned int rom_size;
> + off_t rom_offset; /* Offset of ROM region within device fd */
> + int msi_cap_size;
> + MSIVector *msi_vectors;
> + MSIXInfo *msix;
> + int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
> + int interrupt; /* Current interrupt type */
> + VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
> + PCIHostDeviceAddress host;
> + QLIST_ENTRY(VFIODevice) next;
> + struct VFIOGroup *group;
> + bool reset_works;
> +} VFIODevice;
> +
> +typedef struct VFIOGroup {
> + int fd;
> + int groupid;
> + VFIOContainer *container;
> + QLIST_HEAD(, VFIODevice) device_list;
> + QLIST_ENTRY(VFIOGroup) next;
> + QLIST_ENTRY(VFIOGroup) container_next;
> +} VFIOGroup;
> +
> +#endif /* HW_VFIO_PCI_H */
>
Why do all theses structs have to go into a header file? Will there be
more users than vfio_pci.c?
Jan
--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux