qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [RFC/PATCH] Fix guest OS panic when 64bit BAR is present


From: Alexey Korolev
Subject: [Qemu-devel] [RFC/PATCH] Fix guest OS panic when 64bit BAR is present
Date: Wed, 25 Jan 2012 18:46:03 +1300
User-agent: Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20111124 Thunderbird/8.0

Hi, 
In this post
http://lists.gnu.org/archive/html/qemu-devel/2011-12/msg03171.html I've
mentioned about the issues when 64Bit PCI BAR is present and 32bit
address range is selected for it.
The issue affects all recent qemu releases and all
old and recent guest Linux kernel versions.

We've done some investigations. Let me explain what happens.
Assume we have 64bit BAR with size 32MB mapped at [0xF0000000 -
0xF2000000]

When Linux guest starts it does PCI bus enumeration.
The OS enumerates 64BIT bars using the following procedure.
1. Write all FF's to lower half of 64bit BAR
2. Write address back to lower half of 64bit BAR
3. Write all FF's to higher half of 64bit BAR
4. Write address back to higher half of 64bit BAR

Linux code is here: 
http://lxr.linux.no/#linux+v3.2.1/drivers/pci/probe.c#L149

What does it mean for qemu?

At step 1. qemu pci_default_write_config() recevies all FFs for lower
part of the 64bit BAR. Then it applies the mask and converts the value
to "All FF's - size + 1" (FE000000 if size is 32MB).
Then pci_bar_address() checks if BAR address is valid. Since it is a
64bit bar it reads 0x00000000FE000000 - this address is valid. So qemu
updates topology and sends request to update mappings in KVM with new
range for the 64bit BAR FE000000 - 0xFFFFFFFF. This usually means kernel
panic on boot, if there is another mapping in the FE000000 - 0xFFFFFFFF
range, which is quite common.


The following patch fixes the issue. It affects 64bit PCI BAR's only. 
The idea of the patch is: we introduce the states for low and high BARs
whose can have 3 possible values: BAR_VALID, PCIBAR64_PARTIAL_SIZE_QUERY
- someone has requested size of one half of the 64bit PCI BAR,
PCIBAR64_PARTIAL_ADDR_PROGRAM - someone has sent a request to update the
address of one half of the 64bit PCI BAR. The state becomes BAR_VALID
when both halfs are in the same state. We ignore BAR value until both
states become BAR_VALID

Note: Please use the latest Seabios version (commit
139d5ac037de828f89c36e39c6dd15610650cede and later), as older versions
didn't initialize high part of 64bit BAR. 

The patch is tested on Linux 2.6.18 - 3.1.0 and Windows 2008 Server

Signed-off-by: Alexey Korolev <address@hidden>
---
 hw/pci.c |   45 +++++++++++++++++++++++++++++++++++++++++++++
 hw/pci.h |    7 +++++++
 2 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/hw/pci.c b/hw/pci.c
index 57ec104..3a7deb2 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -1055,6 +1055,40 @@ static pcibus_t pci_bar_address(PCIDevice *d,
     return new_addr;
 }
 
+static void pci_update_region_state(PCIDevice *d, uint32_t addr, uint32_t val)
+{
+    PCIIORegion *r;
+    int barnum = (addr - PCI_BASE_ADDRESS_0) >> 2;
+    PCIBARState *state;
+
+    r = &d->io_regions[barnum];
+
+    if (d->io_regions[barnum].type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+        /* Programming low part of the 64bit BAR */
+        r = &d->io_regions[barnum];
+        state = &r->state_lo;
+    } else if (barnum > 0 &&
+        (d->io_regions[barnum - 1].type & PCI_BASE_ADDRESS_MEM_TYPE_64)) {
+        /* Programming high part of the 64bit BAR */
+        r = &d->io_regions[barnum - 1];
+        state = &r->state_hi;
+    } else {
+        /* Not a 64bit BAR's */
+        d->io_regions[barnum].state_lo = PCIBAR_VALID;
+        return;
+    }
+
+    /* Request to read BAR size */
+    if (val == -1U)
+        *state = PCIBAR64_PARTIAL_SIZE_QUERY;
+    else
+        *state = PCIBAR64_PARTIAL_ADDR_PROGRAM;
+
+
+    if (r->state_lo == r->state_hi)
+        r->state_lo = r->state_hi = PCIBAR_VALID;
+}
+
 static void pci_update_mappings(PCIDevice *d)
 {
     PCIIORegion *r;
@@ -1068,6 +1102,13 @@ static void pci_update_mappings(PCIDevice *d)
         if (!r->size)
             continue;
 
+        /* this region state is invalid */
+        if (r->state_lo != PCIBAR_VALID)
+            continue;
+        if ((r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) &&
+           (r->state_hi != PCIBAR_VALID))
+            continue;
+
         new_addr = pci_bar_address(d, i, r->type, r->size);
 
         /* This bar isn't changed */
@@ -1117,6 +1158,7 @@ uint32_t pci_default_read_config(PCIDevice *d,
 void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
 {
     int i, was_irq_disabled = pci_irq_disabled(d);
+    uint32_t orig_val = val;
 
     for (i = 0; i < l; val >>= 8, ++i) {
         uint8_t wmask = d->wmask[addr + i];
@@ -1133,6 +1175,9 @@ void pci_default_write_config(PCIDevice *d, uint32_t 
addr, uint32_t val, int l)
         assigned_dev_update_irqs();
 #endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */
 
+    if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24))
+        pci_update_region_state(d, addr, orig_val);
+
     if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) ||
         ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) ||
         ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) ||
diff --git a/hw/pci.h b/hw/pci.h
index 4220151..5d1e529 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -86,12 +86,19 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice *pci_dev,
 typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
                                 pcibus_t addr, pcibus_t size, int type);
 typedef int PCIUnregisterFunc(PCIDevice *pci_dev);
+typedef enum PCIBARState {
+     PCIBAR_VALID = 0,
+     PCIBAR64_PARTIAL_SIZE_QUERY,
+     PCIBAR64_PARTIAL_ADDR_PROGRAM
+} PCIBARState;
 
 typedef struct PCIIORegion {
     pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
 #define PCI_BAR_UNMAPPED (~(pcibus_t)0)
     pcibus_t size;
     uint8_t type;
+    PCIBARState state_lo;
+    PCIBARState state_hi;
     MemoryRegion *memory;
     MemoryRegion *address_space;
 } PCIIORegion;
-- 
1.7.5.4







reply via email to

[Prev in Thread] Current Thread [Next in Thread]