qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] Re: [PATCH 0/3] Refactor AIO to allow multiple AIO implemen


From: Anthony Liguori
Subject: [Qemu-devel] Re: [PATCH 0/3] Refactor AIO to allow multiple AIO implementations
Date: Mon, 22 Sep 2008 22:32:49 -0500
User-agent: Thunderbird 2.0.0.16 (X11/20080723)

Can you run the same performance tests with the following patches (using sync=on instead of cache=off)?

You'll need my aio_init fix too. I suspect this will give equally good performance to your patch set. That's not saying your patch set isn't useful, but I would like to get performance to be better for the case that we're going through the page cache.

Regards,

Anthony Liguori

Ryan Harper wrote:
The patchset adds additional AIO driver abstraction to the block raw driver to
support multiple aio implementations for each device.  The first patch pulls
the posix aio implementation out of the block-raw device using a generic call
to the newly created AIO Driver structure.  The posix aio implementation that
was contained in block-raw-posix.c has been refactored in to aio-posix.c.  The
next patch adds a linux aio implementation for raw devices being opened
O_DIRECT via cache=off drive option.  We only use linux aio when cache=off as
linux aio falls back to synchronous ops if not opened with O_DIRECT flag.

Addtional work has been done on top of QEMU for KVM and virtio-blk devices.
While virtio-blk is not yet upstream in QEMU, the AIO changes here provide a
tremendous performance improvement (from 7.6% of native, to 100% of randwrite,
and 3.9% of native, to 101.4% of native for seq write) for virtio
devices with cache=off.

Storage subsystem:
  IBM EXP300 - 14 Disk Fiber Expansion, 17G - 15K RPMS
  Host: AMD Barcelona, 2 socket, 8G RAM
  HBA: QLogic Corp. ISP2312-based 2Gb Fibre Channel to PCI-X HBA (rev 02)

Benchmark[1]:
           fio --name=guestrun --filename=/dev/mapper/volumes-fibre \
               --rw=randwrite --bs=16k --ioengine=libaio --direct=1 \
               --norandommap --runtime=120 --time_based --numjobs=1 \
               --group_reporting --thread --size=25g --write_lat_log \
               --write_bw_log --iodepth=74

Qemu parameters:
 -m 1024 \
 -drive file=/images/npt2-guest-virtio.qcow2,if=ide,boot=on,snapshot=off \
 -drive file=/dev/mapper/volumes-fibre,if=virtio,cache=(on|off) \
 -drive file=/dev/mapper/volumes-npt2--dom1,if=virtio,cache=off
 -net nic,macaddr=00:FF:FF:00:00:01,model=rtl8139 -net tap -vnc :123 \
 -monitor stdio

Guest io scheduler: noop

Results:
 These results are with the patch series applied to KVM (plus a small KVM only
 change -- KVM patches forthcoming).


 16k randwrite 1 thread, 74 iodepth | MB/s | avg sub lat (us) | avg comp lat 
(ms)
 
---------------------------------------+---------------------+------------------
 baremetal (O_DIRECT, aka cache=off)| 61.2 |   13.07          |  19.59
 kvm: cache=off posix-aio w/o patch |  4.7 | 3467.44          | 254.08
 kvm: cache=off linux-aio           | 61.1 |   75.35          |  19.57
 kvm: cache=on  posix-aio w/o patch |127.0 |  115.78          |   9.19
 kvm: cache=on  posix-aio w/ patch  |126.0 |   67.35          |   9.30


 16k write 1 thread, 74 iodepth     | MB/s | avg sub lat (us) | avg comp lat 
(ms)
 
---------------------------------------+---------------------+------------------
 baremetal (O_DIRECT, aka cache=off)|128.1 |   10.90          |   9.45
kvm: cache=off posix-aio w/o patch | 5.1 | 3152.00 | 231.06 kvm: cache=off linux-aio |130.0 | 83.83 | 8.99
 kvm: cache=on  posix-aio w/o patch |184.0 |   80.46          |   6.35
 kvm: cache=on  posix-aio w/ patch  |165.0 |   70.90          |   7.09


1. http://brick.kernel.dk/snaps/fio-1.21.tar.bz2

Index: vl.c
===================================================================
--- vl.c        (revision 5300)
+++ vl.c        (working copy)
@@ -5439,12 +5439,13 @@
     int max_devs;
     int index;
     int cache;
+    int sync;
     int bdrv_flags;
     char *str = arg->opt;
     static const char * const params[] = { "bus", "unit", "if", "index",
                                            "cyls", "heads", "secs", "trans",
                                            "media", "snapshot", "file",
-                                           "cache", "format", NULL };
+                                           "cache", "format", "sync", NULL };
 
     if (check_params(buf, sizeof(buf), params, str) < 0) {
          fprintf(stderr, "qemu: unknown parameter '%s' in '%s'\n",
@@ -5459,6 +5460,7 @@
     translation = BIOS_ATA_TRANSLATION_AUTO;
     index = -1;
     cache = 1;
+    sync = 0;
 
     if (!strcmp(machine->name, "realview") ||
         !strcmp(machine->name, "SS-5") ||
@@ -5612,6 +5614,17 @@
         }
     }
 
+    if (get_param_value(buf, sizeof(buf), "sync", str)) {
+        if (!strcmp(buf, "off"))
+            sync = 0;
+        else if (!strcmp(buf, "on"))
+            sync = 1;
+        else {
+           fprintf(stderr, "qemu: invalid sync option\n");
+           return -1;
+        }
+    }
+
     if (get_param_value(buf, sizeof(buf), "format", str)) {
        if (strcmp(buf, "?") == 0) {
             fprintf(stderr, "qemu: Supported formats:");
@@ -5728,6 +5741,8 @@
         bdrv_flags |= BDRV_O_SNAPSHOT;
     if (!cache)
         bdrv_flags |= BDRV_O_DIRECT;
+    if (sync)
+        bdrv_flags |= BDRV_O_SYNC;
     if (bdrv_open2(bdrv, file, bdrv_flags, drv) < 0 || qemu_key_check(bdrv, 
file)) {
         fprintf(stderr, "qemu: could not open disk image %s\n",
                         file);
Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c   (revision 5304)
+++ block-raw-posix.c   (working copy)
@@ -127,6 +127,8 @@
     if (flags & BDRV_O_DIRECT)
         open_flags |= O_DIRECT;
 #endif
+    if (flags & BDRV_O_SYNC)
+        open_flags |= O_SYNC;
 
     s->type = FTYPE_FILE;
 
@@ -937,6 +939,8 @@
     if (flags & BDRV_O_DIRECT)
         open_flags |= O_DIRECT;
 #endif
+    if (flags & BDRV_O_SYNC)
+        open_flags |= O_SYNC;
 
     s->type = FTYPE_FILE;
 #if defined(__linux__)
Index: block.h
===================================================================
--- block.h     (revision 5300)
+++ block.h     (working copy)
@@ -48,6 +48,7 @@
                                      it (default for
                                      bdrv_file_open()) */
 #define BDRV_O_DIRECT      0x0020
+#define BDRV_O_SYNC        0x0040
 
 void bdrv_info(void);
 void bdrv_info_stats(void);
Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c   (revision 5304)
+++ block-raw-posix.c   (working copy)
@@ -84,10 +84,20 @@
    reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
 
+#define RAW_FD_POOL_SIZE       4
+
+typedef struct RawFdPoolEntry
+{
+    int fd;
+    int inuse;
+} RawFdPoolEntry;
+
 typedef struct BDRVRawState {
     int fd;
     int type;
     unsigned int lseek_err_cnt;
+    int fd0_inuse;
+    RawFdPoolEntry fd_pool[RAW_FD_POOL_SIZE];
 #if defined(__linux__)
     /* linux floppy specific */
     int fd_open_flags;
@@ -109,6 +119,7 @@
 {
     BDRVRawState *s = bs->opaque;
     int fd, open_flags, ret;
+    int i;
 
     posix_aio_init();
 
@@ -138,6 +149,11 @@
         return ret;
     }
     s->fd = fd;
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
+        s->fd_pool[i].fd = -1;
+        s->fd_pool[i].inuse = 0;
+    }
+    s->fd0_inuse = 0;
 #if defined(O_DIRECT)
     s->aligned_buf = NULL;
     if (flags & BDRV_O_DIRECT) {
@@ -436,6 +452,7 @@
 
 typedef struct RawAIOCB {
     BlockDriverAIOCB common;
+    int fd;
     struct aiocb aiocb;
     struct RawAIOCB *next;
     int ret;
@@ -447,6 +464,52 @@
     RawAIOCB *first_aio;
 } PosixAioState;
 
+static int raw_fd_pool_get(BDRVRawState *s)
+{
+    if (s->fd0_inuse) {
+        int i;
+
+        for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
+            if (s->fd_pool[i].fd == -1) {
+                s->fd_pool[i].fd = dup(s->fd);
+                if (s->fd_pool[i].fd == -1)
+                    continue;
+                s->fd_pool[i].inuse = 0;
+            }
+
+            if (!s->fd_pool[i].inuse) {
+                s->fd_pool[i].inuse++;
+                return s->fd_pool[i].fd;
+            }
+        }
+    }
+    s->fd0_inuse++;
+
+    return s->fd;
+}
+
+static void raw_fd_pool_put(RawAIOCB *acb)
+{
+    BDRVRawState *s = acb->common.bs->opaque;
+    int fd = acb->fd;
+    int i;
+
+    if (s->fd == fd) {
+        s->fd0_inuse--;
+        return;
+    }
+
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
+        if (s->fd_pool[i].fd == fd) {
+            s->fd_pool[i].inuse--;
+            if (s->fd_pool[i].inuse == 0) {
+                close(s->fd_pool[i].fd);
+                s->fd_pool[i].fd = -1;
+            }
+        }
+    }
+}
+
 static void posix_aio_read(void *opaque)
 {
     PosixAioState *s = opaque;
@@ -487,6 +550,7 @@
             if (ret == ECANCELED) {
                 /* remove the request */
                 *pacb = acb->next;
+                raw_fd_pool_put(acb);
                 qemu_aio_release(acb);
             } else if (ret != EINPROGRESS) {
                 /* end of aio */
@@ -503,6 +567,7 @@
                 *pacb = acb->next;
                 /* call the callback */
                 acb->common.cb(acb->common.opaque, ret);
+                raw_fd_pool_put(acb);
                 qemu_aio_release(acb);
                 break;
             } else {
@@ -575,7 +640,8 @@
     acb = qemu_aio_get(bs, cb, opaque);
     if (!acb)
         return NULL;
-    acb->aiocb.aio_fildes = s->fd;
+    acb->fd = raw_fd_pool_get(s);
+    acb->aiocb.aio_fildes = acb->fd;
     acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
     acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
     acb->aiocb.aio_buf = buf;
@@ -682,6 +748,7 @@
             break;
         } else if (*pacb == acb) {
             *pacb = acb->next;
+            raw_fd_pool_put(acb);
             qemu_aio_release(acb);
             break;
         }
@@ -698,6 +765,7 @@
 static void raw_close(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
+    int i;
     if (s->fd >= 0) {
         close(s->fd);
         s->fd = -1;
@@ -706,6 +774,10 @@
             qemu_free(s->aligned_buf);
 #endif
     }
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
+        if (s->fd_pool[i].fd != -1)
+            close(s->fd_pool[i].fd);
+    } 
 }
 
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -973,6 +1045,18 @@
 }
 
 #if defined(__linux__)
+static void raw_invalidate_fd_pool(BDRVRawState *s)
+{
+    int i;
+    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
+        if (s->fd_pool[i].fd != -1) {
+            close(s->fd_pool[i].fd);
+            s->fd_pool[i].fd = -1;
+            s->fd_pool[i].inuse = 0;
+        }
+    }
+    s->fd0_inuse = 0;
+}
 
 /* Note: we do not have a reliable method to detect if the floppy is
    present. The current method is to try to open the floppy at every
@@ -989,6 +1073,7 @@
         (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
         close(s->fd);
         s->fd = -1;
+        raw_invalidate_fd_pool(s);
 #ifdef DEBUG_FLOPPY
         printf("Floppy closed\n");
 #endif
@@ -1089,6 +1174,7 @@
             if (s->fd >= 0) {
                 close(s->fd);
                 s->fd = -1;
+                raw_invalidate_fd_pool(s);
             }
             fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
             if (fd >= 0) {

reply via email to

[Prev in Thread] Current Thread [Next in Thread]