qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [Qemu-block] RFC cdrom in own thread?


From: Peter Lieven
Subject: Re: [Qemu-devel] [Qemu-block] RFC cdrom in own thread?
Date: Fri, 14 Aug 2015 16:45:35 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.4.0

Am 14.08.2015 um 16:08 schrieb Kevin Wolf:
> Am 14.08.2015 um 15:43 hat Peter Lieven geschrieben:
>> Am 22.06.2015 um 23:54 schrieb John Snow:
>>> On 06/22/2015 09:09 AM, Peter Lieven wrote:
>>>> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
>>>>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven <address@hidden> wrote:
>>>>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>>>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <address@hidden> wrote:
>>>>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
>>>>>>>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from
>>>>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>>>>>> No symbol table info available.
>>>>>>>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0,
>>>>>>>>>>>> nfds=3,
>>>>>>>>>>>>       timeout=4999424576) at qemu-timer.c:326
>>>>>>>>>>>>           ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>>>>>           tvsec = 4
>>>>>>>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0,
>>>>>>>>>>>> blocking=true)
>>>>>>>>>>>>       at aio-posix.c:231
>>>>>>>>>>>>           node = 0x0
>>>>>>>>>>>>           was_dispatching = false
>>>>>>>>>>>>           ret = 1
>>>>>>>>>>>>           progress = false
>>>>>>>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
>>>>>>>>>>>> offset=4292007936,
>>>>>>>>>>>>       qiov=0x7ffff554f760, is_write=false, flags=0) at
>>>>>>>>>>>> block.c:2699
>>>>>>>>>>>>           aio_context = 0x5555563528e0
>>>>>>>>>>>>           co = 0x5555563888a0
>>>>>>>>>>>>           rwco = {bs = 0x55555637eae0, offset = 4292007936,
>>>>>>>>>>>>             qiov = 0x7ffff554f760, is_write = false, ret =
>>>>>>>>>>>> 2147483647,
>>>>>>>>>>>> flags = 0}
>>>>>>>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false,
>>>>>>>>>>>> flags=0)
>>>>>>>>>>>>       at block.c:2722
>>>>>>>>>>>>           qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1,
>>>>>>>>>>>> size =
>>>>>>>>>>>> 2048}
>>>>>>>>>>>>           iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
>>>>>>>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>>>>>> No locals.
>>>>>>>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at
>>>>>>>>>>>> block/block-backend.c:404
>>>>>>>>>>>> No locals.
>>>>>>>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
>>>>>>>>>>>> lba=2095707,
>>>>>>>>>>>>       buf=0x7ffff44cc800 "(", sector_size=2048) at
>>>>>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>>>>>           ret = 32767
>>>>>>>>>>> Here is the problem: The ATAPI emulation uses synchronous
>>>>>>>>>>> blk_read()
>>>>>>>>>>> instead of the AIO or coroutine interfaces. This means that it
>>>>>>>>>>> keeps
>>>>>>>>>>> polling for request completion while it holds the BQL until the
>>>>>>>>>>> request
>>>>>>>>>>> is completed.
>>>>>>>>>> I will look at this.
>>>>>>>> I need some further help. My way to "emulate" a hung NFS Server is to
>>>>>>>> block it in the Firewall. Currently I face the problem that I
>>>>>>>> cannot mount
>>>>>>>> a CD Iso via libnfs (nfs://) without hanging Qemu (I previously
>>>>>>>> tried with
>>>>>>>> a kernel NFS mount). It reads a few sectors and then stalls (maybe
>>>>>>>> another
>>>>>>>> bug):
>>>>>>>>
>>>>>>>> (gdb) thread apply all bt full
>>>>>>>>
>>>>>>>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
>>>>>>>> #0  qemu_cond_broadcast (address@hidden) at
>>>>>>>> util/qemu-thread-posix.c:120
>>>>>>>>          err = <optimized out>
>>>>>>>>          __func__ = "qemu_cond_broadcast"
>>>>>>>> #1  0x0000555555911164 in rfifolock_unlock
>>>>>>>> (address@hidden) at
>>>>>>>> util/rfifolock.c:75
>>>>>>>>          __PRETTY_FUNCTION__ = "rfifolock_unlock"
>>>>>>>> #2  0x0000555555875921 in aio_context_release
>>>>>>>> (address@hidden)
>>>>>>>> at async.c:329
>>>>>>>> No locals.
>>>>>>>> #3  0x000055555588434c in aio_poll (address@hidden,
>>>>>>>> address@hidden) at aio-posix.c:272
>>>>>>>>          node = <optimized out>
>>>>>>>>          was_dispatching = false
>>>>>>>>          i = <optimized out>
>>>>>>>>          ret = <optimized out>
>>>>>>>>          progress = false
>>>>>>>>          timeout = 611734526
>>>>>>>>          __PRETTY_FUNCTION__ = "aio_poll"
>>>>>>>> #4  0x00005555558bc43d in bdrv_prwv_co (address@hidden,
>>>>>>>> address@hidden, address@hidden,
>>>>>>>> address@hidden, address@hidden(unknown: 0)) at
>>>>>>>> block/io.c:552
>>>>>>>>          aio_context = 0x5555562598b0
>>>>>>>>          co = <optimized out>
>>>>>>>>          rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
>>>>>>>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags =
>>>>>>>> (unknown: 0)}
>>>>>>>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
>>>>>>>> address@hidden, address@hidden "(",
>>>>>>>> address@hidden, address@hidden,
>>>>>>>>      address@hidden(unknown: 0)) at block/io.c:575
>>>>>>>>          qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size
>>>>>>>> = 2048}
>>>>>>>>          iov = {iov_base = 0x555557874800, iov_len = 2048}
>>>>>>>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
>>>>>>>> address@hidden, address@hidden "(",
>>>>>>>> address@hidden) at block/io.c:583
>>>>>>>> No locals.
>>>>>>>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
>>>>>>>> address@hidden, address@hidden "(",
>>>>>>>> address@hidden) at block/block-backend.c:493
>>>>>>>>          ret = <optimized out>
>>>>>>>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
>>>>>>>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at
>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>          ret = <optimized out>
>>>>>>>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
>>>>>>>>          byte_count_limit = <optimized out>
>>>>>>>>          size = <optimized out>
>>>>>>>>          ret = 2
>>>>>>> This is still the same scenario Kevin explained.
>>>>>>>
>>>>>>> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
>>>>>>> function holds the QEMU global mutex while waiting for the I/O request
>>>>>>> to complete.  This blocks other vcpu threads and the main loop thread.
>>>>>>>
>>>>>>> The solution is to convert the CD-ROM emulation code to use
>>>>>>> blk_aio_readv() instead of blk_read().
>>>>>> I tried a little, but I am stuck with my approach. It reads one sector
>>>>>> and then doesn't continue. Maybe someone with more knowledge
>>>>>> of ATAPI/IDE could help?
>>>>> Converting synchronous code to asynchronous requires an understanding
>>>>> of the device's state transitions.  Asynchronous code has to put the
>>>>> device registers into a busy state until the request completes.  It
>>>>> also needs to handle hardware register accesses that occur while the
>>>>> request is still pending.
>>>> That was my assumption as well. But I don't know how to proceed...
>>>>
>>>>> I don't know ATAPI/IDE code well enough to suggest a fix.
>>>> Maybe @John can help?
>>>>
>>>> Peter
>>>>
>> I looked into this again and it seems that the remaining problem (at least 
>> when the CDROM is
>> mounted via libnfs) is the blk_drain_all() in bmdma_cmd_writeb. At least I 
>> end there if I have
>> a proper OS booted and cut off the NFS server. The VM remains responsive 
>> until the guest OS
>> issues a DMA cancel.
>>
>> I do not know what the proper solution is. I had the following ideas so far 
>> (not knowing if the
>> approaches would be correct or not).
>>
>> a) Do not clear BM_STATUS_DMAING if we are not able to drain all requests. 
>> This works until
>> the connection is reestablished. The guest OS issues DMA cancel operations 
>> again and
>> again, but when the connectivity is back I end in the following assertion:
>>
>> qemu-system-x86_64: ./hw/ide/pci.h:65: bmdma_active_if: Assertion 
>> `bmdma->bus->retry_unit != (uint8_t)-1' failed.
> I would have to check the specs to see if this is allowed.
>
>> b) Call the aiocb with -ECANCELED and somehow (?) turn all the callbacks of 
>> the outstanding IOs into NOPs.
> This wouldn't be correct for write requests: We would tell the guest
> that the request is cancelled when it's actually still in flight. At
> some point it could still complete, however, and that's not expected by
> the guest.
>
>> c) Follow the hint in the comment in bmdma_cmd_writeb (however this works 
>> out):
>>              * In the future we'll be able to safely cancel the I/O if the
>>              * whole DMA operation will be submitted to disk with a single
>>              * aio operation with preadv/pwritev.
> Not sure how likely it is that cancelling that single AIOCB will
> actually cancel the operation and not end up doing bdrv_drain_all()
> internally instead because there is no good way of cancelling the
> request.

Maybe this is a solution? It seems to work for the CDROM only case:

diff --git a/block/io.c b/block/io.c
index d4bc83b..475d44c 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2075,7 +2075,9 @@ static const AIOCBInfo bdrv_em_co_aiocb_info = {
 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 {
     if (!acb->need_bh) {
-        acb->common.cb(acb->common.opaque, acb->req.error);
+        if (acb->common.cb) {
+            acb->common.cb(acb->common.opaque, acb->req.error);
+        }
         qemu_aio_unref(acb);
     }
 }
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index d31ff88..fecfa3e 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -252,11 +252,17 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
              * whole DMA operation will be submitted to disk with a single
              * aio operation with preadv/pwritev.
              */
-            if (bm->bus->dma->aiocb) {
-                blk_drain_all();
-                assert(bm->bus->dma->aiocb == NULL);
-            }
-            bm->status &= ~BM_STATUS_DMAING;
+             if (bm->bus->dma->aiocb) {
+                 bool read_write = false;
+                 read_write |= bm->bus->master && !blk_is_read_only(bm->bus->master->conf.blk);
+                 read_write |= bm->bus->slave && !blk_is_read_only(bm->bus->slave->conf.blk);
+                 if (read_write) {
+                     blk_drain_all();
+                 } else {
+                     bm->bus->dma->aiocb->cb = NULL;
+                 }
+             }
+             bm->status &= ~BM_STATUS_DMAING;
         } else {
             bm->cur_addr = bm->addr;
             if (!(bm->status & BM_STATUS_DMAING)) {


>
> Kevin




reply via email to

[Prev in Thread] Current Thread [Next in Thread]