qemu-devel

Re: 9pfs: Twalk crash


From: Greg Kurz
Subject: Re: 9pfs: Twalk crash
Date: Tue, 31 Aug 2021 19:04:54 +0200

On Tue, 31 Aug 2021 17:00:32 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Tuesday, 31 August 2021 12:57:49 CEST Greg Kurz wrote:
> > On Mon, 30 Aug 2021 17:55:04 +0200
> > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > Apparently commit 8d6cb100731c4d28535adbf2a3c2d1f29be3fef4 '9pfs: reduce
> > > latency of Twalk' has introduced occasional crashes.
> > > 
> > > My first impression after looking at the backtrace: the patch itself is
> > > probably not causing this, but rather unmasked an existing issue (i.e. it
> > > increased the chance of it being triggered).
> > > 
> > > The crash happens because 'elem' is NULL in virtio_pdu_vunmarshal() (frame 0).
> > Ouch... this certainly isn't expected to happen :-\
> > 
> > elem is popped out of the vq in handle_9p_output():
> > 
> >         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
> >         if (!elem) {
> >             goto out_free_pdu;
> >         }
> > [...]
> >         v->elems[pdu->idx] = elem;
> > 
> > and cleared on PDU completion in virtio_9p_push_and_notify() :
> > 
> >     v->elems[pdu->idx] = NULL;
> > 
> > 
> > I can't think of a way where push_and_notify() could collide
> > with handle_output()... both are supposed to be run sequentially
> > by the main event loop.
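> > 
> > FWIW, the slot array lives in V9fsVirtioState (hw/9pfs/virtio-9p.h),
> > which IIRC looks something like this, i.e. one slot per in-flight
> > request, indexed by pdu->idx:
> > 
> >     typedef struct V9fsVirtioState {
> >         VirtIODevice parent_obj;
> >         VirtQueue *vq;
> >         size_t config_size;
> >         VirtQueueElement *elems[MAX_REQ];
> >         V9fsState state;
> >     } V9fsVirtioState;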
> 
> Ok, I made a quick "is same thread" assertion check:
> 
> diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
> index 54ee93b71f..bb6ebd16aa 100644
> --- a/hw/9pfs/virtio-9p-device.c
> +++ b/hw/9pfs/virtio-9p-device.c
> @@ -28,12 +28,25 @@
>  #include "qemu/module.h"
>  #include "sysemu/qtest.h"
>  
> +static void assert_thread(void) {
> +    static QemuThread thread;
> +    static bool calledBefore;
> +    if (!calledBefore) {
> +        calledBefore = 1;
> +        qemu_thread_get_self(&thread);
> +        return;
> +    }
> +    assert(qemu_thread_is_self(&thread));
> +}
> +
>  static void virtio_9p_push_and_notify(V9fsPDU *pdu)
>  {
>      V9fsState *s = pdu->s;
>      V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
>      VirtQueueElement *elem = v->elems[pdu->idx];
>  
> +    assert_thread();
> +
>      /* push onto queue and notify */
>      virtqueue_push(v->vq, elem, pdu->size);
>      g_free(elem);
> @@ -51,6 +64,8 @@ static void handle_9p_output(VirtIODevice *vdev, VirtQueue *vq)
>      ssize_t len;
>      VirtQueueElement *elem;
>  
> +    assert_thread();
> +
>      while ((pdu = pdu_alloc(s))) {
>          P9MsgHeader out;
>  
> @@ -125,6 +140,8 @@ static ssize_t virtio_pdu_vmarshal(V9fsPDU *pdu, size_t offset,
>      VirtQueueElement *elem = v->elems[pdu->idx];
>      ssize_t ret;
>  
> +    assert_thread();
> +
>      ret = v9fs_iov_vmarshal(elem->in_sg, elem->in_num, offset, 1, fmt, ap);
>      if (ret < 0) {
>          VirtIODevice *vdev = VIRTIO_DEVICE(v);
> @@ -143,6 +160,8 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t offset,
>      VirtQueueElement *elem = v->elems[pdu->idx];
>      ssize_t ret;
>  
> +    assert_thread();
> +
>      ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);
>      if (ret < 0) {
>          VirtIODevice *vdev = VIRTIO_DEVICE(v);
> @@ -160,6 +179,8 @@ static void virtio_init_in_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov,
>      VirtQueueElement *elem = v->elems[pdu->idx];
>      size_t buf_size = iov_size(elem->in_sg, elem->in_num);
>  
> +    assert_thread();
> +
>      if (buf_size < size) {
>          VirtIODevice *vdev = VIRTIO_DEVICE(v);
>  
> @@ -180,6 +201,8 @@ static void virtio_init_out_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov,
>      VirtQueueElement *elem = v->elems[pdu->idx];
>      size_t buf_size = iov_size(elem->out_sg, elem->out_num);
>  
> +    assert_thread();
> +
>      if (buf_size < size) {
>          VirtIODevice *vdev = VIRTIO_DEVICE(v);
>  
> And it triggered. However, I am not sure whether some of the functions I
> asserted above are actually allowed to be executed on a different thread
> than the main thread:
> 
> Program terminated with signal SIGABRT, Aborted.
> #0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
> 50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
> [Current thread is 1 (Thread 0x7fd0bcef1700 (LWP 6470))]

Based on the thread number, it seems that the signal was raised by
the main event thread...

> (gdb) bt
> #0  0x00007fd3fe6db7bb in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
> #1  0x00007fd3fe6c6535 in __GI_abort () at abort.c:79
> #2  0x00007fd3fe6c640f in __assert_fail_base (fmt=0x7fd3fe828ee0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x563569935704 "qemu_thread_is_self(&thread)", file=0x5635699356e6 "../hw/9pfs/virtio-9p-device.c", ...)
> #3  0x00007fd3fe6d4102 in __GI___assert_fail (assertion=assertion@entry=0x563569935704 "qemu_thread_is_self(&thread)", file=file@entry=0x5635699356e6 "../hw/9pfs/virtio-9p-device.c", line=...)

What is the content of thread then if it isn't pthread_self()?
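
Something like this against the core should tell us (assuming the POSIX
flavour of QemuThread, which just wraps a pthread_t; with glibc that value
is the same 0x... id gdb prints in "info threads"):

    (gdb) frame 4
    (gdb) p/x thread.thread
    (gdb) info threads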

> #4  0x00005635695c98aa in assert_thread () at ../hw/9pfs/virtio-9p-device.c:39
> #5  0x00005635695c98aa in assert_thread () at ../hw/9pfs/virtio-9p-device.c:31
> #6  0x00005635695c9a4b in virtio_9p_push_and_notify (pdu=0x56356bc38f38) at ../hw/9pfs/virtio-9p-device.c:48
> #7  0x00005635693c2f28 in pdu_complete (pdu=pdu@entry=0x56356bc38f38, len=<optimized out>, len@entry=-4) at ../hw/9pfs/9p.c:1059
> #8  0x00005635693c49e0 in v9fs_walk (opaque=0x56356bc38f38) at ../hw/9pfs/9p.c:1868
> #9  0x00005635698256bb in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at ../util/coroutine-ucontext.c:173
> #10 0x00007fd3fe6f0b50 in __correctly_grouped_prefixwc (begin=0x2 <error: Cannot access memory at address 0x2>, end=0x0, thousands=0 L'\000', grouping=0x7fd3fe6db7bb <__GI_raise+267> "H\213\214$\b\1
> #11 0x0000000000000000 in  ()
> (gdb)
> 
> Also, there is no official qemu_main_thread() function yet, it seems. That
> might be useful in general.
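> 
> Just to sketch what I mean (hypothetical names, this API doesn't exist
> yet; the main thread would register itself once early during startup):
> 
>     static QemuThread main_thread;
> 
>     /* to be called exactly once, from the main thread */
>     void qemu_register_main_thread(void)
>     {
>         qemu_thread_get_self(&main_thread);
>     }
> 
>     bool qemu_is_main_thread(void)
>     {
>         return qemu_thread_is_self(&main_thread);
>     }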
> 
> > Maybe activate some traces?
> 
> I need to read up on how to use traces, then I'll check that as well. But
> ensuring thread sanity in the way proposed above would probably make sense
> first.
> 

Yeah, and we don't have traces at the virtio-9p device level...

Anyway, have a look at hw/9pfs/trace-events to see which traces exist,
and then just add "-trace v9fs_blabla" to the QEMU command line. You can
pass several -trace options, and I think you can use wildcards too, e.g.
"-trace v9fs_*".

> I also tried a little retry hack as a test, just in case this was some sort
> of "not ready yet" issue, because I saw Philippe was working on some virtio
> queue not ready issue lately:
> https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04460.html
> https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04471.html
> 
> diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
> index 54ee93b71f..ba332c27b6 100644
> --- a/hw/9pfs/virtio-9p-device.c
> +++ b/hw/9pfs/virtio-9p-device.c
> @@ -141,6 +141,19 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t offset,
>      V9fsState *s = pdu->s;
>      V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
>      VirtQueueElement *elem = v->elems[pdu->idx];
> +    for (int i = 0; !elem; ++i) {
> +        bool in_coroutine = qemu_in_coroutine();
> +        printf("TRIGGERED %d (in_coroutine=%d)!\n", i, in_coroutine);
> +        fflush(stdout);
> +        if (in_coroutine) {
> +            qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
> +        } else {
> +            g_usleep(100);
> +        }
> +        s = pdu->s;
> +        v = container_of(s, V9fsVirtioState, state);
> +        elem = v->elems[pdu->idx];
> +    }
>      ssize_t ret;
>  
>      ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);
> 
> But that did not reveal much:
> 
> TRIGGERED 0 (in_coroutine=1)!

Well, it proves that virtio_pdu_vunmarshal() is indeed passed
a PDU without an associated queue element, which is really a
no-go...

> TRIGGERED 1 (in_coroutine=1)!
> TRIGGERED 2 (in_coroutine=1)!
> ...
> TRIGGERED 43 (in_coroutine=1)!
> TRIGGERED 44 (in_coroutine=1)!
> qemu_aio_coroutine_enter: Co-routine was already scheduled in 'qemu_co_sleep_ns'
> Aborted (core dumped)
> 

Strange...

> > > bt taken with HEAD being 8d6cb100731c4d28535adbf2a3c2d1f29be3fef4:
> > > 
> > > Program terminated with signal SIGSEGV, Segmentation fault.
> > > #0  virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7, fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0) at ../hw/9pfs/virtio-9p-device.c:146
> > > 146         ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);
> > > [Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
> > > (gdb) bt full
> > > #0  0x000055a934dfb9a7 in virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7, fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0) at ../hw/9pfs/virtio-9p-device.c:146
> > > 
> > >         s = 0x55a93717b4b8
> > >         v = 0x55a93717aee0
> > >         elem = 0x0
> > 
> > So this is v->elems[pdu->idx]... what's the value of pdu->idx ?
> 
> In that originally posted core dump it was 113:
> 
> #0  virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7, fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0) at ../hw/9pfs/virtio-9p-device.c:146
> 146         ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);
> [Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
> (gdb) p pdu->idx
> $1 = 113
> 

Ok, so it is a valid index (not over MAX_REQ). So it would mean
that someone cleared the slot behind our back?
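
If you can reproduce this under gdb, maybe a location watchpoint on the
slot would catch whoever clears it, e.g. (hypothetical session, assuming
the index is stable across runs):

    (gdb) break handle_9p_output
    (gdb) continue
    ... once the request with pdu->idx == 113 is in flight ...
    (gdb) watch -l v->elems[113]
    (gdb) continue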

> > >         ret = <optimized out>
> > > 
> > > #1  0x000055a934bf35e8 in pdu_unmarshal (pdu=pdu@entry=0x55a93717cde8, offset=offset@entry=7, fmt=fmt@entry=0x55a9352766d1 "ddw") at ../hw/9pfs/9p.c:71
> > > 
> > >         ret = <optimized out>
> > >         ap = {{gp_offset = 24, fp_offset = 48, overflow_arg_area = 0x7f38a9ad9db0, reg_save_area = 0x7f38a9ad9cf0}}
> > > #2  0x000055a934bf68db in v9fs_walk (opaque=0x55a93717cde8) at ../hw/9pfs/9p.c:1720
> > > 
> > >         name_idx = <optimized out>
> > >         qids = 0x0
> > >         i = <optimized out>
> > >         err = 0
> > >         dpath = {size = 0, data = 0x0}
> > >         path = {size = 0, data = 0x0}
> > >         pathes = 0x0
> > >         nwnames = 1
> > >         stbuf = {st_dev = 2050, st_ino = 1199848, st_nlink = 1, st_mode = 41471, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 13, st_blksize = 4096, st_blocks = 16, ...}
> > >         fidst = {st_dev = 2050, st_ino = 1198183, st_nlink = 3, st_mode = 16877, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 12288, st_blksize = 4096, st_blocks = 32}
> > > 
> > >         stbufs = 0x0
> > >         offset = 7
> > >         fid = 299
> > >         newfid = 687
> > >         wnames = 0x0
> > >         fidp = <optimized out>
> > >         newfidp = 0x0
> > >         pdu = 0x55a93717cde8
> > >         s = 0x55a93717b4b8
> > >         qid = {type = 2 '\002', version = 1556732739, path = 2399697}
> > > 
> > > #3  0x000055a93505760b in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at ../util/coroutine-ucontext.c:173