[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 1/5] linux-user: Refactor do_fork to use new `qemu_clone`
From: |
Alex Bennée |
Subject: |
Re: [PATCH 1/5] linux-user: Refactor do_fork to use new `qemu_clone` |
Date: |
Tue, 16 Jun 2020 16:51:45 +0100 |
User-agent: |
mu4e 1.5.3; emacs 28.0.50 |
Josh Kunz <jkz@google.com> writes:
> This is pre-work for adding full support for the `CLONE_VM` `clone`
> flag. In a follow-up patch, we'll add support to `clone.c` for
> `clone_vm`-type clones beyond threads. CLONE_VM support is more
> complicated, so first we're splitting existing clone mechanisms
> (pthread_create, and fork) into a separate file.
>
> Signed-off-by: Josh Kunz <jkz@google.com>
> ---
> linux-user/Makefile.objs | 2 +-
> linux-user/clone.c | 152 ++++++++++++++++
> linux-user/clone.h | 27 +++
> linux-user/syscall.c | 376 +++++++++++++++++++--------------------
> 4 files changed, 365 insertions(+), 192 deletions(-)
> create mode 100644 linux-user/clone.c
> create mode 100644 linux-user/clone.h
>
> diff --git a/linux-user/Makefile.objs b/linux-user/Makefile.objs
> index 1940910a73..d6788f012c 100644
> --- a/linux-user/Makefile.objs
> +++ b/linux-user/Makefile.objs
> @@ -1,7 +1,7 @@
> obj-y = main.o syscall.o strace.o mmap.o signal.o \
> elfload.o linuxload.o uaccess.o uname.o \
> safe-syscall.o $(TARGET_ABI_DIR)/signal.o \
> - $(TARGET_ABI_DIR)/cpu_loop.o exit.o fd-trans.o
> + $(TARGET_ABI_DIR)/cpu_loop.o exit.o fd-trans.o clone.o
>
> obj-$(TARGET_HAS_BFLT) += flatload.o
> obj-$(TARGET_I386) += vm86.o
> diff --git a/linux-user/clone.c b/linux-user/clone.c
> new file mode 100644
> index 0000000000..f02ae8c464
> --- /dev/null
> +++ b/linux-user/clone.c
> @@ -0,0 +1,152 @@
> +#include "qemu/osdep.h"
> +#include "qemu.h"
> +#include "clone.h"
> +#include <pthread.h>
> +#include <sched.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/syscall.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +#include <unistd.h>
> +#include <stdbool.h>
> +#include <assert.h>
> +
> +static const unsigned long NEW_STACK_SIZE = 0x40000UL;
> +
> +/*
> + * A completion tracks an event that can be completed. It's based on the
> + * kernel concept with the same name, but implemented with userspace locks.
> + */
> +struct completion {
> + /* done is set once this completion has been completed. */
> + bool done;
> + /* mu syncronizes access to this completion. */
> + pthread_mutex_t mu;
> + /* cond is used to broadcast completion status to awaiting threads. */
> + pthread_cond_t cond;
> +};
> +
> +static void completion_init(struct completion *c)
> +{
> + c->done = false;
> + pthread_mutex_init(&c->mu, NULL);
> + pthread_cond_init(&c->cond, NULL);
> +}
> +
> +/*
> + * Block until the given completion finishes. Returns immediately if the
> + * completion has already finished.
> + */
> +static void completion_await(struct completion *c)
> +{
> + pthread_mutex_lock(&c->mu);
> + if (c->done) {
> + pthread_mutex_unlock(&c->mu);
> + return;
> + }
> + pthread_cond_wait(&c->cond, &c->mu);
> + assert(c->done && "returned from cond wait without being marked as
> done");
> + pthread_mutex_unlock(&c->mu);
> +}
This introduces another sync mechanism specifically for clone behaviour
- is there a reason one of the exiting mechanisms can't be used? If this
brings new useful functionality it might be worth introducing it as a
system wide function.
> +
> +/*
> + * Finish the completion. Unblocks all awaiters.
> + */
> +static void completion_finish(struct completion *c)
> +{
> + pthread_mutex_lock(&c->mu);
> + assert(!c->done && "trying to finish an already finished completion");
> + c->done = true;
> + pthread_cond_broadcast(&c->cond);
> + pthread_mutex_unlock(&c->mu);
> +}
> +
> +struct clone_thread_info {
> + struct completion running;
> + int tid;
> + int (*callback)(void *);
> + void *payload;
> +};
> +
> +static void *clone_thread_run(void *raw_info)
> +{
> + struct clone_thread_info *info = (struct clone_thread_info *) raw_info;
> + info->tid = syscall(SYS_gettid);
> +
> + /*
> + * Save out callback/payload since lifetime of info is only guaranteed
> + * until we finish the completion.
> + */
> + int (*callback)(void *) = info->callback;
> + void *payload = info->payload;
> + completion_finish(&info->running);
> +
> + _exit(callback(payload));
> +}
> +
> +static int clone_thread(int flags, int (*callback)(void *), void
> *payload)
It's nicer to typedef a function call type rather than manually casting
to it each time.
> +{
> + struct clone_thread_info info;
> + pthread_attr_t attr;
> + int ret;
> + pthread_t thread_unused;
> +
> + memset(&info, 0, sizeof(info));
> +
> + completion_init(&info.running);
> + info.callback = callback;
> + info.payload = payload;
> +
> + (void)pthread_attr_init(&attr);
> + (void)pthread_attr_setstacksize(&attr, NEW_STACK_SIZE);
> + (void)pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
> +
> + ret = pthread_create(&thread_unused, &attr, clone_thread_run, (void *)
> &info);
> + /* pthread_create returns errors directly, instead of via errno. */
> + if (ret != 0) {
> + errno = ret;
> + ret = -1;
> + } else {
> + completion_await(&info.running);
> + ret = info.tid;
> + }
> +
> + pthread_attr_destroy(&attr);
> + return ret;
> +}
> +
> +int qemu_clone(int flags, int (*callback)(void *), void *payload)
> +{
> + int ret;
> +
> + if (clone_flags_are_thread(flags)) {
> + /*
> + * The new process uses the same flags as pthread_create, so we can
> + * use pthread_create directly. This is an optimization.
> + */
> + return clone_thread(flags, callback, payload);
> + }
> +
> + if (clone_flags_are_fork(flags)) {
> + /*
> + * Special case a true `fork` clone call. This is so we can take
> + * advantage of special pthread_atfork handlers in libraries we
> + * depend on (e.g., glibc). Without this, existing users of `fork`
> + * in multi-threaded environments will likely get new flaky
> + * deadlocks.
> + */
> + fork_start();
> + ret = fork();
> + if (ret == 0) {
> + fork_end(1);
> + _exit(callback(payload));
> + }
> + fork_end(0);
> + return ret;
> + }
> +
> + /* !fork && !thread */
> + errno = EINVAL;
> + return -1;
> +}
> diff --git a/linux-user/clone.h b/linux-user/clone.h
> new file mode 100644
> index 0000000000..34ae9b3780
> --- /dev/null
> +++ b/linux-user/clone.h
> @@ -0,0 +1,27 @@
> +#ifndef CLONE_H
> +#define CLONE_H
> +
> +/*
> + * qemu_clone executes the given `callback`, with the given payload as the
> + * first argument, in a new process created with the given flags. Some clone
> + * flags, such as *SETTLS, *CLEARTID are not supported. The child thread ID
> is
> + * returned on success, otherwise negative errno is returned on clone
> failure.
> + */
> +int qemu_clone(int flags, int (*callback)(void *), void *payload);
> +
> +/* Returns true if the given clone flags can be emulated with libc fork. */
> +static bool clone_flags_are_fork(unsigned int flags)
> +{
> + return flags == SIGCHLD;
> +}
> +
> +/* Returns true if the given clone flags can be emulated with
> pthread_create. */
> +static bool clone_flags_are_thread(unsigned int flags)
> +{
> + return flags == (
> + CLONE_VM | CLONE_FS | CLONE_FILES |
> + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM
> + );
> +}
This is fine in of itself but we seemed to have lost some context from
moving the flags from syscall.c.
> +
> +#endif /* CLONE_H */
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 97de9fb5c9..7ce021cea2 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -122,6 +122,7 @@
> #include "qapi/error.h"
> #include "fd-trans.h"
> #include "tcg/tcg.h"
> +#include "clone.h"
>
> #ifndef CLONE_IO
> #define CLONE_IO 0x80000000 /* Clone io context */
> @@ -135,12 +136,6 @@
> * * flags we can implement within QEMU itself
> * * flags we can't support and will return an error for
> */
> -/* For thread creation, all these flags must be present; for
> - * fork, none must be present.
> - */
> -#define CLONE_THREAD_FLAGS \
> - (CLONE_VM | CLONE_FS | CLONE_FILES | \
> - CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM)
>
> /* These flags are ignored:
> * CLONE_DETACHED is now ignored by the kernel;
> @@ -150,30 +145,10 @@
> (CLONE_DETACHED | CLONE_IO)
>
> /* Flags for fork which we can implement within QEMU itself */
> -#define CLONE_OPTIONAL_FORK_FLAGS \
> +#define CLONE_EMULATED_FLAGS \
> (CLONE_SETTLS | CLONE_PARENT_SETTID | \
> CLONE_CHILD_CLEARTID | CLONE_CHILD_SETTID)
>
> -/* Flags for thread creation which we can implement within QEMU itself */
> -#define CLONE_OPTIONAL_THREAD_FLAGS \
> - (CLONE_SETTLS | CLONE_PARENT_SETTID | \
> - CLONE_CHILD_CLEARTID | CLONE_CHILD_SETTID | CLONE_PARENT)
> -
> -#define CLONE_INVALID_FORK_FLAGS \
> - (~(CSIGNAL | CLONE_OPTIONAL_FORK_FLAGS | CLONE_IGNORED_FLAGS))
> -
> -#define CLONE_INVALID_THREAD_FLAGS \
> - (~(CSIGNAL | CLONE_THREAD_FLAGS | CLONE_OPTIONAL_THREAD_FLAGS | \
> - CLONE_IGNORED_FLAGS))
> -
> -/* CLONE_VFORK is special cased early in do_fork(). The other flag bits
> - * have almost all been allocated. We cannot support any of
> - * CLONE_NEWNS, CLONE_NEWCGROUP, CLONE_NEWUTS, CLONE_NEWIPC,
> - * CLONE_NEWUSER, CLONE_NEWPID, CLONE_NEWNET, CLONE_PTRACE, CLONE_UNTRACED.
> - * The checks against the invalid thread masks above will catch these.
> - * (The one remaining unallocated bit is 0x1000 which used to be CLONE_PID.)
> - */
> -
I think some of the above can be usefully moved to clone.h to discuss
the various clone/fork options QEMU can and can't support.
> /* Define DEBUG_ERESTARTSYS to force every syscall to be restarted
> * once. This exercises the codepaths for restart.
> */
> @@ -1104,7 +1079,7 @@ static inline rlim_t target_to_host_rlim(abi_ulong
> target_rlim)
> {
> abi_ulong target_rlim_swap;
> rlim_t result;
> -
> +
> target_rlim_swap = tswapal(target_rlim);
> if (target_rlim_swap == TARGET_RLIM_INFINITY)
> return RLIM_INFINITY;
> @@ -1112,7 +1087,7 @@ static inline rlim_t target_to_host_rlim(abi_ulong
> target_rlim)
> result = target_rlim_swap;
> if (target_rlim_swap != (rlim_t)result)
> return RLIM_INFINITY;
> -
> +
> return result;
> }
> #endif
> @@ -1122,13 +1097,13 @@ static inline abi_ulong host_to_target_rlim(rlim_t
> rlim)
> {
> abi_ulong target_rlim_swap;
> abi_ulong result;
> -
> +
> if (rlim == RLIM_INFINITY || rlim != (abi_long)rlim)
> target_rlim_swap = TARGET_RLIM_INFINITY;
> else
> target_rlim_swap = rlim;
> result = tswapal(target_rlim_swap);
> -
> +
> return result;
> }
> #endif
> @@ -1615,10 +1590,11 @@ static inline abi_long target_to_host_cmsg(struct
> msghdr *msgh,
> abi_ulong target_cmsg_addr;
> struct target_cmsghdr *target_cmsg, *target_cmsg_start;
> socklen_t space = 0;
> -
> +
> msg_controllen = tswapal(target_msgh->msg_controllen);
> - if (msg_controllen < sizeof (struct target_cmsghdr))
> + if (msg_controllen < sizeof(struct target_cmsghdr)) {
> goto the_end;
> + }
> target_cmsg_addr = tswapal(target_msgh->msg_control);
> target_cmsg = lock_user(VERIFY_READ, target_cmsg_addr, msg_controllen,
> 1);
> target_cmsg_start = target_cmsg;
> @@ -1703,8 +1679,9 @@ static inline abi_long host_to_target_cmsg(struct
> target_msghdr *target_msgh,
> socklen_t space = 0;
>
> msg_controllen = tswapal(target_msgh->msg_controllen);
> - if (msg_controllen < sizeof (struct target_cmsghdr))
> + if (msg_controllen < sizeof(struct target_cmsghdr)) {
> goto the_end;
> + }
Try and avoid un-related whitespace fixes in code that is otherwise unchanged.
> target_cmsg_addr = tswapal(target_msgh->msg_control);
> target_cmsg = lock_user(VERIFY_WRITE, target_cmsg_addr, msg_controllen,
> 0);
> target_cmsg_start = target_cmsg;
> @@ -5750,9 +5727,10 @@ abi_long do_set_thread_area(CPUX86State *env,
> abi_ulong ptr)
> }
> unlock_user_struct(target_ldt_info, ptr, 1);
>
> - if (ldt_info.entry_number < TARGET_GDT_ENTRY_TLS_MIN ||
> - ldt_info.entry_number > TARGET_GDT_ENTRY_TLS_MAX)
> - return -TARGET_EINVAL;
> + if (ldt_info.entry_number < TARGET_GDT_ENTRY_TLS_MIN ||
> + ldt_info.entry_number > TARGET_GDT_ENTRY_TLS_MAX) {
> + return -TARGET_EINVAL;
> + }
> seg_32bit = ldt_info.flags & 1;
> contents = (ldt_info.flags >> 1) & 3;
> read_exec_only = (ldt_info.flags >> 3) & 1;
> @@ -5828,7 +5806,7 @@ static abi_long do_get_thread_area(CPUX86State *env,
> abi_ulong ptr)
> lp = (uint32_t *)(gdt_table + idx);
> entry_1 = tswap32(lp[0]);
> entry_2 = tswap32(lp[1]);
> -
> +
> read_exec_only = ((entry_2 >> 9) & 1) ^ 1;
> contents = (entry_2 >> 10) & 3;
> seg_not_present = ((entry_2 >> 15) & 1) ^ 1;
> @@ -5844,8 +5822,8 @@ static abi_long do_get_thread_area(CPUX86State *env,
> abi_ulong ptr)
> (read_exec_only << 3) | (limit_in_pages << 4) |
> (seg_not_present << 5) | (useable << 6) | (lm << 7);
> limit = (entry_1 & 0xffff) | (entry_2 & 0xf0000);
> - base_addr = (entry_1 >> 16) |
> - (entry_2 & 0xff000000) |
> + base_addr = (entry_1 >> 16) |
> + (entry_2 & 0xff000000) |
> ((entry_2 & 0xff) << 16);
> target_ldt_info->base_addr = tswapal(base_addr);
> target_ldt_info->limit = tswap32(limit);
> @@ -5895,53 +5873,71 @@ abi_long do_arch_prctl(CPUX86State *env, int code,
> abi_ulong addr)
>
> #endif /* defined(TARGET_I386) */
>
> -#define NEW_STACK_SIZE 0x40000
> -
> -
> static pthread_mutex_t clone_lock = PTHREAD_MUTEX_INITIALIZER;
> typedef struct {
> - CPUArchState *env;
> + /* Used to synchronize thread/process creation between parent and child.
> */
> pthread_mutex_t mutex;
> pthread_cond_t cond;
> - pthread_t thread;
> - uint32_t tid;
> + /*
> + * Guest pointers for implementing CLONE_PARENT_SETTID
> + * and CLONE_CHILD_SETTID.
> + */
> abi_ulong child_tidptr;
> abi_ulong parent_tidptr;
> - sigset_t sigmask;
> -} new_thread_info;
> + struct {
> + sigset_t sigmask;
> + CPUArchState *env;
> + bool register_thread;
> + bool signal_setup;
> + } child;
> +} clone_info;
>
> -static void *clone_func(void *arg)
> +static int clone_run(void *arg)
> {
> - new_thread_info *info = arg;
> + clone_info *info = (clone_info *) arg;
> CPUArchState *env;
> CPUState *cpu;
> TaskState *ts;
> + uint32_t tid;
>
> - rcu_register_thread();
> - tcg_register_thread();
> - env = info->env;
> + if (info->child.register_thread) {
> + rcu_register_thread();
> + tcg_register_thread();
> + }
> +
> + env = info->child.env;
> cpu = env_cpu(env);
> thread_cpu = cpu;
> ts = (TaskState *)cpu->opaque;
> - info->tid = sys_gettid();
> + tid = sys_gettid();
> task_settid(ts);
> - if (info->child_tidptr)
> - put_user_u32(info->tid, info->child_tidptr);
> - if (info->parent_tidptr)
> - put_user_u32(info->tid, info->parent_tidptr);
> +
> qemu_guest_random_seed_thread_part2(cpu->random_seed);
> - /* Enable signals. */
> - sigprocmask(SIG_SETMASK, &info->sigmask, NULL);
> - /* Signal to the parent that we're ready. */
> - pthread_mutex_lock(&info->mutex);
> - pthread_cond_broadcast(&info->cond);
> - pthread_mutex_unlock(&info->mutex);
> - /* Wait until the parent has finished initializing the tls state. */
> - pthread_mutex_lock(&clone_lock);
> - pthread_mutex_unlock(&clone_lock);
> +
> + if (info->parent_tidptr) {
> + /*
> + * Even when memory is not shared, parent_tidptr is set before the
> + * process copy, so we need to set it in the child.
> + */
> + put_user_u32(tid, info->parent_tidptr);
> + }
> +
> + if (info->child_tidptr) {
> + put_user_u32(tid, info->child_tidptr);
> + }
> +
> + /* Enable signals. */
> + sigprocmask(SIG_SETMASK, &info->child.sigmask, NULL);
> +
> + if (info->child.signal_setup) {
> + pthread_mutex_lock(&info->mutex);
> + pthread_cond_broadcast(&info->cond);
> + pthread_mutex_unlock(&info->mutex);
> + }
> +
> cpu_loop(env);
> /* never exits */
> - return NULL;
> + _exit(1); /* avoid compiler warning. */
> }
>
> /* do_fork() Must return host values and target errnos (unlike most
> @@ -5951,139 +5947,131 @@ static int do_fork(CPUArchState *env, unsigned int
> flags, abi_ulong newsp,
> abi_ulong child_tidptr)
> {
> CPUState *cpu = env_cpu(env);
> - int ret;
> + int proc_flags, host_sig, ret;
> TaskState *ts;
> CPUState *new_cpu;
> - CPUArchState *new_env;
> - sigset_t sigmask;
> + sigset_t block_sigmask;
> + sigset_t orig_sigmask;
> + clone_info info;
> + TaskState *parent_ts = (TaskState *)cpu->opaque;
>
> - flags &= ~CLONE_IGNORED_FLAGS;
> + memset(&info, 0, sizeof(info));
> +
> + /*
> + * When cloning the actual subprocess, we don't need to worry about any
> + * flags that can be ignored, or emulated in QEMU. proc_flags holds only
> + * the flags that need to be passed to `clone` itself.
> + */
> + proc_flags = flags & ~(CLONE_EMULATED_FLAGS | CLONE_IGNORED_FLAGS);
> +
> + /*
> + * The exit signal is included in the flags. That signal needs to be
> mapped
> + * to the appropriate host signal, and we need to check if that signal is
> + * supported.
> + */
> + host_sig = target_to_host_signal(proc_flags & CSIGNAL);
> + if (host_sig > SIGRTMAX) {
> + qemu_log_mask(LOG_UNIMP,
> + "guest signal %d not supported for exit_signal",
> + proc_flags & CSIGNAL);
> + return -TARGET_EINVAL;
> + }
> + proc_flags = (proc_flags & ~CSIGNAL) | host_sig;
>
> /* Emulate vfork() with fork() */
> - if (flags & CLONE_VFORK)
> - flags &= ~(CLONE_VFORK | CLONE_VM);
> + if (proc_flags & CLONE_VFORK) {
> + proc_flags &= ~(CLONE_VFORK | CLONE_VM);
> + }
>
> - if (flags & CLONE_VM) {
> - TaskState *parent_ts = (TaskState *)cpu->opaque;
> - new_thread_info info;
> - pthread_attr_t attr;
> + if (!clone_flags_are_fork(proc_flags) &&
> + !clone_flags_are_thread(proc_flags)) {
> + qemu_log_mask(LOG_UNIMP, "unsupported clone flags");
> + return -TARGET_EINVAL;
> + }
>
> - if (((flags & CLONE_THREAD_FLAGS) != CLONE_THREAD_FLAGS) ||
> - (flags & CLONE_INVALID_THREAD_FLAGS)) {
> - return -TARGET_EINVAL;
> - }
> + pthread_mutex_init(&info.mutex, NULL);
> + pthread_mutex_lock(&info.mutex);
> + pthread_cond_init(&info.cond, NULL);
>
> - ts = g_new0(TaskState, 1);
> - init_task_state(ts);
> + ts = g_new0(TaskState, 1);
> + init_task_state(ts);
>
> - /* Grab a mutex so that thread setup appears atomic. */
> - pthread_mutex_lock(&clone_lock);
> + /* Guard CPU copy. It is not thread-safe. */
Why not - isn't that the point of the lock?
<snip>
> case TARGET_NR_setdomainname:
> if (!(p = lock_user_string(arg1)))
> @@ -10873,8 +10865,10 @@ static abi_long do_syscall1(void *cpu_env, int num,
> abi_long arg1,
> return get_errno(fchown(arg1, low2highuid(arg2), low2highgid(arg3)));
> #if defined(TARGET_NR_fchownat)
> case TARGET_NR_fchownat:
> - if (!(p = lock_user_string(arg2)))
> + p = lock_user_string(arg2)
> + if (!p) {
> return -TARGET_EFAULT;
> + }
This has dropped a ; killing the build
--
Alex Bennée