From: Alex Bennée
Subject: Re: [Qemu-devel] [RFC v3 PATCH 2/5] cpus-common: Cache allocated work items
Date: Tue, 05 Sep 2017 13:28:43 +0100
User-agent: mu4e 0.9.19; emacs 25.2.50.3
Pranith Kumar <address@hidden> writes:
> Using heaptrack, I found that quite a few of our temporary allocations
> are coming from allocating work items. Instead of doing this
> continuously, we can cache the allocated items and reuse them instead
> of freeing them.
>
> Stats from an ARM64 guest (boot+shutdown):
>
> heaptrack stats(before):
> allocations: 1471317
> leaked allocations: 73824
> temporary allocations: 651293
>
> heaptrack stats(after):
> allocations: 1143130
> leaked allocations: 73693
> temporary allocations: 487342
>
> The speedup is minor and within error margins; however, I think the patch is
> still worthwhile. We can also explore atomics instead of taking a lock for
> the work item pool.
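On the atomics idea: the pool is a simple LIFO free list, so the natural
lock-free replacement for qemu_wi_pool_lock would be a Treiber stack. A
rough sketch with C11 atomics (illustrative names only, and it ignores the
ABA problem that a real implementation would need to handle, e.g. with a
tagged pointer):

#include <stdatomic.h>
#include <stddef.h>

typedef struct work_item {
    struct work_item *next;
    /* ... payload fields as in qemu_work_item ... */
} work_item;

static _Atomic(work_item *) pool_head;

static void pool_push(work_item *wi)
{
    work_item *old = atomic_load(&pool_head);
    do {
        wi->next = old;
        /* on failure the CAS reloads 'old' with the current head */
    } while (!atomic_compare_exchange_weak(&pool_head, &old, wi));
}

static work_item *pool_pop(void)
{
    work_item *old = atomic_load(&pool_head);
    /* on failure the CAS reloads 'old'; the loop re-checks for empty */
    while (old && !atomic_compare_exchange_weak(&pool_head, &old, old->next)) {
    }
    return old;
}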
When we were doing the original MTTCG work I looked at using GArray for
the work queue, see:
http://lists.gnu.org/archive/html/qemu-devel/2016-08/msg00367.html
specifically:
Subject: [PATCH v5 13/13] cpu-exec: replace cpu->queued_work with GArray
Date: Tue, 2 Aug 2016 18:27:44 +0100
Message-Id: <address@hidden>
which I personally think might yield better results than messing around
with custom allocators, GSlice and the like. You still get the dynamic
sizing of a malloc-based array, and operations like insertion and
iterating through the work queue should be cache friendly.
Once the array has (transparently) reached a reasonable size to service
all allocations in the usual servicing period, the same memory can be
used over and over again ;-)
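To make that concrete, a GArray-backed queue could look roughly like the
sketch below (illustrative only, not the patch linked above: it assumes
QEMU's run_on_cpu_func/run_on_cpu_data types and elides the locking):

#include <glib.h>
#include <stdbool.h>

typedef struct {
    run_on_cpu_func func;
    run_on_cpu_data data;
    bool free, exclusive, done;
} work_item;

static GArray *queued_work;

static void queue_init(void)
{
    /* zero-terminated = FALSE, clear new elements = TRUE */
    queued_work = g_array_new(FALSE, TRUE, sizeof(work_item));
}

static void queue_push(work_item *wi)
{
    g_array_append_val(queued_work, *wi);   /* copies the item by value */
}

static void queue_drain(CPUState *cpu)
{
    for (guint i = 0; i < queued_work->len; i++) {
        work_item *wi = &g_array_index(queued_work, work_item, i);
        wi->func(cpu, wi->data);
    }
    g_array_set_size(queued_work, 0);   /* reset length, keep the buffer */
}

The g_array_set_size(queued_work, 0) at the end of a drain is the
important bit: it resets the length without freeing the backing store, so
once the array has grown to its steady-state size no further allocations
happen at all.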
My fondness for arrays is informed by comments by Bjarne Stroustrup:
https://www.youtube.com/watch?v=YQs6IC-vgmo
Obviously this patch would need to be re-worked given how much the code
has changed since it was merged.
>
> Signed-off-by: Pranith Kumar <address@hidden>
> ---
> cpus-common.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 60 insertions(+), 15 deletions(-)
>
> diff --git a/cpus-common.c b/cpus-common.c
> index 59f751ecf9..ccf5f50e4e 100644
> --- a/cpus-common.c
> +++ b/cpus-common.c
> @@ -24,6 +24,7 @@
> #include "sysemu/cpus.h"
>
> static QemuMutex qemu_cpu_list_lock;
> +static QemuMutex qemu_wi_pool_lock;
> static QemuCond exclusive_cond;
> static QemuCond exclusive_resume;
> static QemuCond qemu_work_cond;
> @@ -33,6 +34,49 @@ static QemuCond qemu_work_cond;
> */
> static int pending_cpus;
>
> +typedef struct qemu_work_item {
> + struct qemu_work_item *next;
> + run_on_cpu_func func;
> + run_on_cpu_data data;
> + bool free, exclusive, done;
> +} qemu_work_item;
> +
> +typedef struct qemu_wi_pool {
> + qemu_work_item *head;
> + int num_items;
> +} qemu_wi_pool;
> +
> +qemu_wi_pool *wi_free_pool;
> +
> +static void qemu_init_workitem_pool(void)
> +{
> + wi_free_pool = g_malloc0(sizeof(qemu_wi_pool));
> +}
> +
> +static void qemu_wi_pool_insert(qemu_work_item *item)
> +{
> + qemu_mutex_lock(&qemu_wi_pool_lock);
> + qemu_work_item *curr = atomic_read(&wi_free_pool->head);
> + item->next = curr;
> + wi_free_pool->head = item;
> + qemu_mutex_unlock(&qemu_wi_pool_lock);
> +}
> +
> +static qemu_work_item *qemu_wi_pool_remove(void)
> +{
> + qemu_mutex_lock(&qemu_wi_pool_lock);
> + qemu_work_item *curr = atomic_read(&wi_free_pool->head);
> + if (curr == NULL) {
> + goto out;
> + }
> + wi_free_pool->head = curr->next;
> + curr->next = NULL;
> +
> + out:
> + qemu_mutex_unlock(&qemu_wi_pool_lock);
> + return curr;
> +}
> +
> void qemu_init_cpu_list(void)
> {
> /* This is needed because qemu_init_cpu_list is also called by the
> @@ -43,6 +87,9 @@ void qemu_init_cpu_list(void)
> qemu_cond_init(&exclusive_cond);
> qemu_cond_init(&exclusive_resume);
> qemu_cond_init(&qemu_work_cond);
> +
> + qemu_init_workitem_pool();
> + qemu_mutex_init(&qemu_wi_pool_lock);
> }
>
> void cpu_list_lock(void)
> @@ -106,14 +153,7 @@ void cpu_list_remove(CPUState *cpu)
> qemu_mutex_unlock(&qemu_cpu_list_lock);
> }
>
> -struct qemu_work_item {
> - struct qemu_work_item *next;
> - run_on_cpu_func func;
> - run_on_cpu_data data;
> - bool free, exclusive, done;
> -};
> -
> -static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
> +static void queue_work_on_cpu(CPUState *cpu, qemu_work_item *wi)
> {
> qemu_mutex_lock(&cpu->work_mutex);
> if (cpu->queued_work_first == NULL) {
> @@ -132,7 +172,7 @@ static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
> void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
> QemuMutex *mutex)
> {
> - struct qemu_work_item wi;
> + qemu_work_item wi;
>
> if (qemu_cpu_is_self(cpu)) {
> func(cpu, data);
> @@ -156,9 +196,11 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
>
> void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
> {
> - struct qemu_work_item *wi;
> + qemu_work_item *wi = qemu_wi_pool_remove();
>
> - wi = g_malloc0(sizeof(struct qemu_work_item));
> + if (!wi) {
> + wi = g_malloc0(sizeof(qemu_work_item));
> + }
> wi->func = func;
> wi->data = data;
> wi->free = true;
> @@ -299,9 +341,11 @@ void cpu_exec_end(CPUState *cpu)
> void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func,
> run_on_cpu_data data)
> {
> - struct qemu_work_item *wi;
> + qemu_work_item *wi = qemu_wi_pool_remove();
>
> - wi = g_malloc0(sizeof(struct qemu_work_item));
> + if (!wi) {
> + wi = g_malloc0(sizeof(qemu_work_item));
> + }
> wi->func = func;
> wi->data = data;
> wi->free = true;
> @@ -312,7 +356,7 @@ void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func,
>
> void process_queued_cpu_work(CPUState *cpu)
> {
> - struct qemu_work_item *wi;
> + qemu_work_item *wi;
>
> if (cpu->queued_work_first == NULL) {
> return;
> @@ -343,7 +387,8 @@ void process_queued_cpu_work(CPUState *cpu)
> }
> qemu_mutex_lock(&cpu->work_mutex);
> if (wi->free) {
> - g_free(wi);
> + memset(wi, 0, sizeof(qemu_work_item));
> + qemu_wi_pool_insert(wi);
> } else {
> atomic_mb_set(&wi->done, true);
> }
--
Alex Bennée