[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v3 25/34] tests: add atomic_add-bench
From: |
Alex Bennée |
Subject: |
Re: [Qemu-devel] [PATCH v3 25/34] tests: add atomic_add-bench |
Date: |
Wed, 14 Sep 2016 14:53:14 +0100 |
User-agent: |
mu4e 0.9.17; emacs 25.1.12 |
Richard Henderson <address@hidden> writes:
> From: "Emilio G. Cota" <address@hidden>
>
> With this microbenchmark we can measure the overhead of emulating atomic
> instructions with a configurable degree of contention.
>
> The benchmark spawns $n threads, each performing $o atomic ops (additions)
> in a loop. Each atomic operation is performed on a different cache line
> (assuming lines are 64b long) that is randomly selected from a range [0, $r).
>
> [ Note: each $foo corresponds to a -foo flag ]
>
> Signed-off-by: Emilio G. Cota <address@hidden>
> Signed-off-by: Richard Henderson <address@hidden>
> Message-Id: <address@hidden>
> ---
> tests/.gitignore | 1 +
> tests/Makefile.include | 4 +-
> tests/atomic_add-bench.c | 180
> +++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 184 insertions(+), 1 deletion(-)
> create mode 100644 tests/atomic_add-bench.c
>
> diff --git a/tests/.gitignore b/tests/.gitignore
> index dbb5263..ec3137a 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -1,3 +1,4 @@
> +atomic_add-bench
> check-qdict
> check-qfloat
> check-qint
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index 14be491..e1957ed 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -421,7 +421,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o
> tests/check-qdict.o \
> tests/test-opts-visitor.o tests/test-qmp-event.o \
> tests/rcutorture.o tests/test-rcu-list.o \
> tests/test-qdist.o \
> - tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o
> + tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
> + tests/atomic_add-bench.o
>
> $(test-obj-y): QEMU_INCLUDES += -Itests
> QEMU_CFLAGS += -I$(SRC_PATH)/tests
> @@ -465,6 +466,7 @@ tests/test-qdist$(EXESUF): tests/test-qdist.o
> $(test-util-obj-y)
> tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y)
> tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF)
> $(test-util-obj-y)
> tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
> +tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o
> $(test-util-obj-y)
This probably more properly lives in tests/tcg/generic or some such, but
that needs tcg/tests to be rehabilitated into the build system, so at
least here it gets built.
>
> tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
> hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
> diff --git a/tests/atomic_add-bench.c b/tests/atomic_add-bench.c
> new file mode 100644
> index 0000000..5bbecf6
> --- /dev/null
> +++ b/tests/atomic_add-bench.c
I wonder if it would be worth renaming this to atomic-bench and adding the
other atomic operations to the benchmark? I know that, given the current
helper overhead, it's unlikely to show much difference between the ops, but
if we move to backend support for the tcg atomics it would be a useful tool
to have.
> @@ -0,0 +1,180 @@
> +#include "qemu/osdep.h"
> +#include "qemu/thread.h"
> +#include "qemu/host-utils.h"
> +#include "qemu/processor.h"
> +
> +struct thread_info {
> + uint64_t r;
> +} QEMU_ALIGNED(64);
> +
> +struct count {
> + unsigned long val;
> +} QEMU_ALIGNED(64);
> +
> +static QemuThread *threads;
> +static struct thread_info *th_info;
> +static unsigned int n_threads = 1;
> +static unsigned int n_ready_threads;
> +static struct count *counts;
> +static unsigned long n_ops = 10000;
> +static double duration;
> +static unsigned int range = 1;
> +static bool test_start;
> +
> +static const char commands_string[] =
> + " -n = number of threads\n"
> + " -o = number of ops per thread\n"
> + " -r = range (will be rounded up to pow2)";
> +
> +static void usage_complete(char *argv[])
> +{
> + fprintf(stderr, "Usage: %s [options]\n", argv[0]);
> + fprintf(stderr, "options:\n%s\n", commands_string);
> +}
> +
> +/*
> + * From: https://en.wikipedia.org/wiki/Xorshift
> + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
> + * guaranteed to be >= INT_MAX).
> + */
> +static uint64_t xorshift64star(uint64_t x)
> +{
> + x ^= x >> 12; /* a */
> + x ^= x << 25; /* b */
> + x ^= x >> 27; /* c */
> + return x * UINT64_C(2685821657736338717);
> +}
> +
> +static void *thread_func(void *arg)
> +{
> + struct thread_info *info = arg;
> + unsigned long i;
> +
> + atomic_inc(&n_ready_threads);
> + while (!atomic_mb_read(&test_start)) {
> + cpu_relax();
> + }
> +
> + for (i = 0; i < n_ops; i++) {
> + unsigned int index;
> +
> + info->r = xorshift64star(info->r);
> + index = info->r & (range - 1);
> + atomic_inc(&counts[index].val);
> + }
> + return NULL;
> +}
> +
> +static inline
> +uint64_t ts_subtract(const struct timespec *a, const struct timespec *b)
> +{
> + uint64_t ns;
> +
> + ns = (b->tv_sec - a->tv_sec) * 1000000000ULL;
> + ns += (b->tv_nsec - a->tv_nsec);
> + return ns;
> +}
> +
> +static void run_test(void)
> +{
> + unsigned int i;
> + struct timespec ts_start, ts_end;
> +
> + while (atomic_read(&n_ready_threads) != n_threads) {
> + cpu_relax();
> + }
> + atomic_mb_set(&test_start, true);
> +
> + clock_gettime(CLOCK_MONOTONIC, &ts_start);
> + for (i = 0; i < n_threads; i++) {
> + qemu_thread_join(&threads[i]);
> + }
> + clock_gettime(CLOCK_MONOTONIC, &ts_end);
> + duration = ts_subtract(&ts_start, &ts_end) / 1e9;
> +}
> +
> +static void create_threads(void)
> +{
> + unsigned int i;
> +
> + threads = g_new(QemuThread, n_threads);
> + th_info = g_new(struct thread_info, n_threads);
> + counts = qemu_memalign(64, sizeof(*counts) * range);
This fails on my setup as AFAICT qemu_memalign doesn't give you zeroed
memory. I added a memset after to zero it out.
> +
> + for (i = 0; i < n_threads; i++) {
> + struct thread_info *info = &th_info[i];
> +
> + info->r = (i + 1) ^ time(NULL);
> + qemu_thread_create(&threads[i], NULL, thread_func, info,
> + QEMU_THREAD_JOINABLE);
> + }
> +}
> +
> +static void pr_params(void)
> +{
> + printf("Parameters:\n");
> + printf(" # of threads: %u\n", n_threads);
> + printf(" n_ops: %lu\n", n_ops);
> + printf(" ops' range: %u\n", range);
> +}
> +
> +static void pr_stats(void)
> +{
> + unsigned long long val = 0;
> + unsigned int i;
> + double tx;
> +
> + for (i = 0; i < range; i++) {
> + val += counts[i].val;
> + }
> + assert(val == n_threads * n_ops);
Again, while I was testing, this failed due to the above. It would probably
also be worth reporting the fail condition for the test, so my current
hacky patch looks like:
modified tests/atomic_add-bench.c
@@ -100,6 +100,7 @@ static void create_threads(void)
threads = g_new(QemuThread, n_threads);
th_info = g_new(struct thread_info, n_threads);
counts = qemu_memalign(64, sizeof(*counts) * range);
+ memset(counts, 0, sizeof(*counts) * range);
for (i = 0; i < n_threads; i++) {
struct thread_info *info = &th_info[i];
@@ -118,22 +119,29 @@ static void pr_params(void)
printf(" ops' range: %u\n", range);
}
-static void pr_stats(void)
+static int pr_stats(void)
{
- unsigned long long val = 0;
+ unsigned long long target_val, val = 0;
unsigned int i;
double tx;
for (i = 0; i < range; i++) {
val += counts[i].val;
}
- assert(val == n_threads * n_ops);
+
+ target_val = (n_threads * n_ops);
+ if (val != target_val) {
+ printf("Bad total: %llu vs %llu\n", val, target_val);
+ return -1;
+ };
tx = val / duration / 1e6;
printf("Results:\n");
printf("Duration: %.2f s\n", duration);
printf(" Throughput: %.2f Mops/s\n", tx);
printf(" Throughput/thread: %.2f Mops/s/thread\n", tx / n_threads);
+
+ return 0;
}
static void parse_args(int argc, char *argv[])
@@ -175,6 +183,5 @@ int main(int argc, char *argv[])
pr_params();
create_threads();
run_test();
- pr_stats();
- return 0;
+ return pr_stats();
}
--
Alex Bennée
- Re: [Qemu-devel] [PATCH v3 13/34] tcg: Add atomic helpers, (continued)
[Qemu-devel] [PATCH v3 19/34] target-i386: emulate LOCK'ed NOT using atomic helper, Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 15/34] tcg: Add CONFIG_ATOMIC64, Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 24/34] target-i386: remove helper_lock(), Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 21/34] target-i386: emulate LOCK'ed XADD using atomic helper, Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 25/34] tests: add atomic_add-bench, Richard Henderson, 2016/09/03
- Re: [Qemu-devel] [PATCH v3 25/34] tests: add atomic_add-bench,
Alex Bennée <=
[Qemu-devel] [PATCH v3 23/34] target-i386: emulate XCHG using atomic helper, Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 20/34] target-i386: emulate LOCK'ed NEG using cmpxchg helper, Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 22/34] target-i386: emulate LOCK'ed BTX ops using atomic helpers, Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 27/34] target-arm: emulate LL/SC using cmpxchg helpers, Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 26/34] target-arm: Rearrange aa32 load and store functions, Richard Henderson, 2016/09/03
[Qemu-devel] [PATCH v3 32/34] target-arm: remove EXCP_STREX + cpu_exclusive_{test, info}, Richard Henderson, 2016/09/03