[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization
From: |
Paolo Bonzini |
Subject: |
Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization |
Date: |
Thu, 12 Nov 2015 11:08:51 +0100 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.3.0 |
On 10/11/2015 03:51, Liang Li wrote:
> buffer_find_nonzero_offset() is a hot function during live migration.
> Currently it uses SSE2 instructions for optimization. On platforms that
> support AVX2 instructions, using AVX2 instead can improve performance
> by about 30% compared to SSE2.
> The zero page check is faster with this optimization; test results
> show that for an idle guest with 8GB of RAM, this patch shortens
> the total live migration time by about 6%.
>
> This patch uses the ifunc mechanism to select the proper function at
> run time: on platforms that support AVX2, execute the AVX2 instructions;
> otherwise, execute the original code.
>
> Signed-off-by: Liang Li <address@hidden>
> ---
> include/qemu-common.h | 28 +++++++++++++++------
> util/Makefile.objs | 2 ++
> util/avx2.c | 69
> +++++++++++++++++++++++++++++++++++++++++++++++++++
> util/cutils.c | 53 +++++++++++++++++++++++++++++++++++++--
> 4 files changed, 143 insertions(+), 9 deletions(-)
> create mode 100644 util/avx2.c
>
> diff --git a/include/qemu-common.h b/include/qemu-common.h
> index 2f74540..9fa7501 100644
> --- a/include/qemu-common.h
> +++ b/include/qemu-common.h
> @@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp, const char
> *prefix, size_t size);
> #endif
>
> #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
> -static inline bool
> -can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> -{
> - return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> - * sizeof(VECTYPE)) == 0
> - && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
> -}
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
> +
> size_t buffer_find_nonzero_offset(const void *buf, size_t len);
>
> +extern bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
> +
> +extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
> +
> +extern bool
> +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len);
> +
> +extern size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len);
> +
> +__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function");
> +__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function");
> +
> +
> +void *can_use_buffer_find_nonzero_offset_ifunc(void) \
> + __asm__("can_use_buffer_find_nonzero_offset");
> +
> +void *buffer_find_nonzero_offset_ifunc(void) \
> + __asm__("buffer_find_nonzero_offset");
> /*
> * helper to parse debug environment variables
> */
> diff --git a/util/Makefile.objs b/util/Makefile.objs
> index d7cc399..6aacad7 100644
> --- a/util/Makefile.objs
> +++ b/util/Makefile.objs
> @@ -1,4 +1,5 @@
> util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
> +util-obj-y += avx2.o
> util-obj-$(CONFIG_POSIX) += compatfd.o
> util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
> util-obj-$(CONFIG_POSIX) += mmap-alloc.o
> @@ -29,3 +30,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o
> qemu-coroutine-io.o
> util-obj-y += qemu-coroutine-sleep.o
> util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
> util-obj-y += buffer.o
> +avx2.o-cflags := $(AVX2_CFLAGS)
> diff --git a/util/avx2.c b/util/avx2.c
> new file mode 100644
> index 0000000..0e6915a
> --- /dev/null
> +++ b/util/avx2.c
> @@ -0,0 +1,69 @@
> +#include "qemu-common.h"
> +
> +#ifdef __AVX2__
> +#include <immintrin.h>
> +#define AVX2_VECTYPE __m256i
> +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p))
> +#define AVX2_ALL_EQ(v1, v2) \
> + (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
> +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> +
> +inline bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> + * sizeof(AVX2_VECTYPE)) == 0
> + && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
> +}
> +
> +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> + const AVX2_VECTYPE *p = buf;
> + const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
> + size_t i;
> +
> + assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
> +
> + if (!len) {
> + return 0;
> + }
> +
> + for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
> + if (!AVX2_ALL_EQ(p[i], zero)) {
> + return i * sizeof(AVX2_VECTYPE);
> + }
> + }
> +
> + for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
> + i < len / sizeof(AVX2_VECTYPE);
> + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
> + AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
> + AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
> + AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
> + AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
> + AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
> + AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
> + if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
> + break;
> + }
> + }
> +
> + return i * sizeof(AVX2_VECTYPE);
> +}
> +
> +#else
> +/* use the original functions if avx2 is not enabled when building */
> +
> +inline bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> + return can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> + return buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +#endif
> +
> diff --git a/util/cutils.c b/util/cutils.c
> index cfeb848..cd478ce 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -26,6 +26,7 @@
> #include <math.h>
> #include <limits.h>
> #include <errno.h>
> +#include <cpuid.h>
>
> #include "qemu/sockets.h"
> #include "qemu/iov.h"
> @@ -161,6 +162,54 @@ int qemu_fdatasync(int fd)
> #endif
> }
>
> +/* old compilers may not define bit_AVX2 */
> +#ifndef bit_AVX2
> +#define bit_AVX2 (1 << 5)
> +#endif
> +
> +static inline bool avx2_support(void)
> +{
> + int a, b, c, d;
> +
> + if (__get_cpuid_max(0, NULL) < 7) {
> + printf("max cpuid < 7\n");
> + return false;
> + }
> +
> + __cpuid_count(7, 0, a, b, c, d);
> + printf("b = %x\n", b);
> + return b & bit_AVX2;
> +}
> +
> +void *buffer_find_nonzero_offset_ifunc(void)
> +{
> + printf("deciding %s\n", __func__);
> +
> + typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
> + buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
> +
> + return func;
> +}
> +
> +void *can_use_buffer_find_nonzero_offset_ifunc(void)
> +{
> + printf("deciding %s\n", __func__);
> +
> + typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
> + can_use_buffer_find_nonzero_offset_avx2 :
> + can_use_buffer_find_nonzero_offset_inner;
> +
> + return func;
> +}
> +
> +inline bool
> +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
> +{
> + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> + * sizeof(VECTYPE)) == 0
> + && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
> +}
> +
> /*
> * Searches for an area with non-zero content in a buffer
> *
> @@ -181,13 +230,13 @@ int qemu_fdatasync(int fd)
> * If the buffer is all zero the return value is equal to len.
> */
>
> -size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
> {
> const VECTYPE *p = buf;
> const VECTYPE zero = (VECTYPE){0};
> size_t i;
>
> - assert(can_use_buffer_find_nonzero_offset(buf, len));
> + assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
>
> if (!len) {
> return 0;
>
The main issue here is that you are not testing whether the compiler
supports gnu_indirect_function.
I suggest that you start by moving the functions to util/buffer-zero.c.
Then the structure should be something like:
#ifdef CONFIG_HAVE_AVX2
#include <immintrin.h>
#endif
... define buffer_find_nonzero_offset_inner ...
... define can_use_buffer_find_nonzero_offset_inner ...
#if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2
... define buffer_find_nonzero_offset_avx2 ...
... define can_use_buffer_find_nonzero_offset_avx2 ...
... define the indirect functions ...
#else
... define buffer_find_nonzero_offset that just calls
buffer_find_nonzero_offset_inner ...
... define can_use_buffer_find_nonzero_offset that just calls
can_use_buffer_find_nonzero_offset_inner ...
#endif
Thanks,
Paolo
- [Qemu-devel] [v2 0/2] add avx2 instruction optimization, Liang Li, 2015/11/09
- [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization, Liang Li, 2015/11/09
- Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization,
Paolo Bonzini <=
- Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization, Li, Liang Z, 2015/11/12
- Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization, Juan Quintela, 2015/11/12
- Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization, Li, Liang Z, 2015/11/12
- Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization, Paolo Bonzini, 2015/11/13
- Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization, Richard Henderson, 2015/11/12
- [Qemu-devel] [v2 2/2] configure: add options to config avx2, Liang Li, 2015/11/09
- Re: [Qemu-devel] [v2 0/2] add avx2 instruction optimization, Eric Blake, 2015/11/09