qemu-arm
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-arm] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking


From: Peter Maydell
Subject: Re: [Qemu-arm] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking
Date: Tue, 5 Apr 2016 15:36:43 +0100

On 4 April 2016 at 14:39,  <address@hidden> wrote:
> From: Vijay <address@hidden>
>
> Use Neon instructions to perform zero checking of
> buffer. This is helps in reducing downtime during
> live migration.
>
> Signed-off-by: Vijaya Kumar K <address@hidden>
> ---
>  util/cutils.c |   81 
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 81 insertions(+)
>
> diff --git a/util/cutils.c b/util/cutils.c
> index 43d1afb..d343b9a 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -352,6 +352,87 @@ static void 
> *can_use_buffer_find_nonzero_offset_ifunc(void)
>      return func;
>  }
>  #pragma GCC pop_options
> +
> +#elif defined __aarch64__
> +#include "arm_neon.h"

Can we rely on all compilers having this, or do we need to
test in configure?

> +
> +#define NEON_VECTYPE               uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2)    vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
> +#define NEON_ORR(v1, v2)           vorrq_u64(v1, v2)
> +#define NEON_EQ_ZERO(v1) \
> +        ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
> +         (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)

The intrinsics are a bit confusing, but shouldn't we be
testing that both lanes of v1 are 0, rather than whether
either of them is? (so "&&", not "||").

> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
> +
> +/*
> + * Zero page/buffer checking using SIMD(Neon)
> + */
> +
> +static bool
> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
> +                   * sizeof(NEON_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
> +}
> +
> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    size_t i;
> +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
> +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
> +    uint64_t const *data = buf;
> +
> +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
> +    len /= sizeof(unsigned long);
> +
> +    for (i = 0; i < len; i += 32) {
> +        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +        d4 = NEON_ORR(d0, d1);
> +        d5 = NEON_ORR(d2, d3);
> +        d6 = NEON_ORR(d4, d5);
> +
> +        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
> +        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
> +        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
> +        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
> +        d11 = NEON_ORR(d7, d8);
> +        d12 = NEON_ORR(d9, d10);
> +        d13 = NEON_ORR(d11, d12);
> +
> +        d14 = NEON_ORR(d6, d13);
> +        if (NEON_EQ_ZERO(d14)) {
> +            break;
> +        }
> +    }

Both the other optimised find_nonzero implementations in this
file have two loops, not just one. Is it OK that this
implementation has only a single loop?

Paolo: do you know why we have two loops in the other
implementations?

> +
> +    return i * sizeof(unsigned long);
> +}
> +
> +static inline bool neon_support(void)
> +{
> +    /*
> +     * Check if neon feature is supported.
> +     * By default neon is supported for aarch64.
> +     */
> +    return true;
> +}

There doesn't seem much point in this. We can assume Neon exists
on any CPU we're going to run on (it's part of the ABI, the kernel
assumes it, etc etc). So you can just implement the functions without
the indirection functions below.

> +
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, 
> len) :
> +           can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
> +           buffer_find_nonzero_offset_inner(buf, len);
> +}
>  #else
>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>  {
> --

thanks
-- PMM



reply via email to

[Prev in Thread] Current Thread [Next in Thread]