[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 4/4] target/arm: Convert PMULL.8 to gvec
From: Alex Bennée
Subject: Re: [PATCH 4/4] target/arm: Convert PMULL.8 to gvec
Date: Fri, 18 Oct 2019 18:54:34 +0100
User-agent: mu4e 1.3.5; emacs 27.0.50
Richard Henderson <address@hidden> writes:
> We still need two different helpers, since NEON and SVE2 get the
> inputs from different locations within the source vector. However,
> we can convert both to the same internal form for computation.
>
> The sve2 helper is not used yet, but adding it with this patch
> helps illustrate why the neon changes are helpful.
>
> Signed-off-by: Richard Henderson <address@hidden>
Reviewed-by: Alex Bennée <address@hidden>
Tested-by: Alex Bennée <address@hidden>
> ---
> target/arm/helper-sve.h | 2 ++
> target/arm/helper.h | 3 +-
> target/arm/neon_helper.c | 32 --------------------
> target/arm/translate-a64.c | 27 +++++++++++------
> target/arm/translate.c | 26 ++++++++---------
> target/arm/vec_helper.c | 60 ++++++++++++++++++++++++++++++++++++++
> 6 files changed, 95 insertions(+), 55 deletions(-)
>
> diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
> index 9e79182ab4..2f47279155 100644
> --- a/target/arm/helper-sve.h
> +++ b/target/arm/helper-sve.h
> @@ -1574,3 +1574,5 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG,
> void, env, ptr, ptr, ptr, tl, i32)
> DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG,
> void, env, ptr, ptr, ptr, tl, i32)
> +
> +DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> diff --git a/target/arm/helper.h b/target/arm/helper.h
> index d954399b7e..8a8517cf34 100644
> --- a/target/arm/helper.h
> +++ b/target/arm/helper.h
> @@ -335,7 +335,6 @@ DEF_HELPER_2(neon_sub_u8, i32, i32, i32)
> DEF_HELPER_2(neon_sub_u16, i32, i32, i32)
> DEF_HELPER_2(neon_mul_u8, i32, i32, i32)
> DEF_HELPER_2(neon_mul_u16, i32, i32, i32)
> -DEF_HELPER_2(neon_mull_p8, i64, i32, i32)
>
> DEF_HELPER_2(neon_tst_u8, i32, i32, i32)
> DEF_HELPER_2(neon_tst_u16, i32, i32, i32)
> @@ -688,6 +687,8 @@ DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>
> +DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +
> #ifdef TARGET_AARCH64
> #include "helper-a64.h"
> #include "helper-sve.h"
> diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
> index 6a107da0e1..c7a8438b42 100644
> --- a/target/arm/neon_helper.c
> +++ b/target/arm/neon_helper.c
> @@ -1129,38 +1129,6 @@ NEON_VOP(mul_u8, neon_u8, 4)
> NEON_VOP(mul_u16, neon_u16, 2)
> #undef NEON_FN
>
> -/* Polynomial multiplication is like integer multiplication except the
> - partial products are XORed, not added. */
> -uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
> -{
> - uint64_t result = 0;
> - uint64_t mask;
> - uint64_t op2ex = op2;
> - op2ex = (op2ex & 0xff) |
> - ((op2ex & 0xff00) << 8) |
> - ((op2ex & 0xff0000) << 16) |
> - ((op2ex & 0xff000000) << 24);
> - while (op1) {
> - mask = 0;
> - if (op1 & 1) {
> - mask |= 0xffff;
> - }
> - if (op1 & (1 << 8)) {
> - mask |= (0xffffU << 16);
> - }
> - if (op1 & (1 << 16)) {
> - mask |= (0xffffULL << 32);
> - }
> - if (op1 & (1 << 24)) {
> - mask |= (0xffffULL << 48);
> - }
> - result ^= op2ex & mask;
> - op1 = (op1 >> 1) & 0x7f7f7f7f;
> - op2ex <<= 1;
> - }
> - return result;
> -}
> -
> #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
> NEON_VOP(tst_u8, neon_u8, 4)
> NEON_VOP(tst_u16, neon_u16, 2)
> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index 12588d18df..2934e4fc16 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -10483,10 +10483,6 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
> gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
> tcg_passres, tcg_passres);
> break;
> - case 14: /* PMULL */
> - assert(size == 0);
> - gen_helper_neon_mull_p8(tcg_passres, tcg_op1, tcg_op2);
> - break;
> default:
> g_assert_not_reached();
> }
> @@ -10650,11 +10646,21 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
> handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
> break;
> case 14: /* PMULL, PMULL2 */
> - if (is_u || size == 1 || size == 2) {
> + if (is_u) {
> unallocated_encoding(s);
> return;
> }
> - if (size == 3) {
> + switch (size) {
> + case 0: /* PMULL.P8 */
> + if (!fp_access_check(s)) {
> + return;
> + }
> + /* The Q field specifies lo/hi half input for this insn. */
> + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
> + gen_helper_neon_pmull_h);
> + break;
> +
> + case 3: /* PMULL.P64 */
> if (!dc_isar_feature(aa64_pmull, s)) {
> unallocated_encoding(s);
> return;
> @@ -10665,9 +10671,13 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
> /* The Q field specifies lo/hi half input for this insn. */
> gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
> gen_helper_gvec_pmull_q);
> - return;
> + break;
> +
> + default:
> + unallocated_encoding(s);
> + break;
> }
> - goto is_widening;
> + return;
> case 9: /* SQDMLAL, SQDMLAL2 */
> case 11: /* SQDMLSL, SQDMLSL2 */
> case 13: /* SQDMULL, SQDMULL2 */
> @@ -10688,7 +10698,6 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
> unallocated_encoding(s);
> return;
> }
> - is_widening:
> if (!fp_access_check(s)) {
> return;
> }
> diff --git a/target/arm/translate.c b/target/arm/translate.c
> index 4e34249672..c3abf130cc 100644
> --- a/target/arm/translate.c
> +++ b/target/arm/translate.c
> @@ -5873,15 +5873,20 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
> return 1;
> }
>
> - /* Handle VMULL.P64 (Polynomial 64x64 to 128 bit multiply)
> - * outside the loop below as it only performs a single pass.
> - */
> - if (op == 14 && size == 2) {
> - if (!dc_isar_feature(aa32_pmull, s)) {
> - return 1;
> + /* Handle polynomial VMULL in a single pass. */
> + if (op == 14) {
> + if (size == 0) {
> + /* VMULL.P8 */
> + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
> + 0, gen_helper_neon_pmull_h);
> + } else {
> + /* VMULL.P64 */
> + if (!dc_isar_feature(aa32_pmull, s)) {
> + return 1;
> + }
> + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
> + 0, gen_helper_gvec_pmull_q);
> }
> - tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
> - 0, gen_helper_gvec_pmull_q);
> return 0;
> }
>
> @@ -5959,11 +5964,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
> /* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */
> gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
> break;
> - case 14: /* Polynomial VMULL */
> - gen_helper_neon_mull_p8(cpu_V0, tmp, tmp2);
> - tcg_temp_free_i32(tmp2);
> - tcg_temp_free_i32(tmp);
> - break;
> default: /* 15 is RESERVED: caught earlier */
> abort();
> }
> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
> index 5c1074374e..04b4d7402d 100644
> --- a/target/arm/vec_helper.c
> +++ b/target/arm/vec_helper.c
> @@ -1197,3 +1197,63 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
> }
> clear_tail(d, opr_sz, simd_maxsz(desc));
> }
> +
> +/*
> + * 8x8->16 polynomial multiply.
> + *
> + * The byte inputs are expanded to (or extracted from) half-words.
> + * Note that neon and sve2 get the inputs from different positions.
> + * This allows 4 bytes to be processed in parallel with uint64_t.
> + */
> +
> +static uint64_t expand_byte_to_half(uint64_t x)
> +{
> + return (x & 0x000000ff)
> + | ((x & 0x0000ff00) << 8)
> + | ((x & 0x00ff0000) << 16)
> + | ((x & 0xff000000) << 24);
> +}
> +
> +static uint64_t pmull_h(uint64_t op1, uint64_t op2)
> +{
> + uint64_t result = 0;
> + int i;
> +
> + for (i = 0; i < 8; ++i) {
> + uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
> + result ^= op2 & mask;
> + op1 >>= 1;
> + op2 <<= 1;
> + }
> + return result;
> +}
> +
> +void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
> +{
> + int hi = simd_data(desc);
> + uint64_t *d = vd, *n = vn, *m = vm;
> + uint64_t nn = n[hi], mm = m[hi];
> +
> + d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
> + nn >>= 32;
> + mm >>= 32;
> + d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
> +
> + clear_tail(d, 16, simd_maxsz(desc));
> +}
> +
> +#ifdef TARGET_AARCH64
> +void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
> +{
> + int shift = simd_data(desc) * 8;
> + intptr_t i, opr_sz = simd_oprsz(desc);
> + uint64_t *d = vd, *n = vn, *m = vm;
> +
> + for (i = 0; i < opr_sz / 8; ++i) {
> + uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
> + uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
> +
> + d[i] = pmull_h(nn, mm);
> + }
> +}
> +#endif
--
Alex Bennée
- [PATCH 0/4] target/arm vector improvements, Richard Henderson, 2019/10/17
- [PATCH 2/4] target/arm: Convert PMUL.8 to gvec, Richard Henderson, 2019/10/17
- [PATCH 1/4] target/arm: Vectorize USHL and SSHL, Richard Henderson, 2019/10/17
- [PATCH 3/4] target/arm: Convert PMULL.64 to gvec, Richard Henderson, 2019/10/17
- [PATCH 4/4] target/arm: Convert PMULL.8 to gvec, Richard Henderson, 2019/10/17
- Re: [PATCH 4/4] target/arm: Convert PMULL.8 to gvec,
Alex Bennée <=
- Re: [PATCH 0/4] target/arm vector improvements, no-reply, 2019/10/17
- Re: [PATCH 0/4] target/arm vector improvements, Alex Bennée, 2019/10/18