[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH v3 45/81] target/arm: Implement SVE2 HISTCNT, HISTSEG
From: |
Richard Henderson |
Subject: |
Re: [PATCH v3 45/81] target/arm: Implement SVE2 HISTCNT, HISTSEG |
Date: |
Fri, 9 Oct 2020 07:35:05 -0500 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.10.0 |
On 10/9/20 1:13 AM, LIU Zhiwei wrote:
>
>
> On 2020/9/19 2:37, Richard Henderson wrote:
>> From: Stephen Long <steplong@quicinc.com>
>>
>> Signed-off-by: Stephen Long <steplong@quicinc.com>
>> Message-Id: <20200416173109.8856-1-steplong@quicinc.com>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>> v2: Fix overlap between output and input vectors.
>> ---
>> target/arm/helper-sve.h | 7 +++
>> target/arm/sve.decode | 6 ++
>> target/arm/sve_helper.c | 124 +++++++++++++++++++++++++++++++++++++
>> target/arm/translate-sve.c | 19 ++++++
>> 4 files changed, 156 insertions(+)
>>
>> diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
>> index 9e8641e1c0..34bbb767ef 100644
>> --- a/target/arm/helper-sve.h
>> +++ b/target/arm/helper-sve.h
>> @@ -2551,6 +2551,13 @@ DEF_HELPER_FLAGS_5(sve2_nmatch_ppzz_b,
>> TCG_CALL_NO_RWG,
>> DEF_HELPER_FLAGS_5(sve2_nmatch_ppzz_h, TCG_CALL_NO_RWG,
>> i32, ptr, ptr, ptr, ptr, i32)
>> +DEF_HELPER_FLAGS_5(sve2_histcnt_s, TCG_CALL_NO_RWG,
>> + void, ptr, ptr, ptr, ptr, i32)
>> +DEF_HELPER_FLAGS_5(sve2_histcnt_d, TCG_CALL_NO_RWG,
>> + void, ptr, ptr, ptr, ptr, i32)
>> +
>> +DEF_HELPER_FLAGS_4(sve2_histseg, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>> +
>> DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_h, TCG_CALL_NO_RWG,
>> void, ptr, ptr, ptr, ptr, ptr, i32)
>> DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_s, TCG_CALL_NO_RWG,
>> diff --git a/target/arm/sve.decode b/target/arm/sve.decode
>> index 3121eabbf8..0edb72d4fb 100644
>> --- a/target/arm/sve.decode
>> +++ b/target/arm/sve.decode
>> @@ -146,6 +146,7 @@
>> &rprrr_esz rn=%reg_movprfx
>> @rdn_pg_rm_ra ........ esz:2 . ra:5 ... pg:3 rm:5 rd:5 \
>> &rprrr_esz rn=%reg_movprfx
>> +@rd_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5 &rprr_esz
>> # One register operand, with governing predicate, vector element size
>> @rd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz
>> @@ -1336,6 +1337,11 @@ RSUBHNT 01000101 .. 1 ..... 011 111 .....
>> ..... @rd_rn_rm
>> MATCH 01000101 .. 1 ..... 100 ... ..... 0 .... @pd_pg_rn_rm
>> NMATCH 01000101 .. 1 ..... 100 ... ..... 1 .... @pd_pg_rn_rm
>> +### SVE2 Histogram Computation
>> +
>> +HISTCNT 01000101 .. 1 ..... 110 ... ..... ..... @rd_pg_rn_rm
>> +HISTSEG 01000101 .. 1 ..... 101 000 ..... ..... @rd_rn_rm
>> +
>> ## SVE2 floating-point pairwise operations
>> FADDP 01100100 .. 010 00 0 100 ... ..... ..... @rdn_pg_rm
>> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
>> index 19fbf94189..fa4848bc5c 100644
>> --- a/target/arm/sve_helper.c
>> +++ b/target/arm/sve_helper.c
>> @@ -7095,3 +7095,127 @@ DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
>> DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
>> #undef DO_PPZZ_MATCH
>> +
>> +void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
>> + uint32_t desc)
>> +{
>> + ARMVectorReg scratch;
>> + intptr_t i, j;
>> + intptr_t opr_sz = simd_oprsz(desc);
>> + uint32_t *d = vd, *n = vn, *m = vm;
>> + uint8_t *pg = vg;
>> +
>> + if (d == n) {
>> + n = memcpy(&scratch, n, opr_sz);
>> + if (d == m) {
>> + m = n;
>> + }
>> + } else if (d == m) {
>> + m = memcpy(&scratch, m, opr_sz);
>> + }
>> +
>> + for (i = 0; i < opr_sz; i += 4) {
>> + uint64_t count = 0;
>> + uint8_t pred;
>> +
>> + pred = pg[H1(i >> 3)] >> (i & 7);
>> + if (pred & 1) {
>> + uint32_t nn = n[H4(i >> 2)];
>> +
>> + for (j = 0; j <= i; j += 4) {
>> + pred = pg[H1(j >> 3)] >> (j & 7);
>> + if ((pred & 1) && nn == m[H4(j >> 2)]) {
>> + ++count;
>> + }
>> + }
>> + }
>> + d[H4(i >> 2)] = count;
>> + }
>> +}
>> +
>> +void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
>> + uint32_t desc)
>> +{
>> + ARMVectorReg scratch;
>> + intptr_t i, j;
>> + intptr_t opr_sz = simd_oprsz(desc);
>> + uint64_t *d = vd, *n = vn, *m = vm;
>> + uint8_t *pg = vg;
>> +
>> + if (d == n) {
>> + n = memcpy(&scratch, n, opr_sz);
>> + if (d == m) {
>> + m = n;
>> + }
>> + } else if (d == m) {
>> + m = memcpy(&scratch, m, opr_sz);
>> + }
>> +
>> + for (i = 0; i < opr_sz / 8; ++i) {
>> + uint64_t count = 0;
>> + if (pg[H1(i)] & 1) {
>> + uint64_t nn = n[i];
>> + for (j = 0; j <= i; ++j) {
>> + if ((pg[H1(j)] & 1) && nn == m[j]) {
>> + ++count;
>> + }
>> + }
>> + }
>> + d[i] = count;
>> + }
>> +}
>> +
>> +/*
>> + * Returns the number of bytes in m0 and m1 that match n.
>> + * See comment for do_match2().
>> + * */
>> +static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
>> +{
>> + int esz = MO_8;
>> + int bits = 8 << esz;
>> + uint64_t ones = dup_const(esz, 1);
>> + uint64_t signs = ones << (bits - 1);
>> + uint64_t cmp0, cmp1;
>> +
>> + cmp1 = dup_const(esz, n);
>> + cmp0 = cmp1 ^ m0;
>> + cmp1 = cmp1 ^ m1;
>> + cmp0 = (cmp0 - ones) & ~cmp0 & signs;
>> + cmp1 = (cmp1 - ones) & ~cmp1 & signs;
>> +
> Hi Richard,
>
> Although we can detect the presence of a zero byte with this method, we
> can't use it to count the zero bytes.
>
> For example,
> IF
> cmp1 = 0x0100010001000100, ones = 0x0101010101010101,
> signs = 0x8080808080808080,
> THEN
> cmp1 = (cmp1 - ones) & ~cmp1 & signs = 0x8080808080808080
> So
> this method reports 8 zero bytes in cmp1, when in fact cmp1 has only
> 4 zero bytes. (The borrow out of each zero byte propagates into the
> adjacent 0x01 byte, which is then falsely flagged as zero.)
>
> I haven't found a "bit twiddling" way to do this; if you find one, please let me know.
Thanks for noticing the error. We already have a bit-twiddling example of this
in QEMU, in target/alpha:
uint64_t helper_cmpbe0(uint64_t a)
{
uint64_t m = 0x7f7f7f7f7f7f7f7fULL;
uint64_t c = ~(((a & m) + m) | a | m);
...
which produces the exact results that we need here.
r~