qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v3 45/81] target/arm: Implement SVE2 HISTCNT, HISTSEG


From: Richard Henderson
Subject: Re: [PATCH v3 45/81] target/arm: Implement SVE2 HISTCNT, HISTSEG
Date: Fri, 9 Oct 2020 07:35:05 -0500
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.10.0

On 10/9/20 1:13 AM, LIU Zhiwei wrote:
> 
> 
> On 2020/9/19 2:37, Richard Henderson wrote:
>> From: Stephen Long <steplong@quicinc.com>
>>
>> Signed-off-by: Stephen Long <steplong@quicinc.com>
>> Message-Id: <20200416173109.8856-1-steplong@quicinc.com>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>> v2: Fix overlap between output and input vectors.
>> ---
>>   target/arm/helper-sve.h    |   7 +++
>>   target/arm/sve.decode      |   6 ++
>>   target/arm/sve_helper.c    | 124 +++++++++++++++++++++++++++++++++++++
>>   target/arm/translate-sve.c |  19 ++++++
>>   4 files changed, 156 insertions(+)
>>
>> diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
>> index 9e8641e1c0..34bbb767ef 100644
>> --- a/target/arm/helper-sve.h
>> +++ b/target/arm/helper-sve.h
>> @@ -2551,6 +2551,13 @@ DEF_HELPER_FLAGS_5(sve2_nmatch_ppzz_b, TCG_CALL_NO_RWG,
>>   DEF_HELPER_FLAGS_5(sve2_nmatch_ppzz_h, TCG_CALL_NO_RWG,
>>                      i32, ptr, ptr, ptr, ptr, i32)
>>   +DEF_HELPER_FLAGS_5(sve2_histcnt_s, TCG_CALL_NO_RWG,
>> +                   void, ptr, ptr, ptr, ptr, i32)
>> +DEF_HELPER_FLAGS_5(sve2_histcnt_d, TCG_CALL_NO_RWG,
>> +                   void, ptr, ptr, ptr, ptr, i32)
>> +
>> +DEF_HELPER_FLAGS_4(sve2_histseg, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>> +
>>   DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_h, TCG_CALL_NO_RWG,
>>                      void, ptr, ptr, ptr, ptr, ptr, i32)
>>   DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_s, TCG_CALL_NO_RWG,
>> diff --git a/target/arm/sve.decode b/target/arm/sve.decode
>> index 3121eabbf8..0edb72d4fb 100644
>> --- a/target/arm/sve.decode
>> +++ b/target/arm/sve.decode
>> @@ -146,6 +146,7 @@
>>                   &rprrr_esz rn=%reg_movprfx
>>   @rdn_pg_rm_ra   ........ esz:2 . ra:5  ... pg:3 rm:5 rd:5 \
>>                   &rprrr_esz rn=%reg_movprfx
>> +@rd_pg_rn_rm   ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5       &rprr_esz
>>     # One register operand, with governing predicate, vector element size
>>   @rd_pg_rn       ........ esz:2 ... ... ... pg:3 rn:5 rd:5       &rpr_esz
>> @@ -1336,6 +1337,11 @@ RSUBHNT         01000101 .. 1 ..... 011 111 ..... .....  @rd_rn_rm
>>   MATCH           01000101 .. 1 ..... 100 ... ..... 0 .... @pd_pg_rn_rm
>>   NMATCH          01000101 .. 1 ..... 100 ... ..... 1 .... @pd_pg_rn_rm
>>   +### SVE2 Histogram Computation
>> +
>> +HISTCNT         01000101 .. 1 ..... 110 ... ..... .....  @rd_pg_rn_rm
>> +HISTSEG         01000101 .. 1 ..... 101 000 ..... .....  @rd_rn_rm
>> +
>>   ## SVE2 floating-point pairwise operations
>>     FADDP           01100100 .. 010 00 0 100 ... ..... ..... @rdn_pg_rm
>> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
>> index 19fbf94189..fa4848bc5c 100644
>> --- a/target/arm/sve_helper.c
>> +++ b/target/arm/sve_helper.c
>> @@ -7095,3 +7095,127 @@ DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
>>   DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
>>     #undef DO_PPZZ_MATCH
>> +
>> +void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
>> +                            uint32_t desc)
>> +{
>> +    ARMVectorReg scratch;
>> +    intptr_t i, j;
>> +    intptr_t opr_sz = simd_oprsz(desc);
>> +    uint32_t *d = vd, *n = vn, *m = vm;
>> +    uint8_t *pg = vg;
>> +
>> +    if (d == n) {
>> +        n = memcpy(&scratch, n, opr_sz);
>> +        if (d == m) {
>> +            m = n;
>> +        }
>> +    } else if (d == m) {
>> +        m = memcpy(&scratch, m, opr_sz);
>> +    }
>> +
>> +    for (i = 0; i < opr_sz; i += 4) {
>> +        uint64_t count = 0;
>> +        uint8_t pred;
>> +
>> +        pred = pg[H1(i >> 3)] >> (i & 7);
>> +        if (pred & 1) {
>> +            uint32_t nn = n[H4(i >> 2)];
>> +
>> +            for (j = 0; j <= i; j += 4) {
>> +                pred = pg[H1(j >> 3)] >> (j & 7);
>> +                if ((pred & 1) && nn == m[H4(j >> 2)]) {
>> +                    ++count;
>> +                }
>> +            }
>> +        }
>> +        d[H4(i >> 2)] = count;
>> +    }
>> +}
>> +
>> +void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
>> +                            uint32_t desc)
>> +{
>> +    ARMVectorReg scratch;
>> +    intptr_t i, j;
>> +    intptr_t opr_sz = simd_oprsz(desc);
>> +    uint64_t *d = vd, *n = vn, *m = vm;
>> +    uint8_t *pg = vg;
>> +
>> +    if (d == n) {
>> +        n = memcpy(&scratch, n, opr_sz);
>> +        if (d == m) {
>> +            m = n;
>> +        }
>> +    } else if (d == m) {
>> +        m = memcpy(&scratch, m, opr_sz);
>> +    }
>> +
>> +    for (i = 0; i < opr_sz / 8; ++i) {
>> +        uint64_t count = 0;
>> +        if (pg[H1(i)] & 1) {
>> +            uint64_t nn = n[i];
>> +            for (j = 0; j <= i; ++j) {
>> +                if ((pg[H1(j)] & 1) && nn == m[j]) {
>> +                    ++count;
>> +                }
>> +            }
>> +        }
>> +        d[i] = count;
>> +    }
>> +}
>> +
>> +/*
>> + * Returns the number of bytes in m0 and m1 that match n.
>> + * See comment for do_match2().
>> + * */
>> +static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
>> +{
>> +    int esz = MO_8;
>> +    int bits = 8 << esz;
>> +    uint64_t ones = dup_const(esz, 1);
>> +    uint64_t signs = ones << (bits - 1);
>> +    uint64_t cmp0, cmp1;
>> +
>> +    cmp1 = dup_const(esz, n);
>> +    cmp0 = cmp1 ^ m0;
>> +    cmp1 = cmp1 ^ m1;
>> +    cmp0 = (cmp0 - ones) & ~cmp0 & signs;
>> +    cmp1 = (cmp1 - ones) & ~cmp1 & signs;
>> +
> Hi Richard,
> 
> Although we can detect a zero byte with this method, we can't use it to
> count the zero bytes.
> 
> For example,
> IF
>     cmp1 = 0x0100010001000100, ones = 0x0101010101010101, signs =
> 0x8080808080808080,
> THEN
>     cmp1 = (cmp1 - ones) & ~cmp1 & signs = 0x8080808080808080
> So
>     this method reports 8 zero bytes in cmp1 (the borrow out of each zero
> byte propagates into the 0x01 byte above it). In fact, cmp1 has only 4 zero
> bytes.
> 
> I haven't found a "bit twiddling" way; if you find one, please let me know.

Thanks for noticing the error.  We already have a bit twiddling example in qemu
for this in target/alpha:

uint64_t helper_cmpbe0(uint64_t a)
{
    uint64_t m = 0x7f7f7f7f7f7f7f7fULL;
    uint64_t c = ~(((a & m) + m) | a | m);
    ...

which produces the exact results that we need here.


r~



reply via email to

[Prev in Thread] Current Thread [Next in Thread]