[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#41535: [PATCH] performance optimization for aarch64
From: |
Li Qiang |
Subject: |
bug#41535: [PATCH] performance optimization for aarch64 |
Date: |
Thu, 20 Aug 2020 16:55:26 +0800 |
User-agent: |
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Thunderbird/68.8.1 |
在 2020/5/30 17:17, Li Qiang 写道:
>
>
> 在 2020/5/26 10:39, l00374334 写道:
>> From: liqiang <liqiang64@huawei.com>
>>
>> By analyzing the compression and decompression process of gzip, I found
>>
>> that CRC32 and the longest_match function are significant hot spots.
>>
>>
>>
>> On the aarch64 architecture, we can optimize the efficiency of crc32
>>
>> through the interface provided by the NEON instruction set (12x faster
>>
>> on aarch64), and optimize the performance of random access code through
>>
>> prefetch instructions (about 5%~8% improvement). In some compression
>>
>> scenarios, loop unrolling can also yield a certain performance improvement
>>
>> (about 10%).
>>
>>
>>
>> Modified by Li Qiang.
>>
>> ---
>> configure | 14 ++++++++++++++
>> deflate.c | 30 +++++++++++++++++++++++++++++-
>> util.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 88 insertions(+), 1 deletion(-)
>>
>> diff --git a/configure b/configure
>> index cab3daf..dc80cb6 100644
>> --- a/configure
>> +++ b/configure
>> @@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext
>> conftest.$ac_ext
>> ;;
>>
>> arm* | aarch64 )
>> + cat confdefs.h - <<_ACEOF >conftest.$ac_ext
>> +/* end confdefs.h. */
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>> + int ok;
>> + #else
>> + error fail
>> + #endif
>> +
>> +_ACEOF
>> +if ac_fn_c_try_compile "$LINENO"
>> +then :
>> + CFLAGS="$CFLAGS -march=armv8-a+crc"
>> +fi
>> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>> # Assume arm with EABI.
>> # On arm64 systems, the C compiler may be generating code in one
>> of
>> # these ABIs:
>> diff --git a/deflate.c b/deflate.c
>> index 9d379e9..ee77ffd 100644
>> --- a/deflate.c
>> +++ b/deflate.c
>> @@ -378,6 +378,9 @@ longest_match(IPos cur_match)
>> register int len; /* length of current match
>> */
>>
>> int best_len = prev_length; /* best match length so far
>> */
>>
>> IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST :
>> NIL;
>>
>> +#ifdef __aarch64__
>>
>> + IPos next_match;
>>
>> +#endif
>>
>> /* Stop when cur_match becomes <= limit. To simplify the code,
>>
>> * we prevent matches with the string of window index 0.
>>
>> */
>>
>> @@ -411,6 +414,10 @@ longest_match(IPos cur_match)
>> do {
>>
>> Assert(cur_match < strstart, "no future");
>>
>> match = window + cur_match;
>>
>> +#ifdef __aarch64__
>>
>> + next_match = prev[cur_match & WMASK];
>>
>> + __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));
>>
>> +#endif
>>
>>
>>
>> /* Skip to next match if the match length cannot increase
>>
>> * or if the match length is less than 2:
>>
>> @@ -488,8 +495,14 @@ longest_match(IPos cur_match)
>> scan_end = scan[best_len];
>>
>> #endif
>>
>> }
>>
>> - } while ((cur_match = prev[cur_match & WMASK]) > limit
>>
>> + }
>>
>> +#ifdef __aarch64__
>>
>> + while ((cur_match = next_match) > limit
>>
>> + && --chain_length != 0);
>>
>> +#else
>>
>> + while ((cur_match = prev[cur_match & WMASK]) > limit
>>
>> && --chain_length != 0);
>>
>> +#endif
>>
>>
>>
>> return best_len;
>>
>> }
>>
>> @@ -777,7 +790,22 @@ deflate (int pack_level)
>> lookahead -= prev_length-1;
>>
>> prev_length -= 2;
>>
>> RSYNC_ROLL(strstart, prev_length+1);
>>
>> + while (prev_length >= 4) {
>>
>> + /* After actual verification, expanding this loop
>>
>> + * can improve its performance in certain scenarios.
>>
>> + */
>>
>> + prev_length -= 4;
>>
>> + strstart++;
>>
>> + INSERT_STRING(strstart, hash_head);
>>
>> + strstart++;
>>
>> + INSERT_STRING(strstart, hash_head);
>>
>> + strstart++;
>>
>> + INSERT_STRING(strstart, hash_head);
>>
>> + strstart++;
>>
>> + INSERT_STRING(strstart, hash_head);
>>
>> + }
>>
>> do {
>>
>> + if (prev_length == 0) break;
>>
>> strstart++;
>>
>> INSERT_STRING(strstart, hash_head);
>>
>> /* strstart never exceeds WSIZE-MAX_MATCH, so there are
>>
>> diff --git a/util.c b/util.c
>> index 0a0fc21..c9f0e52 100644
>> --- a/util.c
>> +++ b/util.c
>> @@ -38,6 +38,12 @@
>>
>>
>> static int write_buffer (int, voidp, unsigned int);
>>
>>
>>
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>>
>> +#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c],
>> %x[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c],
>> %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c],
>> %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c],
>> %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#else
>>
>> /* ========================================================================
>>
>> * Table of CRC-32's of all single-byte values (made by makecrc.c)
>>
>> */
>>
>> @@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = {
>> 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
>>
>> 0x2d02ef8dL
>>
>> };
>>
>> +#endif
>>
>>
>>
>> /* Shift register contents. */
>>
>> static ulg crc = 0xffffffffL;
>>
>> @@ -132,6 +139,43 @@ ulg updcrc(s, n)
>> const uch *s; /* pointer to bytes to pump through */
>>
>> unsigned n; /* number of bytes in s[] */
>>
>> {
>>
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>>
>> + register ulg c;
>>
>> + static ulg crc = (ulg)0xffffffffL;
>>
>> + register const uint8_t *buf1;
>>
>> + register const uint16_t *buf2;
>>
>> + register const uint32_t *buf4;
>>
>> + register const uint64_t *buf8;
>>
>> + int64_t length = (int64_t)n;
>>
>> + buf8 = (const uint64_t *)(const void *)s;
>>
>> +
>>
>> + if (s == NULL) {
>>
>> + c = 0xffffffffL;
>>
>> + } else {
>>
>> + c = crc;
>>
>> + while(length >= sizeof(uint64_t)) {
>>
>> + CRC32D(c, *buf8++);
>>
>> + length -= sizeof(uint64_t);
>>
>> + }
>>
>> + buf4 = (const uint32_t *)(const void *)buf8;
>>
>> + if (length >= sizeof(uint32_t)) {
>>
>> + CRC32W(c, *buf4++);
>>
>> + length -= sizeof(uint32_t);
>>
>> + }
>>
>> + buf2 = (const uint16_t *)(const void *)buf4;
>>
>> + if(length >= sizeof(uint16_t)) {
>>
>> + CRC32H(c, *buf2++);
>>
>> + length -= sizeof(uint16_t);
>>
>> + }
>>
>> + buf1 = (const uint8_t *)(const void *)buf2;
>>
>> + if (length >= sizeof(uint8_t)) {
>>
>> + CRC32B(c, *buf1);
>>
>> + length -= sizeof(uint8_t);
>>
>> + }
>>
>> + }
>>
>> + crc = c;
>>
>> + return (c ^ 0xffffffffL);
>>
>> +#else
>>
>> register ulg c; /* temporary variable */
>>
>>
>>
>> if (s == NULL) {
>>
>> @@ -144,6 +188,7 @@ ulg updcrc(s, n)
>> }
>>
>> crc = c;
>>
>> return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */
>>
>> +#endif
>>
>> }
>>
>>
>>
>> /* Return a current CRC value. */
>>
>
> Please allow me to show a set of actual test data for this patch.
>
> First, I made an original version of the program "gzip-1.10" based
> on the gzip-1.10 source code, and then made an optimized version of
> the program "gzip-optimized" after applying my optimization patch.
>
> Next I use gzip-1.10 version to test the compression and decompression
> time on some **xml** files:
> [XML]# time ./gzip-1.10 *.xml
>
> real 0m5.099s
> user 0m4.384s
> sys 0m0.176s
> [XML]# time ./gzip-1.10 -d *.gz
>
> real 0m2.173s
> user 0m1.821s
> sys 0m0.348s
>
> Then use the optimized version to compare:
> [XML]# time ./gzip-optimized *.xml
>
> real 0m2.785s
> user 0m2.576s
> sys 0m0.204s
> [XML]# time ./gzip-optimized -d *.gz
>
> real 0m0.497s
> user 0m0.176s
> sys 0m0.320s
>
>
> The next test object is a large **log** file:
> [LOG]# time ./gzip-1.10 *.log
>
> real 0m8.883s
> user 0m8.652s
> sys 0m0.217s
> [LOG]# time ./gzip-1.10 -d *.gz
>
> real 0m3.049s
> user 0m2.604s
> sys 0m0.439s
>
> Also use the optimized version to compare:
> [LOG]# time ./gzip-optimized *.log
>
> real 0m6.882s
> user 0m6.607s
> sys 0m0.264s
> [LOG]# time ./gzip-optimized -d *.gz
>
> real 0m1.054s
> user 0m0.622s
> sys 0m0.431s
>
> The above experimental data are from the aarch64 platform.
>
Gentle ping.
: )
--
Best regards,
Li Qiang
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- bug#41535: [PATCH] performance optimization for aarch64,
Li Qiang <=