bug#41535: [PATCH] performance optimization for aarch64
From: Li Qiang
Subject: bug#41535: [PATCH] performance optimization for aarch64
Date: Sat, 30 May 2020 17:17:49 +0800
User-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Thunderbird/68.8.1
On 2020/5/26 10:39, l00374334 wrote:
> From: liqiang <liqiang64@huawei.com>
>
> By profiling gzip's compression and decompression, I found that the
> CRC32 computation and the longest_match function are the hottest
> spots.
>
> On the aarch64 architecture, we can speed up crc32 with the CRC
> instructions made available alongside the NEON instruction set
> (about 12x faster on aarch64), and improve the randomly accessed
> hash-chain code with prefetch instructions (about a 5%~8%
> improvement). In some compression scenarios, loop unrolling also
> brings a measurable gain (about 10%).
>
> Modified by Li Qiang.
> ---
>  configure | 14 ++++++++++++++
>  deflate.c | 30 +++++++++++++++++++++++++++++-
>  util.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 88 insertions(+), 1 deletion(-)
>
> diff --git a/configure b/configure
> index cab3daf..dc80cb6 100644
> --- a/configure
> +++ b/configure
> @@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>        ;;
>
>      arm* | aarch64 )
> +      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
> +/* end confdefs.h.  */
> +#if defined __ARM_NEON__ || defined __ARM_NEON
> +      int ok;
> +      #else
> +      error fail
> +      #endif
> +
> +_ACEOF
> +if ac_fn_c_try_compile "$LINENO"
> +then :
> +  CFLAGS="$CFLAGS -march=armv8-a+crc"
> +fi
> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>        # Assume arm with EABI.
>        # On arm64 systems, the C compiler may be generating code in one of
>        # these ABIs:
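
The configure fragment boils down to compiling the following probe; if
it compiles, the compiler advertises NEON and configure appends
-march=armv8-a+crc to CFLAGS (the file name probe.c is illustrative
only):

    /* probe.c -- compiles only when NEON is advertised.  The
     * "error fail" line is a deliberate syntax error, so on targets
     * without NEON the probe fails and CFLAGS is left unchanged. */
    #if defined __ARM_NEON__ || defined __ARM_NEON
    int ok;
    #else
    error fail
    #endif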
> diff --git a/deflate.c b/deflate.c
> index 9d379e9..ee77ffd 100644
> --- a/deflate.c
> +++ b/deflate.c
> @@ -378,6 +378,9 @@ longest_match(IPos cur_match)
>      register int len;                           /* length of current match */
>      int best_len = prev_length;                 /* best match length so far */
>      IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL;
> +#ifdef __aarch64__
> +    IPos next_match;
> +#endif
>      /* Stop when cur_match becomes <= limit. To simplify the code,
>       * we prevent matches with the string of window index 0.
>       */
> @@ -411,6 +414,10 @@ longest_match(IPos cur_match)
>      do {
>          Assert(cur_match < strstart, "no future");
>          match = window + cur_match;
> +#ifdef __aarch64__
> +        next_match = prev[cur_match & WMASK];
> +        __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));
> +#endif
>
>          /* Skip to next match if the match length cannot increase
>           * or if the match length is less than 2:
> @@ -488,8 +495,14 @@ longest_match(IPos cur_match)
>              scan_end = scan[best_len];
> #endif
>          }
> -    } while ((cur_match = prev[cur_match & WMASK]) > limit
> +    }
> +#ifdef __aarch64__
> +    while ((cur_match = next_match) > limit
> +           && --chain_length != 0);
> +#else
> +    while ((cur_match = prev[cur_match & WMASK]) > limit
>             && --chain_length != 0);
> +#endif
>
>      return best_len;
>  }
>
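
For readers without the A64 ISA reference at hand: PRFM PLDL1STRM is a
prefetch-for-load hint into L1 with streaming (low temporal locality)
semantics.  A rough portable sketch of the same one-step-ahead chain
prefetch, using GCC/Clang's __builtin_prefetch instead of inline
assembly (loop body reduced to a skeleton; prev, WMASK, limit and
chain_length as in deflate.c):

    do {
        /* Load the next chain link early ... */
        IPos next_match = prev[cur_match & WMASK];
        /* ... and hint the CPU to fetch the link after it.
         * Arguments (0, 0) mean read access with minimal temporal
         * locality, roughly what PRFM PLDL1STRM expresses. */
        __builtin_prefetch (&prev[next_match & WMASK], 0, 0);

        /* ... compare window + cur_match against scan and update
         * best_len, as in the original loop body ... */

        cur_match = next_match;   /* reuse the early load */
    } while (cur_match > limit && --chain_length != 0);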
> @@ -777,7 +790,22 @@ deflate (int pack_level)
>              lookahead -= prev_length-1;
>              prev_length -= 2;
>              RSYNC_ROLL(strstart, prev_length+1);
> +            while (prev_length >= 4) {
> +                /* After actual verification, expanding this loop
> +                 * can improve its performance in certain scenarios.
> +                 */
> +                prev_length -= 4;
> +                strstart++;
> +                INSERT_STRING(strstart, hash_head);
> +                strstart++;
> +                INSERT_STRING(strstart, hash_head);
> +                strstart++;
> +                INSERT_STRING(strstart, hash_head);
> +                strstart++;
> +                INSERT_STRING(strstart, hash_head);
> +            }
>              do {
> +                if (prev_length == 0) break;
>                  strstart++;
>                  INSERT_STRING(strstart, hash_head);
>                  /* strstart never exceeds WSIZE-MAX_MATCH, so there are
>
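
The guard "if (prev_length == 0) break;" is what keeps the remainder
loop correct: the original do/while always ran at least once
(prev_length >= 1 there), but once the unrolled loop has consumed
groups of four, zero iterations may be left.  A self-contained sketch
of the same shape (insert_string is a hypothetical stand-in for
gzip's INSERT_STRING macro):

    #include <stddef.h>

    extern void insert_string (size_t pos);   /* hypothetical stand-in */

    void
    insert_run (size_t strstart, size_t prev_length)
    {
      while (prev_length >= 4)    /* unrolled body: 4 insertions per pass */
        {
          prev_length -= 4;
          insert_string (++strstart);
          insert_string (++strstart);
          insert_string (++strstart);
          insert_string (++strstart);
        }
      do {                        /* remainder: 0..3 insertions */
        if (prev_length == 0)     /* the remainder may now be empty */
          break;
        insert_string (++strstart);
      } while (--prev_length != 0);
    }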
> diff --git a/util.c b/util.c
> index 0a0fc21..c9f0e52 100644
> --- a/util.c
> +++ b/util.c
> @@ -38,6 +38,12 @@
>
>  static int write_buffer (int, voidp, unsigned int);
>
> +#if defined __ARM_NEON__ || defined __ARM_NEON
> +#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(val))
> +#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
> +#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
> +#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
> +#else
>  /* ========================================================================
>   * Table of CRC-32's of all single-byte values (made by makecrc.c)
>   */
> @@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = {
>    0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
>    0x2d02ef8dL
>  };
> +#endif
>
>  /* Shift register contents. */
>  static ulg crc = 0xffffffffL;
>
> @@ -132,6 +139,43 @@ ulg updcrc(s, n)
>      const uch *s;                 /* pointer to bytes to pump through */
>      unsigned n;                   /* number of bytes in s[] */
>  {
> +#if defined __ARM_NEON__ || defined __ARM_NEON
> +    register ulg c;
> +    static ulg crc = (ulg)0xffffffffL;
> +    register const uint8_t *buf1;
> +    register const uint16_t *buf2;
> +    register const uint32_t *buf4;
> +    register const uint64_t *buf8;
> +    int64_t length = (int64_t)n;
> +    buf8 = (const uint64_t *)(const void *)s;
> +
> +    if (s == NULL) {
> +        c = 0xffffffffL;
> +    } else {
> +        c = crc;
> +        while (length >= sizeof(uint64_t)) {
> +            CRC32D(c, *buf8++);
> +            length -= sizeof(uint64_t);
> +        }
> +        buf4 = (const uint32_t *)(const void *)buf8;
> +        if (length >= sizeof(uint32_t)) {
> +            CRC32W(c, *buf4++);
> +            length -= sizeof(uint32_t);
> +        }
> +        buf2 = (const uint16_t *)(const void *)buf4;
> +        if (length >= sizeof(uint16_t)) {
> +            CRC32H(c, *buf2++);
> +            length -= sizeof(uint16_t);
> +        }
> +        buf1 = (const uint8_t *)(const void *)buf2;
> +        if (length >= sizeof(uint8_t)) {
> +            CRC32B(c, *buf1);
> +            length -= sizeof(uint8_t);
> +        }
> +    }
> +    crc = c;
> +    return (c ^ 0xffffffffL);
> +#else
>      register ulg c;               /* temporary variable */
>
>      if (s == NULL) {
> @@ -144,6 +188,7 @@ ulg updcrc(s, n)
>      }
>      crc = c;
>      return c ^ 0xffffffffL;       /* (instead of ~c for 64-bit machines) */
> +#endif
>  }
>
>  /* Return a current CRC value. */
>
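
As a side note, the same instruction sequence can be expressed without
inline assembly through the ACLE intrinsics in <arm_acle.h>.  Below is
a minimal sketch of the patch's 8/4/2/1-byte tail folding in that
style, assuming a compiler invoked with -march=armv8-a+crc; the
function crc32_update and its memcpy-based loads are illustrative,
not part of the patch:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>
    #include <arm_acle.h>   /* __crc32b/__crc32h/__crc32w/__crc32d */

    /* Fold 8 bytes at a time, then at most one 4-, 2- and 1-byte step.
     * 'c' is the running, pre-inverted CRC, as in gzip's updcrc. */
    static uint32_t
    crc32_update (uint32_t c, const unsigned char *s, size_t n)
    {
      while (n >= 8)
        {
          uint64_t v;
          memcpy (&v, s, 8);   /* avoids unaligned-access pitfalls */
          c = __crc32d (c, v);
          s += 8; n -= 8;
        }
      if (n >= 4)
        {
          uint32_t v;
          memcpy (&v, s, 4);
          c = __crc32w (c, v);
          s += 4; n -= 4;
        }
      if (n >= 2)
        {
          uint16_t v;
          memcpy (&v, s, 2);
          c = __crc32h (c, v);
          s += 2; n -= 2;
        }
      if (n >= 1)
        c = __crc32b (c, *s);
      return c;
    }

The memcpy-based loads sidestep the alignment question that the raw
pointer casts in the patch raise; compilers typically lower such
fixed-size memcpy calls to single loads.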
Please allow me to show a set of actual test data for this patch.
First, I built an unmodified binary, "gzip-1.10", from the gzip-1.10
source code, and then built an optimized binary, "gzip-optimized",
with my optimization patch applied. I then used gzip-1.10 to measure
the compression and decompression time of some **xml** files:
[XML]# time ./gzip-1.10 *.xml
real 0m5.099s
user 0m4.384s
sys 0m0.176s
[XML]# time ./gzip-1.10 -d *.gz
real 0m2.173s
user 0m1.821s
sys 0m0.348s
Then the same files with the optimized version, for comparison:
[XML]# time ./gzip-optimized *.xml
real 0m2.785s
user 0m2.576s
sys 0m0.204s
[XML]# time ./gzip-optimized -d *.gz
real 0m0.497s
user 0m0.176s
sys 0m0.320s
The next test object is a large **log** file:
[LOG]# time ./gzip-1.10 *.log
real 0m8.883s
user 0m8.652s
sys 0m0.217s
[LOG]# time ./gzip-1.10 -d *.gz
real 0m3.049s
user 0m2.604s
sys 0m0.439s
And again with the optimized version, for comparison:
[LOG]# time ./gzip-optimized *.log
real 0m6.882s
user 0m6.607s
sys 0m0.264s
[LOG]# time ./gzip-optimized -d *.gz
real 0m1.054s
user 0m0.622s
sys 0m0.431s
The above experimental data are from the aarch64 platform.
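That works out to roughly a 1.8x speedup for compressing and about
4.4x for decompressing the xml files, and roughly 1.3x and 2.9x
respectively for the log file.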
--
Best regards,
Li Qiang