[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 1/2] target/mips: Improve performance for MSA binary operations
From: |
Aleksandar Markovic |
Subject: |
Re: [Qemu-devel] [PATCH 1/2] target/mips: Improve performance for MSA binary operations |
Date: |
Mon, 3 Jun 2019 13:10:20 +0000 |
> From: Alex Bennée <address@hidden>
> Sent: Sunday, June 2, 2019 3:22 PM
> To: address@hidden
> Cc: Aleksandar Rikalo; Aleksandar Markovic; address@hidden
> Subject: Re: [Qemu-devel] [PATCH 1/2] target/mips: Improve performance for
> MSA binary operations
> Mateja Marjanovic <address@hidden> writes:
> > From: Mateja Marjanovic <address@hidden>
> >
> > Eliminate loops for better performance.
> Have you done any measurements of the below loop unrolling? Because
> this is something that maybe we can achieve and let the compiler make
> the choice.
I know that Mateja did extensive performance measurements, and I am
asking him to give us some samples.
As for code generation, here are the disassemblies of the function
helper_msa_add_a_df() before and after this patch
(they show that the compiler did not unroll the loops by itself):
BEFORE:
Dump of assembler code for function helper_msa_add_a_df:
0x00000000001500b0 <+0>: cmp $0x1,%esi
0x00000000001500b3 <+3>: je 0x150258 <helper_msa_add_a_df+424>
0x00000000001500b9 <+9>: jb 0x1501e8 <helper_msa_add_a_df+312>
0x00000000001500bf <+15>: cmp $0x2,%esi
0x00000000001500c2 <+18>: je 0x150180 <helper_msa_add_a_df+208>
0x00000000001500c8 <+24>: cmp $0x3,%esi
0x00000000001500cb <+27>: jne 0x1502c2 <helper_msa_add_a_df+530>
0x00000000001500d1 <+33>: mov %ecx,%ecx
0x00000000001500d3 <+35>: mov %edx,%edx
0x00000000001500d5 <+37>: lea 0x22(%rcx),%rax
0x00000000001500d9 <+41>: lea 0x22(%rdx),%r10
0x00000000001500dd <+45>: shl $0x4,%rcx
0x00000000001500e1 <+49>: add %rdi,%rcx
0x00000000001500e4 <+52>: shl $0x4,%rdx
0x00000000001500e8 <+56>: shl $0x4,%rax
0x00000000001500ec <+60>: shl $0x4,%r10
0x00000000001500f0 <+64>: add %rdi,%rax
0x00000000001500f3 <+67>: mov 0x8(%rax),%r9
0x00000000001500f7 <+71>: mov 0x8(%rax),%rsi
0x00000000001500fb <+75>: mov %r8d,%eax
0x00000000001500fe <+78>: sar $0x3f,%r9
0x0000000000150102 <+82>: xor %r9,%rsi
0x0000000000150105 <+85>: sub %r9,%rsi
0x0000000000150108 <+88>: mov %rsi,%r9
0x000000000015010b <+91>: lea 0x22(%rax),%rsi
0x000000000015010f <+95>: shl $0x4,%rax
0x0000000000150113 <+99>: lea (%rdi,%rax,1),%rax
0x0000000000150117 <+103>: shl $0x4,%rsi
0x000000000015011b <+107>: add %rdi,%rsi
0x000000000015011e <+110>: mov 0x8(%rsi),%r8
0x0000000000150122 <+114>: mov 0x8(%rsi),%r11
0x0000000000150126 <+118>: sar $0x3f,%r8
0x000000000015012a <+122>: xor %r8,%r11
0x000000000015012d <+125>: mov %r11,%rsi
0x0000000000150130 <+128>: sub %r8,%rsi
0x0000000000150133 <+131>: add %r9,%rsi
0x0000000000150136 <+134>: mov %rsi,0x8(%rdi,%r10,1)
0x000000000015013b <+139>: mov 0x230(%rcx),%rsi
0x0000000000150142 <+146>: mov 0x230(%rcx),%r8
0x0000000000150149 <+153>: sar $0x3f,%rsi
0x000000000015014d <+157>: xor %rsi,%r8
0x0000000000150150 <+160>: mov %r8,%rcx
0x0000000000150153 <+163>: mov 0x230(%rax),%r8
0x000000000015015a <+170>: sub %rsi,%rcx
0x000000000015015d <+173>: mov 0x230(%rax),%rsi
0x0000000000150164 <+180>: sar $0x3f,%rsi
0x0000000000150168 <+184>: xor %rsi,%r8
0x000000000015016b <+187>: mov %r8,%rax
0x000000000015016e <+190>: sub %rsi,%rax
0x0000000000150171 <+193>: add %rcx,%rax
0x0000000000150174 <+196>: mov %rax,0x230(%rdi,%rdx,1)
0x000000000015017c <+204>: retq
0x000000000015017d <+205>: nopl (%rax)
0x0000000000150180 <+208>: mov %r8d,%r8d
0x0000000000150183 <+211>: mov %ecx,%ecx
0x0000000000150185 <+213>: mov %edx,%edx
0x0000000000150187 <+215>: mov %r8,%rax
0x000000000015018a <+218>: neg %r8
0x000000000015018d <+221>: shl $0x4,%rcx
0x0000000000150191 <+225>: shl $0x4,%rax
0x0000000000150195 <+229>: shl $0x4,%r8
0x0000000000150199 <+233>: shl $0x4,%rdx
0x000000000015019d <+237>: lea 0x228(%rdi,%rax,1),%r9
0x00000000001501a5 <+245>: lea 0x238(%rdi,%rax,1),%rdi
0x00000000001501ad <+253>: lea (%r9,%r8,1),%r10
0x00000000001501b1 <+257>: add $0x4,%r9
0x00000000001501b5 <+261>: movslq (%r10,%rcx,1),%rax
0x00000000001501b9 <+265>: mov %rax,%rsi
0x00000000001501bc <+268>: sar $0x3f,%rsi
0x00000000001501c0 <+272>: xor %rsi,%rax
0x00000000001501c3 <+275>: sub %rsi,%rax
0x00000000001501c6 <+278>: movslq -0x4(%r9),%rsi
0x00000000001501ca <+282>: mov %rsi,%r11
0x00000000001501cd <+285>: sar $0x3f,%r11
0x00000000001501d1 <+289>: xor %r11,%rsi
0x00000000001501d4 <+292>: sub %r11,%rsi
0x00000000001501d7 <+295>: add %rsi,%rax
0x00000000001501da <+298>: cmp %rdi,%r9
0x00000000001501dd <+301>: mov %eax,(%r10,%rdx,1)
0x00000000001501e1 <+305>: jne 0x1501ad <helper_msa_add_a_df+253>
0x00000000001501e3 <+307>: repz retq
0x00000000001501e5 <+309>: nopl (%rax)
0x00000000001501e8 <+312>: mov %r8d,%r8d
0x00000000001501eb <+315>: mov %ecx,%ecx
0x00000000001501ed <+317>: mov %edx,%edx
0x00000000001501ef <+319>: mov %r8,%rax
0x00000000001501f2 <+322>: neg %r8
0x00000000001501f5 <+325>: shl $0x4,%rcx
0x00000000001501f9 <+329>: shl $0x4,%rax
0x00000000001501fd <+333>: shl $0x4,%r8
0x0000000000150201 <+337>: shl $0x4,%rdx
0x0000000000150205 <+341>: lea 0x228(%rdi,%rax,1),%r9
0x000000000015020d <+349>: lea 0x238(%rdi,%rax,1),%r11
0x0000000000150215 <+357>: nopl (%rax)
0x0000000000150218 <+360>: lea (%r8,%r9,1),%rdi
0x000000000015021c <+364>: add $0x1,%r9
0x0000000000150220 <+368>: movsbq (%rdi,%rcx,1),%rax
0x0000000000150225 <+373>: mov %rax,%rsi
0x0000000000150228 <+376>: sar $0x3f,%rsi
0x000000000015022c <+380>: xor %rsi,%rax
0x000000000015022f <+383>: sub %rsi,%rax
0x0000000000150232 <+386>: movsbq -0x1(%r9),%rsi
0x0000000000150237 <+391>: mov %rsi,%r10
0x000000000015023a <+394>: sar $0x3f,%r10
0x000000000015023e <+398>: xor %r10,%rsi
0x0000000000150241 <+401>: sub %r10,%rsi
0x0000000000150244 <+404>: add %rsi,%rax
0x0000000000150247 <+407>: cmp %r9,%r11
0x000000000015024a <+410>: mov %al,(%rdi,%rdx,1)
0x000000000015024d <+413>: jne 0x150218 <helper_msa_add_a_df+360>
0x000000000015024f <+415>: repz retq
0x0000000000150251 <+417>: nopl 0x0(%rax)
0x0000000000150258 <+424>: mov %r8d,%r8d
0x000000000015025b <+427>: mov %ecx,%ecx
0x000000000015025d <+429>: mov %edx,%edx
0x000000000015025f <+431>: mov %r8,%rax
0x0000000000150262 <+434>: neg %r8
0x0000000000150265 <+437>: shl $0x4,%rcx
0x0000000000150269 <+441>: shl $0x4,%rax
0x000000000015026d <+445>: shl $0x4,%r8
0x0000000000150271 <+449>: shl $0x4,%rdx
0x0000000000150275 <+453>: lea 0x228(%rdi,%rax,1),%r9
0x000000000015027d <+461>: lea 0x238(%rdi,%rax,1),%r10
0x0000000000150285 <+469>: nopl (%rax)
0x0000000000150288 <+472>: lea (%r8,%r9,1),%rdi
0x000000000015028c <+476>: add $0x2,%r9
0x0000000000150290 <+480>: movswq (%rdi,%rcx,1),%rax
0x0000000000150295 <+485>: mov %rax,%rsi
0x0000000000150298 <+488>: sar $0x3f,%rsi
0x000000000015029c <+492>: xor %rsi,%rax
0x000000000015029f <+495>: sub %rsi,%rax
0x00000000001502a2 <+498>: movswq -0x2(%r9),%rsi
0x00000000001502a7 <+503>: mov %rsi,%r11
0x00000000001502aa <+506>: sar $0x3f,%r11
0x00000000001502ae <+510>: xor %r11,%rsi
0x00000000001502b1 <+513>: sub %r11,%rsi
0x00000000001502b4 <+516>: add %rsi,%rax
0x00000000001502b7 <+519>: cmp %r10,%r9
0x00000000001502ba <+522>: mov %ax,(%rdi,%rdx,1)
0x00000000001502be <+526>: jne 0x150288 <helper_msa_add_a_df+472>
0x00000000001502c0 <+528>: repz retq
0x00000000001502c2 <+530>: lea 0x13c3b7(%rip),%rcx # 0x28c680 <__PRETTY_FUNCTION__.26062>
0x00000000001502c9 <+537>: lea 0x13b830(%rip),%rsi # 0x28bb00
0x00000000001502d0 <+544>: lea 0x1c7204(%rip),%rdi # 0x3174db
0x00000000001502d7 <+551>: sub $0x8,%rsp
0x00000000001502db <+555>: mov $0x357,%edx
0x00000000001502e0 <+560>: callq 0x8eeb8
End of assembler dump.
AFTER:
0x00000000001548d0 <+0>: cmp $0x1,%esi
0x00000000001548d3 <+3>: je 0x154e00 <helper_msa_add_a_df+1328>
0x00000000001548d9 <+9>: jb 0x154a98 <helper_msa_add_a_df+456>
0x00000000001548df <+15>: cmp $0x2,%esi
0x00000000001548e2 <+18>: je 0x1549a0 <helper_msa_add_a_df+208>
0x00000000001548e8 <+24>: cmp $0x3,%esi
0x00000000001548eb <+27>: jne 0x154fd1 <helper_msa_add_a_df+1793>
0x00000000001548f1 <+33>: mov %ecx,%eax
0x00000000001548f3 <+35>: mov %r8d,%r8d
0x00000000001548f6 <+38>: mov %edx,%edx
0x00000000001548f8 <+40>: lea 0x22(%rax),%rcx
0x00000000001548fc <+44>: lea 0x22(%rdx),%r9
0x0000000000154900 <+48>: shl $0x4,%rax
0x0000000000154904 <+52>: add %rdi,%rax
0x0000000000154907 <+55>: shl $0x4,%rdx
0x000000000015490b <+59>: shl $0x4,%rcx
0x000000000015490f <+63>: shl $0x4,%r9
0x0000000000154913 <+67>: add %rdi,%rcx
0x0000000000154916 <+70>: mov 0x8(%rcx),%rsi
0x000000000015491a <+74>: mov 0x8(%rcx),%r11
0x000000000015491e <+78>: sar $0x3f,%rsi
0x0000000000154922 <+82>: xor %rsi,%r11
0x0000000000154925 <+85>: mov %r11,%rcx
0x0000000000154928 <+88>: sub %rsi,%rcx
0x000000000015492b <+91>: mov %rcx,%rsi
0x000000000015492e <+94>: lea 0x22(%r8),%rcx
0x0000000000154932 <+98>: shl $0x4,%r8
0x0000000000154936 <+102>: add %rdi,%r8
0x0000000000154939 <+105>: shl $0x4,%rcx
0x000000000015493d <+109>: add %rdi,%rcx
0x0000000000154940 <+112>: mov 0x8(%rcx),%r10
0x0000000000154944 <+116>: mov 0x8(%rcx),%r11
0x0000000000154948 <+120>: sar $0x3f,%r10
0x000000000015494c <+124>: xor %r10,%r11
0x000000000015494f <+127>: mov %r11,%rcx
0x0000000000154952 <+130>: sub %r10,%rcx
0x0000000000154955 <+133>: add %rsi,%rcx
0x0000000000154958 <+136>: mov %rcx,0x8(%rdi,%r9,1)
0x000000000015495d <+141>: mov 0x230(%rax),%rcx
0x0000000000154964 <+148>: mov 0x230(%rax),%rsi
0x000000000015496b <+155>: sar $0x3f,%rcx
0x000000000015496f <+159>: xor %rcx,%rsi
0x0000000000154972 <+162>: mov %rsi,%rax
0x0000000000154975 <+165>: mov 0x230(%r8),%rsi
0x000000000015497c <+172>: sub %rcx,%rax
0x000000000015497f <+175>: mov %rax,%rcx
0x0000000000154982 <+178>: mov 0x230(%r8),%rax
0x0000000000154989 <+185>: sar $0x3f,%rsi
0x000000000015498d <+189>: xor %rsi,%rax
0x0000000000154990 <+192>: sub %rsi,%rax
0x0000000000154993 <+195>: add %rcx,%rax
0x0000000000154996 <+198>: mov %rax,0x230(%rdi,%rdx,1)
0x000000000015499e <+206>: retq
0x000000000015499f <+207>: nop
0x00000000001549a0 <+208>: mov %ecx,%ecx
0x00000000001549a2 <+210>: mov %r8d,%r8d
0x00000000001549a5 <+213>: mov %edx,%edx
0x00000000001549a7 <+215>: lea 0x22(%rcx),%rax
0x00000000001549ab <+219>: lea 0x22(%rdx),%r9
0x00000000001549af <+223>: shl $0x4,%rcx
0x00000000001549b3 <+227>: add %rdi,%rcx
0x00000000001549b6 <+230>: shl $0x4,%rdx
0x00000000001549ba <+234>: shl $0x4,%rax
0x00000000001549be <+238>: shl $0x4,%r9
0x00000000001549c2 <+242>: add %rdi,%rdx
0x00000000001549c5 <+245>: movslq 0x8(%rdi,%rax,1),%rax
0x00000000001549ca <+250>: mov %rax,%rsi
0x00000000001549cd <+253>: sar $0x3f,%rsi
0x00000000001549d1 <+257>: xor %rsi,%rax
0x00000000001549d4 <+260>: sub %rsi,%rax
0x00000000001549d7 <+263>: lea 0x22(%r8),%rsi
0x00000000001549db <+267>: shl $0x4,%r8
0x00000000001549df <+271>: shl $0x4,%rsi
0x00000000001549e3 <+275>: movslq 0x8(%rdi,%rsi,1),%rsi
0x00000000001549e8 <+280>: mov %rsi,%r10
0x00000000001549eb <+283>: sar $0x3f,%r10
0x00000000001549ef <+287>: xor %r10,%rsi
0x00000000001549f2 <+290>: sub %r10,%rsi
0x00000000001549f5 <+293>: add %rsi,%rax
0x00000000001549f8 <+296>: mov %eax,0x8(%rdi,%r9,1)
0x00000000001549fd <+301>: movslq 0x22c(%rcx),%rax
0x0000000000154a04 <+308>: add %r8,%rdi
0x0000000000154a07 <+311>: mov %rax,%rsi
0x0000000000154a0a <+314>: sar $0x3f,%rsi
0x0000000000154a0e <+318>: xor %rsi,%rax
0x0000000000154a11 <+321>: sub %rsi,%rax
0x0000000000154a14 <+324>: movslq 0x22c(%rdi),%rsi
0x0000000000154a1b <+331>: mov %rsi,%r8
0x0000000000154a1e <+334>: sar $0x3f,%r8
0x0000000000154a22 <+338>: xor %r8,%rsi
0x0000000000154a25 <+341>: sub %r8,%rsi
0x0000000000154a28 <+344>: add %rsi,%rax
0x0000000000154a2b <+347>: mov %eax,0x22c(%rdx)
0x0000000000154a31 <+353>: movslq 0x230(%rcx),%rax
0x0000000000154a38 <+360>: mov %rax,%rsi
0x0000000000154a3b <+363>: sar $0x3f,%rsi
0x0000000000154a3f <+367>: xor %rsi,%rax
0x0000000000154a42 <+370>: sub %rsi,%rax
0x0000000000154a45 <+373>: movslq 0x230(%rdi),%rsi
0x0000000000154a4c <+380>: mov %rsi,%r8
0x0000000000154a4f <+383>: sar $0x3f,%r8
0x0000000000154a53 <+387>: xor %r8,%rsi
0x0000000000154a56 <+390>: sub %r8,%rsi
0x0000000000154a59 <+393>: add %rsi,%rax
0x0000000000154a5c <+396>: mov %eax,0x230(%rdx)
0x0000000000154a62 <+402>: movslq 0x234(%rcx),%rax
0x0000000000154a69 <+409>: mov %rax,%rcx
0x0000000000154a6c <+412>: sar $0x3f,%rcx
0x0000000000154a70 <+416>: xor %rcx,%rax
0x0000000000154a73 <+419>: sub %rcx,%rax
0x0000000000154a76 <+422>: movslq 0x234(%rdi),%rcx
0x0000000000154a7d <+429>: mov %rcx,%rsi
0x0000000000154a80 <+432>: sar $0x3f,%rsi
0x0000000000154a84 <+436>: xor %rsi,%rcx
0x0000000000154a87 <+439>: sub %rsi,%rcx
0x0000000000154a8a <+442>: add %rcx,%rax
0x0000000000154a8d <+445>: mov %eax,0x234(%rdx)
0x0000000000154a93 <+451>: retq
0x0000000000154a94 <+452>: nopl 0x0(%rax)
0x0000000000154a98 <+456>: mov %ecx,%eax
0x0000000000154a9a <+458>: mov %r8d,%r8d
0x0000000000154a9d <+461>: mov %edx,%edx
0x0000000000154a9f <+463>: lea 0x22(%rax),%rcx
0x0000000000154aa3 <+467>: lea 0x22(%rdx),%r9
0x0000000000154aa7 <+471>: shl $0x4,%rax
0x0000000000154aab <+475>: lea (%rdi,%rax,1),%rax
0x0000000000154aaf <+479>: shl $0x4,%rdx
0x0000000000154ab3 <+483>: shl $0x4,%rcx
0x0000000000154ab7 <+487>: shl $0x4,%r9
0x0000000000154abb <+491>: add %rdi,%rdx
0x0000000000154abe <+494>: movsbq 0x8(%rdi,%rcx,1),%rsi
0x0000000000154ac4 <+500>: mov %rsi,%rcx
0x0000000000154ac7 <+503>: sar $0x3f,%rcx
0x0000000000154acb <+507>: xor %rcx,%rsi
0x0000000000154ace <+510>: sub %rcx,%rsi
0x0000000000154ad1 <+513>: lea 0x22(%r8),%rcx
0x0000000000154ad5 <+517>: shl $0x4,%r8
0x0000000000154ad9 <+521>: shl $0x4,%rcx
0x0000000000154add <+525>: movsbq 0x8(%rdi,%rcx,1),%rcx
0x0000000000154ae3 <+531>: mov %rcx,%r10
0x0000000000154ae6 <+534>: sar $0x3f,%r10
0x0000000000154aea <+538>: xor %r10,%rcx
0x0000000000154aed <+541>: sub %r10,%rcx
0x0000000000154af0 <+544>: add %rcx,%rsi
0x0000000000154af3 <+547>: mov %sil,0x8(%rdi,%r9,1)
0x0000000000154af8 <+552>: movsbq 0x229(%rax),%rcx
0x0000000000154b00 <+560>: add %r8,%rdi
0x0000000000154b03 <+563>: mov %rcx,%rsi
0x0000000000154b06 <+566>: sar $0x3f,%rsi
0x0000000000154b0a <+570>: xor %rsi,%rcx
0x0000000000154b0d <+573>: sub %rsi,%rcx
0x0000000000154b10 <+576>: movsbq 0x229(%rdi),%rsi
0x0000000000154b18 <+584>: mov %rsi,%r8
0x0000000000154b1b <+587>: sar $0x3f,%r8
0x0000000000154b1f <+591>: xor %r8,%rsi
0x0000000000154b22 <+594>: sub %r8,%rsi
0x0000000000154b25 <+597>: add %rsi,%rcx
0x0000000000154b28 <+600>: mov %cl,0x229(%rdx)
0x0000000000154b2e <+606>: movsbq 0x22a(%rax),%rcx
0x0000000000154b36 <+614>: mov %rcx,%rsi
0x0000000000154b39 <+617>: sar $0x3f,%rsi
0x0000000000154b3d <+621>: xor %rsi,%rcx
0x0000000000154b40 <+624>: sub %rsi,%rcx
0x0000000000154b43 <+627>: movsbq 0x22a(%rdi),%rsi
0x0000000000154b4b <+635>: mov %rsi,%r8
0x0000000000154b4e <+638>: sar $0x3f,%r8
0x0000000000154b52 <+642>: xor %r8,%rsi
0x0000000000154b55 <+645>: sub %r8,%rsi
0x0000000000154b58 <+648>: add %rsi,%rcx
0x0000000000154b5b <+651>: mov %cl,0x22a(%rdx)
0x0000000000154b61 <+657>: movsbq 0x22b(%rax),%rcx
0x0000000000154b69 <+665>: mov %rcx,%rsi
0x0000000000154b6c <+668>: sar $0x3f,%rsi
0x0000000000154b70 <+672>: xor %rsi,%rcx
0x0000000000154b73 <+675>: sub %rsi,%rcx
0x0000000000154b76 <+678>: movsbq 0x22b(%rdi),%rsi
0x0000000000154b7e <+686>: mov %rsi,%r8
0x0000000000154b81 <+689>: sar $0x3f,%r8
0x0000000000154b85 <+693>: xor %r8,%rsi
0x0000000000154b88 <+696>: sub %r8,%rsi
0x0000000000154b8b <+699>: add %rsi,%rcx
0x0000000000154b8e <+702>: mov %cl,0x22b(%rdx)
0x0000000000154b94 <+708>: movsbq 0x22c(%rax),%rcx
0x0000000000154b9c <+716>: mov %rcx,%rsi
0x0000000000154b9f <+719>: sar $0x3f,%rsi
0x0000000000154ba3 <+723>: xor %rsi,%rcx
0x0000000000154ba6 <+726>: sub %rsi,%rcx
0x0000000000154ba9 <+729>: movsbq 0x22c(%rdi),%rsi
0x0000000000154bb1 <+737>: mov %rsi,%r8
0x0000000000154bb4 <+740>: sar $0x3f,%r8
0x0000000000154bb8 <+744>: xor %r8,%rsi
0x0000000000154bbb <+747>: sub %r8,%rsi
0x0000000000154bbe <+750>: add %rsi,%rcx
0x0000000000154bc1 <+753>: mov %cl,0x22c(%rdx)
0x0000000000154bc7 <+759>: movsbq 0x22d(%rax),%rcx
0x0000000000154bcf <+767>: mov %rcx,%rsi
0x0000000000154bd2 <+770>: sar $0x3f,%rsi
0x0000000000154bd6 <+774>: xor %rsi,%rcx
0x0000000000154bd9 <+777>: sub %rsi,%rcx
0x0000000000154bdc <+780>: movsbq 0x22d(%rdi),%rsi
0x0000000000154be4 <+788>: mov %rsi,%r8
0x0000000000154be7 <+791>: sar $0x3f,%r8
0x0000000000154beb <+795>: xor %r8,%rsi
0x0000000000154bee <+798>: sub %r8,%rsi
0x0000000000154bf1 <+801>: add %rsi,%rcx
0x0000000000154bf4 <+804>: mov %cl,0x22d(%rdx)
0x0000000000154bfa <+810>: movsbq 0x22e(%rax),%rcx
0x0000000000154c02 <+818>: mov %rcx,%rsi
0x0000000000154c05 <+821>: sar $0x3f,%rsi
0x0000000000154c09 <+825>: xor %rsi,%rcx
0x0000000000154c0c <+828>: sub %rsi,%rcx
0x0000000000154c0f <+831>: movsbq 0x22e(%rdi),%rsi
0x0000000000154c17 <+839>: mov %rsi,%r8
0x0000000000154c1a <+842>: sar $0x3f,%r8
0x0000000000154c1e <+846>: xor %r8,%rsi
0x0000000000154c21 <+849>: sub %r8,%rsi
0x0000000000154c24 <+852>: add %rsi,%rcx
0x0000000000154c27 <+855>: mov %cl,0x22e(%rdx)
0x0000000000154c2d <+861>: movsbq 0x22f(%rax),%rcx
0x0000000000154c35 <+869>: mov %rcx,%rsi
0x0000000000154c38 <+872>: sar $0x3f,%rsi
0x0000000000154c3c <+876>: xor %rsi,%rcx
0x0000000000154c3f <+879>: sub %rsi,%rcx
0x0000000000154c42 <+882>: movsbq 0x22f(%rdi),%rsi
0x0000000000154c4a <+890>: mov %rsi,%r8
0x0000000000154c4d <+893>: sar $0x3f,%r8
0x0000000000154c51 <+897>: xor %r8,%rsi
0x0000000000154c54 <+900>: sub %r8,%rsi
0x0000000000154c57 <+903>: add %rsi,%rcx
0x0000000000154c5a <+906>: mov %cl,0x22f(%rdx)
0x0000000000154c60 <+912>: movsbq 0x230(%rax),%rcx
0x0000000000154c68 <+920>: mov %rcx,%rsi
0x0000000000154c6b <+923>: sar $0x3f,%rsi
0x0000000000154c6f <+927>: xor %rsi,%rcx
0x0000000000154c72 <+930>: sub %rsi,%rcx
0x0000000000154c75 <+933>: movsbq 0x230(%rdi),%rsi
0x0000000000154c7d <+941>: mov %rsi,%r8
0x0000000000154c80 <+944>: sar $0x3f,%r8
0x0000000000154c84 <+948>: xor %r8,%rsi
0x0000000000154c87 <+951>: sub %r8,%rsi
0x0000000000154c8a <+954>: add %rsi,%rcx
0x0000000000154c8d <+957>: mov %cl,0x230(%rdx)
0x0000000000154c93 <+963>: movsbq 0x231(%rax),%rcx
0x0000000000154c9b <+971>: mov %rcx,%rsi
0x0000000000154c9e <+974>: sar $0x3f,%rsi
0x0000000000154ca2 <+978>: xor %rsi,%rcx
0x0000000000154ca5 <+981>: sub %rsi,%rcx
0x0000000000154ca8 <+984>: movsbq 0x231(%rdi),%rsi
0x0000000000154cb0 <+992>: mov %rsi,%r8
0x0000000000154cb3 <+995>: sar $0x3f,%r8
0x0000000000154cb7 <+999>: xor %r8,%rsi
0x0000000000154cba <+1002>: sub %r8,%rsi
0x0000000000154cbd <+1005>: add %rsi,%rcx
0x0000000000154cc0 <+1008>: mov %cl,0x231(%rdx)
0x0000000000154cc6 <+1014>: movsbq 0x232(%rax),%rcx
0x0000000000154cce <+1022>: mov %rcx,%rsi
0x0000000000154cd1 <+1025>: sar $0x3f,%rsi
0x0000000000154cd5 <+1029>: xor %rsi,%rcx
0x0000000000154cd8 <+1032>: sub %rsi,%rcx
0x0000000000154cdb <+1035>: movsbq 0x232(%rdi),%rsi
0x0000000000154ce3 <+1043>: mov %rsi,%r8
0x0000000000154ce6 <+1046>: sar $0x3f,%r8
0x0000000000154cea <+1050>: xor %r8,%rsi
0x0000000000154ced <+1053>: sub %r8,%rsi
0x0000000000154cf0 <+1056>: add %rsi,%rcx
0x0000000000154cf3 <+1059>: mov %cl,0x232(%rdx)
0x0000000000154cf9 <+1065>: movsbq 0x233(%rdi),%rcx
0x0000000000154d01 <+1073>: mov %rcx,%rsi
0x0000000000154d04 <+1076>: sar $0x3f,%rsi
0x0000000000154d08 <+1080>: xor %rsi,%rcx
0x0000000000154d0b <+1083>: sub %rsi,%rcx
0x0000000000154d0e <+1086>: movsbq 0x233(%rax),%rsi
0x0000000000154d16 <+1094>: mov %rsi,%r8
0x0000000000154d19 <+1097>: sar $0x3f,%r8
0x0000000000154d1d <+1101>: xor %r8,%rsi
0x0000000000154d20 <+1104>: sub %r8,%rsi
0x0000000000154d23 <+1107>: add %rsi,%rcx
0x0000000000154d26 <+1110>: mov %cl,0x233(%rdx)
0x0000000000154d2c <+1116>: movsbq 0x234(%rdi),%rcx
0x0000000000154d34 <+1124>: mov %rcx,%rsi
0x0000000000154d37 <+1127>: sar $0x3f,%rsi
0x0000000000154d3b <+1131>: xor %rsi,%rcx
0x0000000000154d3e <+1134>: sub %rsi,%rcx
0x0000000000154d41 <+1137>: movsbq 0x234(%rax),%rsi
0x0000000000154d49 <+1145>: mov %rsi,%r8
0x0000000000154d4c <+1148>: sar $0x3f,%r8
0x0000000000154d50 <+1152>: xor %r8,%rsi
0x0000000000154d53 <+1155>: sub %r8,%rsi
0x0000000000154d56 <+1158>: add %rsi,%rcx
0x0000000000154d59 <+1161>: mov %cl,0x234(%rdx)
0x0000000000154d5f <+1167>: movsbq 0x235(%rax),%rcx
0x0000000000154d67 <+1175>: mov %rcx,%rsi
0x0000000000154d6a <+1178>: sar $0x3f,%rsi
0x0000000000154d6e <+1182>: xor %rsi,%rcx
0x0000000000154d71 <+1185>: sub %rsi,%rcx
0x0000000000154d74 <+1188>: movsbq 0x235(%rdi),%rsi
0x0000000000154d7c <+1196>: mov %rsi,%r8
0x0000000000154d7f <+1199>: sar $0x3f,%r8
0x0000000000154d83 <+1203>: xor %r8,%rsi
0x0000000000154d86 <+1206>: sub %r8,%rsi
0x0000000000154d89 <+1209>: add %rsi,%rcx
0x0000000000154d8c <+1212>: mov %cl,0x235(%rdx)
0x0000000000154d92 <+1218>: movsbq 0x236(%rdi),%rcx
0x0000000000154d9a <+1226>: mov %rcx,%rsi
0x0000000000154d9d <+1229>: sar $0x3f,%rsi
0x0000000000154da1 <+1233>: xor %rsi,%rcx
0x0000000000154da4 <+1236>: sub %rsi,%rcx
0x0000000000154da7 <+1239>: movsbq 0x236(%rax),%rsi
0x0000000000154daf <+1247>: mov %rsi,%r8
0x0000000000154db2 <+1250>: sar $0x3f,%r8
0x0000000000154db6 <+1254>: xor %r8,%rsi
0x0000000000154db9 <+1257>: sub %r8,%rsi
0x0000000000154dbc <+1260>: add %rsi,%rcx
0x0000000000154dbf <+1263>: mov %cl,0x236(%rdx)
0x0000000000154dc5 <+1269>: movsbq 0x237(%rax),%rax
0x0000000000154dcd <+1277>: mov %rax,%rcx
0x0000000000154dd0 <+1280>: sar $0x3f,%rcx
0x0000000000154dd4 <+1284>: xor %rcx,%rax
0x0000000000154dd7 <+1287>: sub %rcx,%rax
0x0000000000154dda <+1290>: movsbq 0x237(%rdi),%rcx
0x0000000000154de2 <+1298>: mov %rcx,%rsi
0x0000000000154de5 <+1301>: sar $0x3f,%rsi
0x0000000000154de9 <+1305>: xor %rsi,%rcx
0x0000000000154dec <+1308>: sub %rsi,%rcx
0x0000000000154def <+1311>: add %rcx,%rax
0x0000000000154df2 <+1314>: mov %al,0x237(%rdx)
0x0000000000154df8 <+1320>: retq
0x0000000000154df9 <+1321>: nopl 0x0(%rax)
0x0000000000154e00 <+1328>: mov %ecx,%eax
0x0000000000154e02 <+1330>: mov %r8d,%r8d
0x0000000000154e05 <+1333>: mov %edx,%edx
0x0000000000154e07 <+1335>: lea 0x22(%rax),%rcx
0x0000000000154e0b <+1339>: lea 0x22(%rdx),%r9
0x0000000000154e0f <+1343>: shl $0x4,%rax
0x0000000000154e13 <+1347>: lea (%rdi,%rax,1),%rax
0x0000000000154e17 <+1351>: shl $0x4,%rdx
0x0000000000154e1b <+1355>: shl $0x4,%rcx
0x0000000000154e1f <+1359>: shl $0x4,%r9
0x0000000000154e23 <+1363>: add %rdi,%rdx
0x0000000000154e26 <+1366>: movswq 0x8(%rdi,%rcx,1),%rsi
0x0000000000154e2c <+1372>: mov %rsi,%rcx
0x0000000000154e2f <+1375>: sar $0x3f,%rcx
0x0000000000154e33 <+1379>: xor %rcx,%rsi
0x0000000000154e36 <+1382>: sub %rcx,%rsi
0x0000000000154e39 <+1385>: lea 0x22(%r8),%rcx
0x0000000000154e3d <+1389>: shl $0x4,%r8
0x0000000000154e41 <+1393>: shl $0x4,%rcx
0x0000000000154e45 <+1397>: movswq 0x8(%rdi,%rcx,1),%rcx
0x0000000000154e4b <+1403>: mov %rcx,%r10
0x0000000000154e4e <+1406>: sar $0x3f,%r10
0x0000000000154e52 <+1410>: xor %r10,%rcx
0x0000000000154e55 <+1413>: sub %r10,%rcx
0x0000000000154e58 <+1416>: add %rcx,%rsi
0x0000000000154e5b <+1419>: mov %si,0x8(%rdi,%r9,1)
0x0000000000154e61 <+1425>: movswq 0x22a(%rax),%rcx
0x0000000000154e69 <+1433>: add %r8,%rdi
0x0000000000154e6c <+1436>: mov %rcx,%rsi
0x0000000000154e6f <+1439>: sar $0x3f,%rsi
0x0000000000154e73 <+1443>: xor %rsi,%rcx
0x0000000000154e76 <+1446>: sub %rsi,%rcx
0x0000000000154e79 <+1449>: movswq 0x22a(%rdi),%rsi
0x0000000000154e81 <+1457>: mov %rsi,%r8
0x0000000000154e84 <+1460>: sar $0x3f,%r8
0x0000000000154e88 <+1464>: xor %r8,%rsi
0x0000000000154e8b <+1467>: sub %r8,%rsi
0x0000000000154e8e <+1470>: add %rsi,%rcx
0x0000000000154e91 <+1473>: mov %cx,0x22a(%rdx)
0x0000000000154e98 <+1480>: movswq 0x22c(%rax),%rcx
0x0000000000154ea0 <+1488>: mov %rcx,%rsi
0x0000000000154ea3 <+1491>: sar $0x3f,%rsi
0x0000000000154ea7 <+1495>: xor %rsi,%rcx
0x0000000000154eaa <+1498>: sub %rsi,%rcx
0x0000000000154ead <+1501>: movswq 0x22c(%rdi),%rsi
0x0000000000154eb5 <+1509>: mov %rsi,%r8
0x0000000000154eb8 <+1512>: sar $0x3f,%r8
0x0000000000154ebc <+1516>: xor %r8,%rsi
0x0000000000154ebf <+1519>: sub %r8,%rsi
0x0000000000154ec2 <+1522>: add %rsi,%rcx
0x0000000000154ec5 <+1525>: mov %cx,0x22c(%rdx)
0x0000000000154ecc <+1532>: movswq 0x22e(%rax),%rcx
0x0000000000154ed4 <+1540>: mov %rcx,%rsi
0x0000000000154ed7 <+1543>: sar $0x3f,%rsi
0x0000000000154edb <+1547>: xor %rsi,%rcx
0x0000000000154ede <+1550>: sub %rsi,%rcx
0x0000000000154ee1 <+1553>: movswq 0x22e(%rdi),%rsi
0x0000000000154ee9 <+1561>: mov %rsi,%r8
0x0000000000154eec <+1564>: sar $0x3f,%r8
0x0000000000154ef0 <+1568>: xor %r8,%rsi
0x0000000000154ef3 <+1571>: sub %r8,%rsi
0x0000000000154ef6 <+1574>: add %rsi,%rcx
0x0000000000154ef9 <+1577>: mov %cx,0x22e(%rdx)
0x0000000000154f00 <+1584>: movswq 0x230(%rax),%rcx
0x0000000000154f08 <+1592>: mov %rcx,%rsi
0x0000000000154f0b <+1595>: sar $0x3f,%rsi
0x0000000000154f0f <+1599>: xor %rsi,%rcx
0x0000000000154f12 <+1602>: sub %rsi,%rcx
0x0000000000154f15 <+1605>: movswq 0x230(%rdi),%rsi
0x0000000000154f1d <+1613>: mov %rsi,%r8
0x0000000000154f20 <+1616>: sar $0x3f,%r8
0x0000000000154f24 <+1620>: xor %r8,%rsi
0x0000000000154f27 <+1623>: sub %r8,%rsi
0x0000000000154f2a <+1626>: add %rsi,%rcx
0x0000000000154f2d <+1629>: mov %cx,0x230(%rdx)
0x0000000000154f34 <+1636>: movswq 0x232(%rax),%rcx
0x0000000000154f3c <+1644>: mov %rcx,%rsi
0x0000000000154f3f <+1647>: sar $0x3f,%rsi
0x0000000000154f43 <+1651>: xor %rsi,%rcx
0x0000000000154f46 <+1654>: sub %rsi,%rcx
0x0000000000154f49 <+1657>: movswq 0x232(%rdi),%rsi
0x0000000000154f51 <+1665>: mov %rsi,%r8
0x0000000000154f54 <+1668>: sar $0x3f,%r8
0x0000000000154f58 <+1672>: xor %r8,%rsi
0x0000000000154f5b <+1675>: sub %r8,%rsi
0x0000000000154f5e <+1678>: add %rsi,%rcx
0x0000000000154f61 <+1681>: mov %cx,0x232(%rdx)
0x0000000000154f68 <+1688>: movswq 0x234(%rax),%rcx
0x0000000000154f70 <+1696>: mov %rcx,%rsi
0x0000000000154f73 <+1699>: sar $0x3f,%rsi
0x0000000000154f77 <+1703>: xor %rsi,%rcx
0x0000000000154f7a <+1706>: sub %rsi,%rcx
0x0000000000154f7d <+1709>: movswq 0x234(%rdi),%rsi
0x0000000000154f85 <+1717>: mov %rsi,%r8
0x0000000000154f88 <+1720>: sar $0x3f,%r8
0x0000000000154f8c <+1724>: xor %r8,%rsi
0x0000000000154f8f <+1727>: sub %r8,%rsi
0x0000000000154f92 <+1730>: add %rsi,%rcx
0x0000000000154f95 <+1733>: mov %cx,0x234(%rdx)
0x0000000000154f9c <+1740>: movswq 0x236(%rax),%rax
0x0000000000154fa4 <+1748>: mov %rax,%rcx
0x0000000000154fa7 <+1751>: sar $0x3f,%rcx
0x0000000000154fab <+1755>: xor %rcx,%rax
0x0000000000154fae <+1758>: sub %rcx,%rax
0x0000000000154fb1 <+1761>: movswq 0x236(%rdi),%rcx
0x0000000000154fb9 <+1769>: mov %rcx,%rsi
0x0000000000154fbc <+1772>: sar $0x3f,%rsi
0x0000000000154fc0 <+1776>: xor %rsi,%rcx
0x0000000000154fc3 <+1779>: sub %rsi,%rcx
0x0000000000154fc6 <+1782>: add %rcx,%rax
0x0000000000154fc9 <+1785>: mov %ax,0x236(%rdx)
0x0000000000154fd0 <+1792>: retq
0x0000000000154fd1 <+1793>: lea 0x14faa8(%rip),%rcx # 0x2a4a80 <__PRETTY_FUNCTION__.25843>
0x0000000000154fd8 <+1800>: lea 0x14ef81(%rip),%rsi # 0x2a3f60
0x0000000000154fdf <+1807>: lea 0x1da975(%rip),%rdi # 0x32f95b
0x0000000000154fe6 <+1814>: sub $0x8,%rsp
0x0000000000154fea <+1818>: mov $0x368,%edx
0x0000000000154fef <+1823>: callq 0x8f170
> >
> > Signed-off-by: Mateja Marjanovic <address@hidden>
> > ---
> > target/mips/msa_helper.c | 43 ++++++++++++++++++++++++++++++-------------
> > 1 file changed, 30 insertions(+), 13 deletions(-)
> >
> > diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> > index 4c7ec05..1152fda 100644
> > --- a/target/mips/msa_helper.c
> > +++ b/target/mips/msa_helper.c
> > @@ -804,28 +804,45 @@ void helper_msa_ ## func ## _df(CPUMIPSState *env, uint32_t df, \
> > wr_t *pwd = &(env->active_fpu.fpr[wd].wr); \
> > wr_t *pws = &(env->active_fpu.fpr[ws].wr); \
> > wr_t *pwt = &(env->active_fpu.fpr[wt].wr); \
> If we can ensure alignment for the various vector registers then the
> compiler always has the option of using host vectors (certainly for int
> and logic operations).
> > - uint32_t i; \
> > \
> > switch (df) { \
> > case DF_BYTE: \
> > - for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) { \
> > - pwd->b[i] = msa_ ## func ## _df(df, pws->b[i], pwt->b[i]); \
> > - } \
> > + pwd->b[0] = msa_ ## func ## _df(df, pws->b[0], pwt->b[0]); \
> > + pwd->b[1] = msa_ ## func ## _df(df, pws->b[1], pwt->b[1]); \
> > + pwd->b[2] = msa_ ## func ## _df(df, pws->b[2], pwt->b[2]); \
> > + pwd->b[3] = msa_ ## func ## _df(df, pws->b[3], pwt->b[3]); \
> > + pwd->b[4] = msa_ ## func ## _df(df, pws->b[4], pwt->b[4]); \
> > + pwd->b[5] = msa_ ## func ## _df(df, pws->b[5], pwt->b[5]); \
> > + pwd->b[6] = msa_ ## func ## _df(df, pws->b[6], pwt->b[6]); \
> > + pwd->b[7] = msa_ ## func ## _df(df, pws->b[7], pwt->b[7]); \
> > + pwd->b[8] = msa_ ## func ## _df(df, pws->b[8], pwt->b[8]); \
> > + pwd->b[9] = msa_ ## func ## _df(df, pws->b[9], pwt->b[9]); \
> > + pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], pwt->b[10]); \
> > + pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], pwt->b[11]); \
> > + pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], pwt->b[12]); \
> > + pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], pwt->b[13]); \
> > + pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], pwt->b[14]); \
> > + pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], pwt->b[15]); \
> > break; \
> > case DF_HALF: \
> > - for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) { \
> > - pwd->h[i] = msa_ ## func ## _df(df, pws->h[i], pwt->h[i]); \
> > - } \
> > + pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], pwt->h[0]); \
> > + pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], pwt->h[1]); \
> > + pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], pwt->h[2]); \
> > + pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], pwt->h[3]); \
> > + pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], pwt->h[4]); \
> > + pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], pwt->h[5]); \
> > + pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], pwt->h[6]); \
> > + pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], pwt->h[7]); \
> > break; \
> > case DF_WORD: \
> > - for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) { \
> > - pwd->w[i] = msa_ ## func ## _df(df, pws->w[i], pwt->w[i]); \
> > - } \
> > + pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], pwt->w[0]); \
> > + pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], pwt->w[1]); \
> > + pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], pwt->w[2]); \
> > + pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], pwt->w[3]); \
> > break; \
> > case DF_DOUBLE: \
> > - for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) { \
> > - pwd->d[i] = msa_ ## func ## _df(df, pws->d[i], pwt->d[i]); \
> > - } \
> > + pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], pwt->d[0]); \
> > + pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], pwt->d[1]); \
> > break; \
> > default: \
> > assert(0); \
--
Alex Bennée