qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 1/2] target/mips: Improve performance for MSA bi


From: Mateja Marjanovic
Subject: Re: [Qemu-devel] [PATCH 1/2] target/mips: Improve performance for MSA binary operations
Date: Mon, 3 Jun 2019 15:29:11 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.7.0


On 3.6.19. 15:10, Aleksandar Markovic wrote:
From: Alex Bennée <address@hidden>
Sent: Sunday, June 2, 2019 3:22 PM
To: address@hidden
Cc: Aleksandar Rikalo; Aleksandar Markovic; address@hidden
Subject: Re: [Qemu-devel] [PATCH 1/2] target/mips: Improve performance for MSA 
binary operations

Mateja Marjanovic <address@hidden> writes:
From: Mateja Marjanovic <address@hidden>

Eliminate loops for better performance.
Have you done any measurements of the bellow loop unrolling? Because
this is something that maybe we can achieve and let the compiler make
the choice.
I know that Mateja did extensive performance measurements, and I am
asking him to give us some samples.
Yes, here are some of the binop instructions with a loop and without
it:

|| intruction   ||       with-loop     ||    without-loop   ||
===============================
||  addv.b      ||    147.356 ms     ||    84.249 ms     ||
||  addv.h      ||     83.612 ms      ||    44.808 ms     ||
||  addv.w      ||     49.952 ms     ||    34.128 ms     ||
||  addv.d      ||     29.834 ms      ||    34.103 ms     ||
||  asub_s.b   ||    187.939 ms     ||    143.658 ms   ||
||  asub_s.h   ||    105.266 ms     ||     82.424 ms    ||
||  asub_s.w  ||     68.941 ms      ||     57.923 ms    ||
||  asub_s.d   ||     41.536 ms      ||     42.092 ms    ||
||  max_s.b    ||    156.036 ms    ||    100.628 ms    ||
||  max_s.h    ||     87.100 ms     ||     64.905 ms     ||
||  max_s.w   ||     52.339 ms     ||     40.632 ms     ||
||  max_s.d    ||     34.873 ms     ||     35.562 ms     ||
===============================


As for code generation, here are disassemblies of function
helper_msa_add_a_df() before and after this patch:

(it is visible the compiler did not perform unrolling loops by itself)

BEFORE:

Dump of assembler code for function helper_msa_add_a_df:
    0x00000000001500b0 <+0>:      cmp    $0x1,%esi
    0x00000000001500b3 <+3>:      je     0x150258 <helper_msa_add_a_df+424>
    0x00000000001500b9 <+9>:      jb     0x1501e8 <helper_msa_add_a_df+312>
    0x00000000001500bf <+15>:     cmp    $0x2,%esi
    0x00000000001500c2 <+18>:     je     0x150180 <helper_msa_add_a_df+208>
    0x00000000001500c8 <+24>:     cmp    $0x3,%esi
    0x00000000001500cb <+27>:     jne    0x1502c2 <helper_msa_add_a_df+530>
    0x00000000001500d1 <+33>:     mov    %ecx,%ecx
    0x00000000001500d3 <+35>:     mov    %edx,%edx
    0x00000000001500d5 <+37>:     lea    0x22(%rcx),%rax
    0x00000000001500d9 <+41>:     lea    0x22(%rdx),%r10
    0x00000000001500dd <+45>:     shl    $0x4,%rcx
    0x00000000001500e1 <+49>:     add    %rdi,%rcx
    0x00000000001500e4 <+52>:     shl    $0x4,%rdx
    0x00000000001500e8 <+56>:     shl    $0x4,%rax
    0x00000000001500ec <+60>:     shl    $0x4,%r10
    0x00000000001500f0 <+64>:     add    %rdi,%rax
    0x00000000001500f3 <+67>:     mov    0x8(%rax),%r9
    0x00000000001500f7 <+71>:     mov    0x8(%rax),%rsi
    0x00000000001500fb <+75>:     mov    %r8d,%eax
    0x00000000001500fe <+78>:     sar    $0x3f,%r9
    0x0000000000150102 <+82>:     xor    %r9,%rsi
    0x0000000000150105 <+85>:     sub    %r9,%rsi
    0x0000000000150108 <+88>:     mov    %rsi,%r9
    0x000000000015010b <+91>:     lea    0x22(%rax),%rsi
    0x000000000015010f <+95>:     shl    $0x4,%rax
    0x0000000000150113 <+99>:     lea    (%rdi,%rax,1),%rax
    0x0000000000150117 <+103>:    shl    $0x4,%rsi
    0x000000000015011b <+107>:    add    %rdi,%rsi
    0x000000000015011e <+110>:    mov    0x8(%rsi),%r8
    0x0000000000150122 <+114>:    mov    0x8(%rsi),%r11
    0x0000000000150126 <+118>:    sar    $0x3f,%r8
    0x000000000015012a <+122>:    xor    %r8,%r11
    0x000000000015012d <+125>:    mov    %r11,%rsi
    0x0000000000150130 <+128>:    sub    %r8,%rsi
    0x0000000000150133 <+131>:    add    %r9,%rsi
    0x0000000000150136 <+134>:    mov    %rsi,0x8(%rdi,%r10,1)
    0x000000000015013b <+139>:    mov    0x230(%rcx),%rsi
    0x0000000000150142 <+146>:    mov    0x230(%rcx),%r8
    0x0000000000150149 <+153>:    sar    $0x3f,%rsi
    0x000000000015014d <+157>:    xor    %rsi,%r8
    0x0000000000150150 <+160>:    mov    %r8,%rcx
    0x0000000000150153 <+163>:    mov    0x230(%rax),%r8
    0x000000000015015a <+170>:    sub    %rsi,%rcx
    0x000000000015015d <+173>:    mov    0x230(%rax),%rsi
    0x0000000000150164 <+180>:    sar    $0x3f,%rsi
    0x0000000000150168 <+184>:    xor    %rsi,%r8
    0x000000000015016b <+187>:    mov    %r8,%rax
    0x000000000015016e <+190>:    sub    %rsi,%rax
    0x0000000000150171 <+193>:    add    %rcx,%rax
    0x0000000000150174 <+196>:    mov    %rax,0x230(%rdi,%rdx,1)
    0x000000000015017c <+204>:    retq
    0x000000000015017d <+205>:    nopl   (%rax)
    0x0000000000150180 <+208>:    mov    %r8d,%r8d
    0x0000000000150183 <+211>:    mov    %ecx,%ecx
    0x0000000000150185 <+213>:    mov    %edx,%edx
    0x0000000000150187 <+215>:    mov    %r8,%rax
    0x000000000015018a <+218>:    neg    %r8
    0x000000000015018d <+221>:    shl    $0x4,%rcx
    0x0000000000150191 <+225>:    shl    $0x4,%rax
    0x0000000000150195 <+229>:    shl    $0x4,%r8
    0x0000000000150199 <+233>:    shl    $0x4,%rdx
    0x000000000015019d <+237>:    lea    0x228(%rdi,%rax,1),%r9
    0x00000000001501a5 <+245>:    lea    0x238(%rdi,%rax,1),%rdi
    0x00000000001501ad <+253>:    lea    (%r9,%r8,1),%r10
    0x00000000001501b1 <+257>:    add    $0x4,%r9
    0x00000000001501b5 <+261>:    movslq (%r10,%rcx,1),%rax
    0x00000000001501b9 <+265>:    mov    %rax,%rsi
    0x00000000001501bc <+268>:    sar    $0x3f,%rsi
    0x00000000001501c0 <+272>:    xor    %rsi,%rax
    0x00000000001501c3 <+275>:    sub    %rsi,%rax
    0x00000000001501c6 <+278>:    movslq -0x4(%r9),%rsi
    0x00000000001501ca <+282>:    mov    %rsi,%r11
    0x00000000001501cd <+285>:    sar    $0x3f,%r11
    0x00000000001501d1 <+289>:    xor    %r11,%rsi
    0x00000000001501d4 <+292>:    sub    %r11,%rsi
    0x00000000001501d7 <+295>:    add    %rsi,%rax
    0x00000000001501da <+298>:    cmp    %rdi,%r9
    0x00000000001501dd <+301>:    mov    %eax,(%r10,%rdx,1)
    0x00000000001501e1 <+305>:    jne    0x1501ad <helper_msa_add_a_df+253>
    0x00000000001501e3 <+307>:    repz retq
    0x00000000001501e5 <+309>:    nopl   (%rax)
    0x00000000001501e8 <+312>:    mov    %r8d,%r8d
    0x00000000001501eb <+315>:    mov    %ecx,%ecx
    0x00000000001501ed <+317>:    mov    %edx,%edx
    0x00000000001501ef <+319>:    mov    %r8,%rax
    0x00000000001501f2 <+322>:    neg    %r8
    0x00000000001501f5 <+325>:    shl    $0x4,%rcx
    0x00000000001501f9 <+329>:    shl    $0x4,%rax
    0x00000000001501fd <+333>:    shl    $0x4,%r8
    0x0000000000150201 <+337>:    shl    $0x4,%rdx
    0x0000000000150205 <+341>:    lea    0x228(%rdi,%rax,1),%r9
    0x000000000015020d <+349>:    lea    0x238(%rdi,%rax,1),%r11
    0x0000000000150215 <+357>:    nopl   (%rax)
    0x0000000000150218 <+360>:    lea    (%r8,%r9,1),%rdi
    0x000000000015021c <+364>:    add    $0x1,%r9
    0x0000000000150220 <+368>:    movsbq (%rdi,%rcx,1),%rax
    0x0000000000150225 <+373>:    mov    %rax,%rsi
    0x0000000000150228 <+376>:    sar    $0x3f,%rsi
    0x000000000015022c <+380>:    xor    %rsi,%rax
    0x000000000015022f <+383>:    sub    %rsi,%rax
    0x0000000000150232 <+386>:    movsbq -0x1(%r9),%rsi
    0x0000000000150237 <+391>:    mov    %rsi,%r10
    0x000000000015023a <+394>:    sar    $0x3f,%r10
    0x000000000015023e <+398>:    xor    %r10,%rsi
    0x0000000000150241 <+401>:    sub    %r10,%rsi
    0x0000000000150244 <+404>:    add    %rsi,%rax
    0x0000000000150247 <+407>:    cmp    %r9,%r11
    0x000000000015024a <+410>:    mov    %al,(%rdi,%rdx,1)
    0x000000000015024d <+413>:    jne    0x150218 <helper_msa_add_a_df+360>
    0x000000000015024f <+415>:    repz retq
    0x0000000000150251 <+417>:    nopl   0x0(%rax)
    0x0000000000150258 <+424>:    mov    %r8d,%r8d
    0x000000000015025b <+427>:    mov    %ecx,%ecx
    0x000000000015025d <+429>:    mov    %edx,%edx
    0x000000000015025f <+431>:    mov    %r8,%rax
    0x0000000000150262 <+434>:    neg    %r8
    0x0000000000150265 <+437>:    shl    $0x4,%rcx
    0x0000000000150269 <+441>:    shl    $0x4,%rax
    0x000000000015026d <+445>:    shl    $0x4,%r8
    0x0000000000150271 <+449>:    shl    $0x4,%rdx
    0x0000000000150275 <+453>:    lea    0x228(%rdi,%rax,1),%r9
    0x000000000015027d <+461>:    lea    0x238(%rdi,%rax,1),%r10
    0x0000000000150285 <+469>:    nopl   (%rax)
    0x0000000000150288 <+472>:    lea    (%r8,%r9,1),%rdi
    0x000000000015028c <+476>:    add    $0x2,%r9
    0x0000000000150290 <+480>:    movswq (%rdi,%rcx,1),%rax
    0x0000000000150295 <+485>:    mov    %rax,%rsi
    0x0000000000150298 <+488>:    sar    $0x3f,%rsi
    0x000000000015029c <+492>:    xor    %rsi,%rax
    0x000000000015029f <+495>:    sub    %rsi,%rax
    0x00000000001502a2 <+498>:    movswq -0x2(%r9),%rsi
    0x00000000001502a7 <+503>:    mov    %rsi,%r11
    0x00000000001502aa <+506>:    sar    $0x3f,%r11
    0x00000000001502ae <+510>:    xor    %r11,%rsi
    0x00000000001502b1 <+513>:    sub    %r11,%rsi
    0x00000000001502b4 <+516>:    add    %rsi,%rax
    0x00000000001502b7 <+519>:    cmp    %r10,%r9
    0x00000000001502ba <+522>:    mov    %ax,(%rdi,%rdx,1)
    0x00000000001502be <+526>:    jne    0x150288 <helper_msa_add_a_df+472>
    0x00000000001502c0 <+528>:    repz retq
    0x00000000001502c2 <+530>:    lea    0x13c3b7(%rip),%rcx        # 0x28c680 
<__PRETTY_FUNCTION__.26062>
    0x00000000001502c9 <+537>:    lea    0x13b830(%rip),%rsi        # 0x28bb00
    0x00000000001502d0 <+544>:    lea    0x1c7204(%rip),%rdi        # 0x3174db
    0x00000000001502d7 <+551>:    sub    $0x8,%rsp
    0x00000000001502db <+555>:    mov    $0x357,%edx
    0x00000000001502e0 <+560>:    callq  0x8eeb8
End of assembler dump.


AFTER:

    0x00000000001548d0 <+0>:      cmp    $0x1,%esi
    0x00000000001548d3 <+3>:      je     0x154e00 <helper_msa_add_a_df+1328>
    0x00000000001548d9 <+9>:      jb     0x154a98 <helper_msa_add_a_df+456>
    0x00000000001548df <+15>:     cmp    $0x2,%esi
    0x00000000001548e2 <+18>:     je     0x1549a0 <helper_msa_add_a_df+208>
    0x00000000001548e8 <+24>:     cmp    $0x3,%esi
    0x00000000001548eb <+27>:     jne    0x154fd1 <helper_msa_add_a_df+1793>
    0x00000000001548f1 <+33>:     mov    %ecx,%eax
    0x00000000001548f3 <+35>:     mov    %r8d,%r8d
    0x00000000001548f6 <+38>:     mov    %edx,%edx
    0x00000000001548f8 <+40>:     lea    0x22(%rax),%rcx
    0x00000000001548fc <+44>:     lea    0x22(%rdx),%r9
    0x0000000000154900 <+48>:     shl    $0x4,%rax
    0x0000000000154904 <+52>:     add    %rdi,%rax
    0x0000000000154907 <+55>:     shl    $0x4,%rdx
    0x000000000015490b <+59>:     shl    $0x4,%rcx
    0x000000000015490f <+63>:     shl    $0x4,%r9
    0x0000000000154913 <+67>:     add    %rdi,%rcx
    0x0000000000154916 <+70>:     mov    0x8(%rcx),%rsi
    0x000000000015491a <+74>:     mov    0x8(%rcx),%r11
    0x000000000015491e <+78>:     sar    $0x3f,%rsi
    0x0000000000154922 <+82>:     xor    %rsi,%r11
    0x0000000000154925 <+85>:     mov    %r11,%rcx
    0x0000000000154928 <+88>:     sub    %rsi,%rcx
    0x000000000015492b <+91>:     mov    %rcx,%rsi
    0x000000000015492e <+94>:     lea    0x22(%r8),%rcx
    0x0000000000154932 <+98>:     shl    $0x4,%r8
    0x0000000000154936 <+102>:    add    %rdi,%r8
    0x0000000000154939 <+105>:    shl    $0x4,%rcx
    0x000000000015493d <+109>:    add    %rdi,%rcx
    0x0000000000154940 <+112>:    mov    0x8(%rcx),%r10
    0x0000000000154944 <+116>:    mov    0x8(%rcx),%r11
    0x0000000000154948 <+120>:    sar    $0x3f,%r10
    0x000000000015494c <+124>:    xor    %r10,%r11
    0x000000000015494f <+127>:    mov    %r11,%rcx
    0x0000000000154952 <+130>:    sub    %r10,%rcx
    0x0000000000154955 <+133>:    add    %rsi,%rcx
    0x0000000000154958 <+136>:    mov    %rcx,0x8(%rdi,%r9,1)
    0x000000000015495d <+141>:    mov    0x230(%rax),%rcx
    0x0000000000154964 <+148>:    mov    0x230(%rax),%rsi
    0x000000000015496b <+155>:    sar    $0x3f,%rcx
    0x000000000015496f <+159>:    xor    %rcx,%rsi
    0x0000000000154972 <+162>:    mov    %rsi,%rax
    0x0000000000154975 <+165>:    mov    0x230(%r8),%rsi
    0x000000000015497c <+172>:    sub    %rcx,%rax
    0x000000000015497f <+175>:    mov    %rax,%rcx
    0x0000000000154982 <+178>:    mov    0x230(%r8),%rax
    0x0000000000154989 <+185>:    sar    $0x3f,%rsi
    0x000000000015498d <+189>:    xor    %rsi,%rax
    0x0000000000154990 <+192>:    sub    %rsi,%rax
    0x0000000000154993 <+195>:    add    %rcx,%rax
    0x0000000000154996 <+198>:    mov    %rax,0x230(%rdi,%rdx,1)
    0x000000000015499e <+206>:    retq
    0x000000000015499f <+207>:    nop
    0x00000000001549a0 <+208>:    mov    %ecx,%ecx
    0x00000000001549a2 <+210>:    mov    %r8d,%r8d
    0x00000000001549a5 <+213>:    mov    %edx,%edx
    0x00000000001549a7 <+215>:    lea    0x22(%rcx),%rax
    0x00000000001549ab <+219>:    lea    0x22(%rdx),%r9
    0x00000000001549af <+223>:    shl    $0x4,%rcx
    0x00000000001549b3 <+227>:    add    %rdi,%rcx
    0x00000000001549b6 <+230>:    shl    $0x4,%rdx
    0x00000000001549ba <+234>:    shl    $0x4,%rax
    0x00000000001549be <+238>:    shl    $0x4,%r9
    0x00000000001549c2 <+242>:    add    %rdi,%rdx
    0x00000000001549c5 <+245>:    movslq 0x8(%rdi,%rax,1),%rax
    0x00000000001549ca <+250>:    mov    %rax,%rsi
    0x00000000001549cd <+253>:    sar    $0x3f,%rsi
    0x00000000001549d1 <+257>:    xor    %rsi,%rax
    0x00000000001549d4 <+260>:    sub    %rsi,%rax
    0x00000000001549d7 <+263>:    lea    0x22(%r8),%rsi
    0x00000000001549db <+267>:    shl    $0x4,%r8
    0x00000000001549df <+271>:    shl    $0x4,%rsi
    0x00000000001549e3 <+275>:    movslq 0x8(%rdi,%rsi,1),%rsi
    0x00000000001549e8 <+280>:    mov    %rsi,%r10
    0x00000000001549eb <+283>:    sar    $0x3f,%r10
    0x00000000001549ef <+287>:    xor    %r10,%rsi
    0x00000000001549f2 <+290>:    sub    %r10,%rsi
    0x00000000001549f5 <+293>:    add    %rsi,%rax
    0x00000000001549f8 <+296>:    mov    %eax,0x8(%rdi,%r9,1)
    0x00000000001549fd <+301>:    movslq 0x22c(%rcx),%rax
    0x0000000000154a04 <+308>:    add    %r8,%rdi
    0x0000000000154a07 <+311>:    mov    %rax,%rsi
    0x0000000000154a0a <+314>:    sar    $0x3f,%rsi
    0x0000000000154a0e <+318>:    xor    %rsi,%rax
    0x0000000000154a11 <+321>:    sub    %rsi,%rax
    0x0000000000154a14 <+324>:    movslq 0x22c(%rdi),%rsi
    0x0000000000154a1b <+331>:    mov    %rsi,%r8
    0x0000000000154a1e <+334>:    sar    $0x3f,%r8
    0x0000000000154a22 <+338>:    xor    %r8,%rsi
    0x0000000000154a25 <+341>:    sub    %r8,%rsi
    0x0000000000154a28 <+344>:    add    %rsi,%rax
    0x0000000000154a2b <+347>:    mov    %eax,0x22c(%rdx)
    0x0000000000154a31 <+353>:    movslq 0x230(%rcx),%rax
    0x0000000000154a38 <+360>:    mov    %rax,%rsi
    0x0000000000154a3b <+363>:    sar    $0x3f,%rsi
    0x0000000000154a3f <+367>:    xor    %rsi,%rax
    0x0000000000154a42 <+370>:    sub    %rsi,%rax
    0x0000000000154a45 <+373>:    movslq 0x230(%rdi),%rsi
    0x0000000000154a4c <+380>:    mov    %rsi,%r8
    0x0000000000154a4f <+383>:    sar    $0x3f,%r8
    0x0000000000154a53 <+387>:    xor    %r8,%rsi
    0x0000000000154a56 <+390>:    sub    %r8,%rsi
    0x0000000000154a59 <+393>:    add    %rsi,%rax
    0x0000000000154a5c <+396>:    mov    %eax,0x230(%rdx)
    0x0000000000154a62 <+402>:    movslq 0x234(%rcx),%rax
    0x0000000000154a69 <+409>:    mov    %rax,%rcx
    0x0000000000154a6c <+412>:    sar    $0x3f,%rcx
    0x0000000000154a70 <+416>:    xor    %rcx,%rax
    0x0000000000154a73 <+419>:    sub    %rcx,%rax
    0x0000000000154a76 <+422>:    movslq 0x234(%rdi),%rcx
    0x0000000000154a7d <+429>:    mov    %rcx,%rsi
    0x0000000000154a80 <+432>:    sar    $0x3f,%rsi
    0x0000000000154a84 <+436>:    xor    %rsi,%rcx
    0x0000000000154a87 <+439>:    sub    %rsi,%rcx
    0x0000000000154a8a <+442>:    add    %rcx,%rax
    0x0000000000154a8d <+445>:    mov    %eax,0x234(%rdx)
    0x0000000000154a93 <+451>:    retq
    0x0000000000154a94 <+452>:    nopl   0x0(%rax)
    0x0000000000154a98 <+456>:    mov    %ecx,%eax
    0x0000000000154a9a <+458>:    mov    %r8d,%r8d
    0x0000000000154a9d <+461>:    mov    %edx,%edx
    0x0000000000154a9f <+463>:    lea    0x22(%rax),%rcx
    0x0000000000154aa3 <+467>:    lea    0x22(%rdx),%r9
    0x0000000000154aa7 <+471>:    shl    $0x4,%rax
    0x0000000000154aab <+475>:    lea    (%rdi,%rax,1),%rax
    0x0000000000154aaf <+479>:    shl    $0x4,%rdx
    0x0000000000154ab3 <+483>:    shl    $0x4,%rcx
    0x0000000000154ab7 <+487>:    shl    $0x4,%r9
    0x0000000000154abb <+491>:    add    %rdi,%rdx
    0x0000000000154abe <+494>:    movsbq 0x8(%rdi,%rcx,1),%rsi
    0x0000000000154ac4 <+500>:    mov    %rsi,%rcx
    0x0000000000154ac7 <+503>:    sar    $0x3f,%rcx
    0x0000000000154acb <+507>:    xor    %rcx,%rsi
    0x0000000000154ace <+510>:    sub    %rcx,%rsi
    0x0000000000154ad1 <+513>:    lea    0x22(%r8),%rcx
    0x0000000000154ad5 <+517>:    shl    $0x4,%r8
    0x0000000000154ad9 <+521>:    shl    $0x4,%rcx
    0x0000000000154add <+525>:    movsbq 0x8(%rdi,%rcx,1),%rcx
    0x0000000000154ae3 <+531>:    mov    %rcx,%r10
    0x0000000000154ae6 <+534>:    sar    $0x3f,%r10
    0x0000000000154aea <+538>:    xor    %r10,%rcx
    0x0000000000154aed <+541>:    sub    %r10,%rcx
    0x0000000000154af0 <+544>:    add    %rcx,%rsi
    0x0000000000154af3 <+547>:    mov    %sil,0x8(%rdi,%r9,1)
    0x0000000000154af8 <+552>:    movsbq 0x229(%rax),%rcx
    0x0000000000154b00 <+560>:    add    %r8,%rdi
    0x0000000000154b03 <+563>:    mov    %rcx,%rsi
    0x0000000000154b06 <+566>:    sar    $0x3f,%rsi
    0x0000000000154b0a <+570>:    xor    %rsi,%rcx
    0x0000000000154b0d <+573>:    sub    %rsi,%rcx
    0x0000000000154b10 <+576>:    movsbq 0x229(%rdi),%rsi
    0x0000000000154b18 <+584>:    mov    %rsi,%r8
    0x0000000000154b1b <+587>:    sar    $0x3f,%r8
    0x0000000000154b1f <+591>:    xor    %r8,%rsi
    0x0000000000154b22 <+594>:    sub    %r8,%rsi
    0x0000000000154b25 <+597>:    add    %rsi,%rcx
    0x0000000000154b28 <+600>:    mov    %cl,0x229(%rdx)
    0x0000000000154b2e <+606>:    movsbq 0x22a(%rax),%rcx
    0x0000000000154b36 <+614>:    mov    %rcx,%rsi
    0x0000000000154b39 <+617>:    sar    $0x3f,%rsi
    0x0000000000154b3d <+621>:    xor    %rsi,%rcx
    0x0000000000154b40 <+624>:    sub    %rsi,%rcx
    0x0000000000154b43 <+627>:    movsbq 0x22a(%rdi),%rsi
    0x0000000000154b4b <+635>:    mov    %rsi,%r8
    0x0000000000154b4e <+638>:    sar    $0x3f,%r8
    0x0000000000154b52 <+642>:    xor    %r8,%rsi
    0x0000000000154b55 <+645>:    sub    %r8,%rsi
    0x0000000000154b58 <+648>:    add    %rsi,%rcx
    0x0000000000154b5b <+651>:    mov    %cl,0x22a(%rdx)
    0x0000000000154b61 <+657>:    movsbq 0x22b(%rax),%rcx
    0x0000000000154b69 <+665>:    mov    %rcx,%rsi
    0x0000000000154b6c <+668>:    sar    $0x3f,%rsi
    0x0000000000154b70 <+672>:    xor    %rsi,%rcx
    0x0000000000154b73 <+675>:    sub    %rsi,%rcx
    0x0000000000154b76 <+678>:    movsbq 0x22b(%rdi),%rsi
    0x0000000000154b7e <+686>:    mov    %rsi,%r8
    0x0000000000154b81 <+689>:    sar    $0x3f,%r8
    0x0000000000154b85 <+693>:    xor    %r8,%rsi
    0x0000000000154b88 <+696>:    sub    %r8,%rsi
    0x0000000000154b8b <+699>:    add    %rsi,%rcx
    0x0000000000154b8e <+702>:    mov    %cl,0x22b(%rdx)
    0x0000000000154b94 <+708>:    movsbq 0x22c(%rax),%rcx
    0x0000000000154b9c <+716>:    mov    %rcx,%rsi
    0x0000000000154b9f <+719>:    sar    $0x3f,%rsi
    0x0000000000154ba3 <+723>:    xor    %rsi,%rcx
    0x0000000000154ba6 <+726>:    sub    %rsi,%rcx
    0x0000000000154ba9 <+729>:    movsbq 0x22c(%rdi),%rsi
    0x0000000000154bb1 <+737>:    mov    %rsi,%r8
    0x0000000000154bb4 <+740>:    sar    $0x3f,%r8
    0x0000000000154bb8 <+744>:    xor    %r8,%rsi
    0x0000000000154bbb <+747>:    sub    %r8,%rsi
    0x0000000000154bbe <+750>:    add    %rsi,%rcx
    0x0000000000154bc1 <+753>:    mov    %cl,0x22c(%rdx)
    0x0000000000154bc7 <+759>:    movsbq 0x22d(%rax),%rcx
    0x0000000000154bcf <+767>:    mov    %rcx,%rsi
    0x0000000000154bd2 <+770>:    sar    $0x3f,%rsi
    0x0000000000154bd6 <+774>:    xor    %rsi,%rcx
    0x0000000000154bd9 <+777>:    sub    %rsi,%rcx
    0x0000000000154bdc <+780>:    movsbq 0x22d(%rdi),%rsi
    0x0000000000154be4 <+788>:    mov    %rsi,%r8
    0x0000000000154be7 <+791>:    sar    $0x3f,%r8
    0x0000000000154beb <+795>:    xor    %r8,%rsi
    0x0000000000154bee <+798>:    sub    %r8,%rsi
    0x0000000000154bf1 <+801>:    add    %rsi,%rcx
    0x0000000000154bf4 <+804>:    mov    %cl,0x22d(%rdx)
    0x0000000000154bfa <+810>:    movsbq 0x22e(%rax),%rcx
    0x0000000000154c02 <+818>:    mov    %rcx,%rsi
    0x0000000000154c05 <+821>:    sar    $0x3f,%rsi
    0x0000000000154c09 <+825>:    xor    %rsi,%rcx
    0x0000000000154c0c <+828>:    sub    %rsi,%rcx
    0x0000000000154c0f <+831>:    movsbq 0x22e(%rdi),%rsi
    0x0000000000154c17 <+839>:    mov    %rsi,%r8
    0x0000000000154c1a <+842>:    sar    $0x3f,%r8
    0x0000000000154c1e <+846>:    xor    %r8,%rsi
    0x0000000000154c21 <+849>:    sub    %r8,%rsi
    0x0000000000154c24 <+852>:    add    %rsi,%rcx
    0x0000000000154c27 <+855>:    mov    %cl,0x22e(%rdx)
    0x0000000000154c2d <+861>:    movsbq 0x22f(%rax),%rcx
    0x0000000000154c35 <+869>:    mov    %rcx,%rsi
    0x0000000000154c38 <+872>:    sar    $0x3f,%rsi
    0x0000000000154c3c <+876>:    xor    %rsi,%rcx
    0x0000000000154c3f <+879>:    sub    %rsi,%rcx
    0x0000000000154c42 <+882>:    movsbq 0x22f(%rdi),%rsi
    0x0000000000154c4a <+890>:    mov    %rsi,%r8
    0x0000000000154c4d <+893>:    sar    $0x3f,%r8
    0x0000000000154c51 <+897>:    xor    %r8,%rsi
    0x0000000000154c54 <+900>:    sub    %r8,%rsi
    0x0000000000154c57 <+903>:    add    %rsi,%rcx
    0x0000000000154c5a <+906>:    mov    %cl,0x22f(%rdx)
    0x0000000000154c60 <+912>:    movsbq 0x230(%rax),%rcx
    0x0000000000154c68 <+920>:    mov    %rcx,%rsi
    0x0000000000154c6b <+923>:    sar    $0x3f,%rsi
    0x0000000000154c6f <+927>:    xor    %rsi,%rcx
    0x0000000000154c72 <+930>:    sub    %rsi,%rcx
    0x0000000000154c75 <+933>:    movsbq 0x230(%rdi),%rsi
    0x0000000000154c7d <+941>:    mov    %rsi,%r8
    0x0000000000154c80 <+944>:    sar    $0x3f,%r8
    0x0000000000154c84 <+948>:    xor    %r8,%rsi
    0x0000000000154c87 <+951>:    sub    %r8,%rsi
    0x0000000000154c8a <+954>:    add    %rsi,%rcx
    0x0000000000154c8d <+957>:    mov    %cl,0x230(%rdx)
    0x0000000000154c93 <+963>:    movsbq 0x231(%rax),%rcx
    0x0000000000154c9b <+971>:    mov    %rcx,%rsi
    0x0000000000154c9e <+974>:    sar    $0x3f,%rsi
    0x0000000000154ca2 <+978>:    xor    %rsi,%rcx
    0x0000000000154ca5 <+981>:    sub    %rsi,%rcx
    0x0000000000154ca8 <+984>:    movsbq 0x231(%rdi),%rsi
    0x0000000000154cb0 <+992>:    mov    %rsi,%r8
    0x0000000000154cb3 <+995>:    sar    $0x3f,%r8
    0x0000000000154cb7 <+999>:    xor    %r8,%rsi
    0x0000000000154cba <+1002>:   sub    %r8,%rsi
    0x0000000000154cbd <+1005>:   add    %rsi,%rcx
    0x0000000000154cc0 <+1008>:   mov    %cl,0x231(%rdx)
    0x0000000000154cc6 <+1014>:   movsbq 0x232(%rax),%rcx
    0x0000000000154cce <+1022>:   mov    %rcx,%rsi
    0x0000000000154cd1 <+1025>:   sar    $0x3f,%rsi
    0x0000000000154cd5 <+1029>:   xor    %rsi,%rcx
    0x0000000000154cd8 <+1032>:   sub    %rsi,%rcx
    0x0000000000154cdb <+1035>:   movsbq 0x232(%rdi),%rsi
    0x0000000000154ce3 <+1043>:   mov    %rsi,%r8
    0x0000000000154ce6 <+1046>:   sar    $0x3f,%r8
    0x0000000000154cea <+1050>:   xor    %r8,%rsi
    0x0000000000154ced <+1053>:   sub    %r8,%rsi
    0x0000000000154cf0 <+1056>:   add    %rsi,%rcx
    0x0000000000154cf3 <+1059>:   mov    %cl,0x232(%rdx)
    0x0000000000154cf9 <+1065>:   movsbq 0x233(%rdi),%rcx
    0x0000000000154d01 <+1073>:   mov    %rcx,%rsi
    0x0000000000154d04 <+1076>:   sar    $0x3f,%rsi
    0x0000000000154d08 <+1080>:   xor    %rsi,%rcx
    0x0000000000154d0b <+1083>:   sub    %rsi,%rcx
    0x0000000000154d0e <+1086>:   movsbq 0x233(%rax),%rsi
    0x0000000000154d16 <+1094>:   mov    %rsi,%r8
    0x0000000000154d19 <+1097>:   sar    $0x3f,%r8
    0x0000000000154d1d <+1101>:   xor    %r8,%rsi
    0x0000000000154d20 <+1104>:   sub    %r8,%rsi
    0x0000000000154d23 <+1107>:   add    %rsi,%rcx
    0x0000000000154d26 <+1110>:   mov    %cl,0x233(%rdx)
    0x0000000000154d2c <+1116>:   movsbq 0x234(%rdi),%rcx
    0x0000000000154d34 <+1124>:   mov    %rcx,%rsi
    0x0000000000154d37 <+1127>:   sar    $0x3f,%rsi
    0x0000000000154d3b <+1131>:   xor    %rsi,%rcx
    0x0000000000154d3e <+1134>:   sub    %rsi,%rcx
    0x0000000000154d41 <+1137>:   movsbq 0x234(%rax),%rsi
    0x0000000000154d49 <+1145>:   mov    %rsi,%r8
    0x0000000000154d4c <+1148>:   sar    $0x3f,%r8
    0x0000000000154d50 <+1152>:   xor    %r8,%rsi
    0x0000000000154d53 <+1155>:   sub    %r8,%rsi
    0x0000000000154d56 <+1158>:   add    %rsi,%rcx
    0x0000000000154d59 <+1161>:   mov    %cl,0x234(%rdx)
    0x0000000000154d5f <+1167>:   movsbq 0x235(%rax),%rcx
    0x0000000000154d67 <+1175>:   mov    %rcx,%rsi
    0x0000000000154d6a <+1178>:   sar    $0x3f,%rsi
    0x0000000000154d6e <+1182>:   xor    %rsi,%rcx
    0x0000000000154d71 <+1185>:   sub    %rsi,%rcx
    0x0000000000154d74 <+1188>:   movsbq 0x235(%rdi),%rsi
    0x0000000000154d7c <+1196>:   mov    %rsi,%r8
    0x0000000000154d7f <+1199>:   sar    $0x3f,%r8
    0x0000000000154d83 <+1203>:   xor    %r8,%rsi
    0x0000000000154d86 <+1206>:   sub    %r8,%rsi
    0x0000000000154d89 <+1209>:   add    %rsi,%rcx
    0x0000000000154d8c <+1212>:   mov    %cl,0x235(%rdx)
    0x0000000000154d92 <+1218>:   movsbq 0x236(%rdi),%rcx
    0x0000000000154d9a <+1226>:   mov    %rcx,%rsi
    0x0000000000154d9d <+1229>:   sar    $0x3f,%rsi
    0x0000000000154da1 <+1233>:   xor    %rsi,%rcx
    0x0000000000154da4 <+1236>:   sub    %rsi,%rcx
    0x0000000000154da7 <+1239>:   movsbq 0x236(%rax),%rsi
    0x0000000000154daf <+1247>:   mov    %rsi,%r8
    0x0000000000154db2 <+1250>:   sar    $0x3f,%r8
    0x0000000000154db6 <+1254>:   xor    %r8,%rsi
    0x0000000000154db9 <+1257>:   sub    %r8,%rsi
    0x0000000000154dbc <+1260>:   add    %rsi,%rcx
    0x0000000000154dbf <+1263>:   mov    %cl,0x236(%rdx)
    0x0000000000154dc5 <+1269>:   movsbq 0x237(%rax),%rax
    0x0000000000154dcd <+1277>:   mov    %rax,%rcx
    0x0000000000154dd0 <+1280>:   sar    $0x3f,%rcx
    0x0000000000154dd4 <+1284>:   xor    %rcx,%rax
    0x0000000000154dd7 <+1287>:   sub    %rcx,%rax
    0x0000000000154dda <+1290>:   movsbq 0x237(%rdi),%rcx
    0x0000000000154de2 <+1298>:   mov    %rcx,%rsi
    0x0000000000154de5 <+1301>:   sar    $0x3f,%rsi
    0x0000000000154de9 <+1305>:   xor    %rsi,%rcx
    0x0000000000154dec <+1308>:   sub    %rsi,%rcx
    0x0000000000154def <+1311>:   add    %rcx,%rax
    0x0000000000154df2 <+1314>:   mov    %al,0x237(%rdx)
    0x0000000000154df8 <+1320>:   retq
    0x0000000000154df9 <+1321>:   nopl   0x0(%rax)
    0x0000000000154e00 <+1328>:   mov    %ecx,%eax
    0x0000000000154e02 <+1330>:   mov    %r8d,%r8d
    0x0000000000154e05 <+1333>:   mov    %edx,%edx
    0x0000000000154e07 <+1335>:   lea    0x22(%rax),%rcx
    0x0000000000154e0b <+1339>:   lea    0x22(%rdx),%r9
    0x0000000000154e0f <+1343>:   shl    $0x4,%rax
    0x0000000000154e13 <+1347>:   lea    (%rdi,%rax,1),%rax
    0x0000000000154e17 <+1351>:   shl    $0x4,%rdx
    0x0000000000154e1b <+1355>:   shl    $0x4,%rcx
    0x0000000000154e1f <+1359>:   shl    $0x4,%r9
    0x0000000000154e23 <+1363>:   add    %rdi,%rdx
    0x0000000000154e26 <+1366>:   movswq 0x8(%rdi,%rcx,1),%rsi
    0x0000000000154e2c <+1372>:   mov    %rsi,%rcx
    0x0000000000154e2f <+1375>:   sar    $0x3f,%rcx
    0x0000000000154e33 <+1379>:   xor    %rcx,%rsi
    0x0000000000154e36 <+1382>:   sub    %rcx,%rsi
    0x0000000000154e39 <+1385>:   lea    0x22(%r8),%rcx
    0x0000000000154e3d <+1389>:   shl    $0x4,%r8
    0x0000000000154e41 <+1393>:   shl    $0x4,%rcx
    0x0000000000154e45 <+1397>:   movswq 0x8(%rdi,%rcx,1),%rcx
    0x0000000000154e4b <+1403>:   mov    %rcx,%r10
    0x0000000000154e4e <+1406>:   sar    $0x3f,%r10
    0x0000000000154e52 <+1410>:   xor    %r10,%rcx
    0x0000000000154e55 <+1413>:   sub    %r10,%rcx
    0x0000000000154e58 <+1416>:   add    %rcx,%rsi
    0x0000000000154e5b <+1419>:   mov    %si,0x8(%rdi,%r9,1)
    0x0000000000154e61 <+1425>:   movswq 0x22a(%rax),%rcx
    0x0000000000154e69 <+1433>:   add    %r8,%rdi
    0x0000000000154e6c <+1436>:   mov    %rcx,%rsi
    0x0000000000154e6f <+1439>:   sar    $0x3f,%rsi
    0x0000000000154e73 <+1443>:   xor    %rsi,%rcx
    0x0000000000154e76 <+1446>:   sub    %rsi,%rcx
    0x0000000000154e79 <+1449>:   movswq 0x22a(%rdi),%rsi
    0x0000000000154e81 <+1457>:   mov    %rsi,%r8
    0x0000000000154e84 <+1460>:   sar    $0x3f,%r8
    0x0000000000154e88 <+1464>:   xor    %r8,%rsi
    0x0000000000154e8b <+1467>:   sub    %r8,%rsi
    0x0000000000154e8e <+1470>:   add    %rsi,%rcx
    0x0000000000154e91 <+1473>:   mov    %cx,0x22a(%rdx)
    0x0000000000154e98 <+1480>:   movswq 0x22c(%rax),%rcx
    0x0000000000154ea0 <+1488>:   mov    %rcx,%rsi
    0x0000000000154ea3 <+1491>:   sar    $0x3f,%rsi
    0x0000000000154ea7 <+1495>:   xor    %rsi,%rcx
    0x0000000000154eaa <+1498>:   sub    %rsi,%rcx
    0x0000000000154ead <+1501>:   movswq 0x22c(%rdi),%rsi
    0x0000000000154eb5 <+1509>:   mov    %rsi,%r8
    0x0000000000154eb8 <+1512>:   sar    $0x3f,%r8
    0x0000000000154ebc <+1516>:   xor    %r8,%rsi
    0x0000000000154ebf <+1519>:   sub    %r8,%rsi
    0x0000000000154ec2 <+1522>:   add    %rsi,%rcx
    0x0000000000154ec5 <+1525>:   mov    %cx,0x22c(%rdx)
    0x0000000000154ecc <+1532>:   movswq 0x22e(%rax),%rcx
    0x0000000000154ed4 <+1540>:   mov    %rcx,%rsi
    0x0000000000154ed7 <+1543>:   sar    $0x3f,%rsi
    0x0000000000154edb <+1547>:   xor    %rsi,%rcx
    0x0000000000154ede <+1550>:   sub    %rsi,%rcx
    0x0000000000154ee1 <+1553>:   movswq 0x22e(%rdi),%rsi
    0x0000000000154ee9 <+1561>:   mov    %rsi,%r8
    0x0000000000154eec <+1564>:   sar    $0x3f,%r8
    0x0000000000154ef0 <+1568>:   xor    %r8,%rsi
    0x0000000000154ef3 <+1571>:   sub    %r8,%rsi
    0x0000000000154ef6 <+1574>:   add    %rsi,%rcx
    0x0000000000154ef9 <+1577>:   mov    %cx,0x22e(%rdx)
    0x0000000000154f00 <+1584>:   movswq 0x230(%rax),%rcx
    0x0000000000154f08 <+1592>:   mov    %rcx,%rsi
    0x0000000000154f0b <+1595>:   sar    $0x3f,%rsi
    0x0000000000154f0f <+1599>:   xor    %rsi,%rcx
    0x0000000000154f12 <+1602>:   sub    %rsi,%rcx
    0x0000000000154f15 <+1605>:   movswq 0x230(%rdi),%rsi
    0x0000000000154f1d <+1613>:   mov    %rsi,%r8
    0x0000000000154f20 <+1616>:   sar    $0x3f,%r8
    0x0000000000154f24 <+1620>:   xor    %r8,%rsi
    0x0000000000154f27 <+1623>:   sub    %r8,%rsi
    0x0000000000154f2a <+1626>:   add    %rsi,%rcx
    0x0000000000154f2d <+1629>:   mov    %cx,0x230(%rdx)
    0x0000000000154f34 <+1636>:   movswq 0x232(%rax),%rcx
    0x0000000000154f3c <+1644>:   mov    %rcx,%rsi
    0x0000000000154f3f <+1647>:   sar    $0x3f,%rsi
    0x0000000000154f43 <+1651>:   xor    %rsi,%rcx
    0x0000000000154f46 <+1654>:   sub    %rsi,%rcx
    0x0000000000154f49 <+1657>:   movswq 0x232(%rdi),%rsi
    0x0000000000154f51 <+1665>:   mov    %rsi,%r8
    0x0000000000154f54 <+1668>:   sar    $0x3f,%r8
    0x0000000000154f58 <+1672>:   xor    %r8,%rsi
    0x0000000000154f5b <+1675>:   sub    %r8,%rsi
    0x0000000000154f5e <+1678>:   add    %rsi,%rcx
    0x0000000000154f61 <+1681>:   mov    %cx,0x232(%rdx)
    0x0000000000154f68 <+1688>:   movswq 0x234(%rax),%rcx
    0x0000000000154f70 <+1696>:   mov    %rcx,%rsi
    0x0000000000154f73 <+1699>:   sar    $0x3f,%rsi
    0x0000000000154f77 <+1703>:   xor    %rsi,%rcx
    0x0000000000154f7a <+1706>:   sub    %rsi,%rcx
    0x0000000000154f7d <+1709>:   movswq 0x234(%rdi),%rsi
    0x0000000000154f85 <+1717>:   mov    %rsi,%r8
    0x0000000000154f88 <+1720>:   sar    $0x3f,%r8
    0x0000000000154f8c <+1724>:   xor    %r8,%rsi
    0x0000000000154f8f <+1727>:   sub    %r8,%rsi
    0x0000000000154f92 <+1730>:   add    %rsi,%rcx
    0x0000000000154f95 <+1733>:   mov    %cx,0x234(%rdx)
    0x0000000000154f9c <+1740>:   movswq 0x236(%rax),%rax
    0x0000000000154fa4 <+1748>:   mov    %rax,%rcx
    0x0000000000154fa7 <+1751>:   sar    $0x3f,%rcx
    0x0000000000154fab <+1755>:   xor    %rcx,%rax
    0x0000000000154fae <+1758>:   sub    %rcx,%rax
    0x0000000000154fb1 <+1761>:   movswq 0x236(%rdi),%rcx
    0x0000000000154fb9 <+1769>:   mov    %rcx,%rsi
    0x0000000000154fbc <+1772>:   sar    $0x3f,%rsi
    0x0000000000154fc0 <+1776>:   xor    %rsi,%rcx
    0x0000000000154fc3 <+1779>:   sub    %rsi,%rcx
    0x0000000000154fc6 <+1782>:   add    %rcx,%rax
    0x0000000000154fc9 <+1785>:   mov    %ax,0x236(%rdx)
    0x0000000000154fd0 <+1792>:   retq
    0x0000000000154fd1 <+1793>:   lea    0x14faa8(%rip),%rcx        # 0x2a4a80 
<__PRETTY_FUNCTION__.25843>
    0x0000000000154fd8 <+1800>:   lea    0x14ef81(%rip),%rsi        # 0x2a3f60
    0x0000000000154fdf <+1807>:   lea    0x1da975(%rip),%rdi        # 0x32f95b
    0x0000000000154fe6 <+1814>:   sub    $0x8,%rsp
    0x0000000000154fea <+1818>:   mov    $0x368,%edx
    0x0000000000154fef <+1823>:   callq  0x8f170


Signed-off-by: Mateja Marjanovic <address@hidden>
---
  target/mips/msa_helper.c | 43 ++++++++++++++++++++++++++++++-------------
  1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 4c7ec05..1152fda 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -804,28 +804,45 @@ void helper_msa_ ## func ## _df(CPUMIPSState *env, 
uint32_t df,         \
      wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
      wr_t *pws = &(env->active_fpu.fpr[ws].wr);                          \
      wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
\
If we can ensure alignment for the various vector registers then the
compiler always has the option of using host vectors (certainly for int
and logic operations).
Very interesting, could you please tell me more about that, so I can
understand better.

Thanks,
Mateja
-    uint32_t i;                                                         \
                                                                          \
      switch (df) {                                                       \
      case DF_BYTE:                                                       \
-        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {                    \
-            pwd->b[i] = msa_ ## func ## _df(df, pws->b[i], pwt->b[i]);  \
-        }                                                               \
+        pwd->b[0]  = msa_ ## func ## _df(df, pws->b[0], pwt->b[0]);     \
+        pwd->b[1]  = msa_ ## func ## _df(df, pws->b[1], pwt->b[1]);     \
+        pwd->b[2]  = msa_ ## func ## _df(df, pws->b[2], pwt->b[2]);     \
+        pwd->b[3]  = msa_ ## func ## _df(df, pws->b[3], pwt->b[3]);     \
+        pwd->b[4]  = msa_ ## func ## _df(df, pws->b[4], pwt->b[4]);     \
+        pwd->b[5]  = msa_ ## func ## _df(df, pws->b[5], pwt->b[5]);     \
+        pwd->b[6]  = msa_ ## func ## _df(df, pws->b[6], pwt->b[6]);     \
+        pwd->b[7]  = msa_ ## func ## _df(df, pws->b[7], pwt->b[7]);     \
+        pwd->b[8]  = msa_ ## func ## _df(df, pws->b[8], pwt->b[8]);     \
+        pwd->b[9]  = msa_ ## func ## _df(df, pws->b[9], pwt->b[9]);     \
+        pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], pwt->b[10]);   \
+        pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], pwt->b[11]);   \
+        pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], pwt->b[12]);   \
+        pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], pwt->b[13]);   \
+        pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], pwt->b[14]);   \
+        pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], pwt->b[15]);   \
          break;                                                          \
      case DF_HALF:                                                       \
-        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {                    \
-            pwd->h[i] = msa_ ## func ## _df(df, pws->h[i], pwt->h[i]);  \
-        }                                                               \
+        pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], pwt->h[0]);      \
+        pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], pwt->h[1]);      \
+        pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], pwt->h[2]);      \
+        pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], pwt->h[3]);      \
+        pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], pwt->h[4]);      \
+        pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], pwt->h[5]);      \
+        pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], pwt->h[6]);      \
+        pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], pwt->h[7]);      \
          break;                                                          \
      case DF_WORD:                                                       \
-        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {                    \
-            pwd->w[i] = msa_ ## func ## _df(df, pws->w[i], pwt->w[i]);  \
-        }                                                               \
+        pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], pwt->w[0]);      \
+        pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], pwt->w[1]);      \
+        pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], pwt->w[2]);      \
+        pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], pwt->w[3]);      \
          break;                                                          \
      case DF_DOUBLE:                                                     \
-        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {                  \
-            pwd->d[i] = msa_ ## func ## _df(df, pws->d[i], pwt->d[i]);  \
-        }                                                               \
+        pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], pwt->d[0]);      \
+        pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], pwt->d[1]);      \
          break;                                                          \
      default:                                                            \
          assert(0);                                                      \

--
Alex Bennée





reply via email to

[Prev in Thread] Current Thread [Next in Thread]