qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v3 2/5] target-ppc: improve lxvw4x implementation


From: Nikunj A Dadhania
Subject: Re: [Qemu-devel] [PATCH v3 2/5] target-ppc: improve lxvw4x implementation
Date: Mon, 19 Sep 2016 16:06:40 +0530
User-agent: Notmuch/0.21 (https://notmuchmail.org) Emacs/25.0.94.1 (x86_64-redhat-linux-gnu)

David Gibson <address@hidden> writes:
> [ Unknown signature status ]
> On Mon, Sep 19, 2016 at 04:19:34PM +1000, David Gibson wrote:
>> On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote:
>> > diff --git a/target-ppc/translate/vsx-impl.inc.c 
>> > b/target-ppc/translate/vsx-impl.inc.c
>> > index eee6052..df278df 100644
>> > --- a/target-ppc/translate/vsx-impl.inc.c
>> > +++ b/target-ppc/translate/vsx-impl.inc.c
>> > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx)
>> >  static void gen_lxvw4x(DisasContext *ctx)
>> >  {
>> >      TCGv EA;
>> > -    TCGv_i64 tmp;
>> >      TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode));
>> >      TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode));
>> >      if (unlikely(!ctx->vsx_enabled)) {
>> > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx)
>> >      }
>> >      gen_set_access_type(ctx, ACCESS_INT);
>> >      EA = tcg_temp_new();
>> > -    tmp = tcg_temp_new_i64();
>> >  
>> >      gen_addr_reg_index(ctx, EA);
>> > -    gen_qemu_ld32u_i64(ctx, tmp, EA);
>> > -    tcg_gen_addi_tl(EA, EA, 4);
>> > -    gen_qemu_ld32u_i64(ctx, xth, EA);
>> > -    tcg_gen_deposit_i64(xth, xth, tmp, 32, 32);
>> > -
>> > -    tcg_gen_addi_tl(EA, EA, 4);
>> > -    gen_qemu_ld32u_i64(ctx, tmp, EA);
>> > -    tcg_gen_addi_tl(EA, EA, 4);
>> > -    gen_qemu_ld32u_i64(ctx, xtl, EA);
>> > -    tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32);
>> > -
>> > +    tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ);
>> > +    gen_helper_deposit32x2(xth, xth);
>> > +    tcg_gen_addi_tl(EA, EA, 8);
>> > +    tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ);
>> > +    gen_helper_deposit32x2(xtl, xtl);
>
> ..and I think this is wrong for BE mode.  The deposit32x2 will get the
> words in the right order, but the bytes within each word will be wrong
> because of the LE mode load on a BE setup.

Since lxvw4x/stxvw4x are available on POWER8, I ran my test code on
BE and LE Fedora 24 VMs. The TCG results match the POWER8 hardware:
the byte order within each word is not changed. A snippet of the test
code is at the end of this email. I can share the full code if needed
(maybe I will add it to kvm-unit-tests).

Fedora24VM BE:
==============

    address@hidden ~]$ uname -a
    Linux cloudimg.localdomain 4.5.5-300.fc24.ppc64 #1 SMP Tue May 24 12:24:54 UTC 2016 ppc64 ppc64 ppc64 GNU/Linux
    address@hidden ~]$ ./lxv_x
    VRT32 = 00010203 20212223 30313233 40414243 
    
    address@hidden ~]$ ./stxv_x 
     E0E1E2E3  E4E5E6E7  F0F1F2F3  F4F5F6F7 


TCG Result BE:
==============
    $ ./ppc64-linux-user/qemu-ppc64  -cpu POWER9 lxv_x
    VRT32 = 00010203 20212223 30313233 40414243 
    
    $ ./ppc64-linux-user/qemu-ppc64  -cpu POWER9 stxv_x
     E0E1E2E3  E4E5E6E7  F0F1F2F3  F4F5F6F7


Fedora24VM LE:
==============
    address@hidden ~]$ uname -a
    Linux cloudimg.localdomain 4.5.5-300.fc24.ppc64le #1 SMP Tue May 24 12:23:26 UTC 2016 ppc64le ppc64le ppc64le GNU/Linux
    address@hidden ~]$ ./lxv_x 
    VRT32 = 40414243 30313233 20212223 00010203 
    
    address@hidden ~]$ ./stxv_x 
     F4F5F6F7  F0F1F2F3  E4E5E6E7  E0E1E2E3 

TCG Result LE:
==============
    $ ./ppc64le-linux-user/qemu-ppc64le  -cpu POWER9 lxv_x
    VRT32 = 40414243 30313233 20212223 00010203 
    
    $ ./ppc64le-linux-user/qemu-ppc64le  -cpu POWER9 stxv_x
     F4F5F6F7  F0F1F2F3  E4E5E6E7  E0E1E2E3 

Regards,
Nikunj


vsx.h:
======
/* Number of uint32_t elements that fit in one __vector uint32_t. */
#define U32_SIZE (sizeof(__vector uint32_t) / sizeof(uint32_t))

/*
 * Overlay of a vector with an array of its 32-bit elements, so that the
 * individual words of a vector value can be read or written by index.
 */
typedef union {
    __vector uint32_t v;    /* the value viewed as a whole vector */
    uint32_t a[U32_SIZE];   /* the same storage viewed word by word */
} vuint32_t;

/*
 * Print each 32-bit element of vector v to stdout as eight lowercase hex
 * digits followed by a space, then terminate the line.
 *
 * Elements are emitted in the array-index order of the vuint32_t overlay.
 */
static void vec_put_u32(__vector uint32_t v) {
    vuint32_t overlay = { .v = v };
    int idx = 0;

    while (idx < U32_SIZE) {
        printf("%08x ", overlay.a[idx]);
        ++idx;
    }

    printf("\n");
}

/*
 * Print the four 32-bit words starting at p to stdout, each as eight
 * uppercase hex digits with one space on either side, then terminate the
 * line. Prints nothing when p is NULL.
 */
static void print4x4(uint32_t *p)
{
    if (p) {
        const uint32_t *cursor = p;
        const uint32_t *const end = p + 4;

        while (cursor != end) {
            printf(" %08X ", *cursor);
            ++cursor;
        }
        printf("\n");
    }
}

lxv_x.c:
========
  uint32_t rb32[4] = {0x00010203, 0x20212223, 0x30313233, 0x40414243};
  vuint32_t vrt32;
  
  asm("lxvw4x %x0, 0, %1 \n\t" \
      : "=ws"(vrt32) : "r"(&rb32));
  printf("VRT32 = "); vec_put_u32(vrt32);

stxv_x.c:
=========
  vuint32_t vrt32;

  /* Fill the vector word by word; every byte is distinct so any
   * reordering is visible in the stored result. */
  vrt32.a[0] = 0xE0E1E2E3;
  vrt32.a[1] = 0xE4E5E6E7;
  vrt32.a[2] = 0xF0F1F2F3;
  vrt32.a[3] = 0xF4F5F6F7;

  /*
   * Store the vector to the memory at rb32. The RA field is the literal 0,
   * so the effective address is just the pointer operand.
   *
   * The instruction writes rb32 through the pointer, which the compiler
   * cannot see. The "memory" clobber tells it that memory changed, so
   * print4x4() reads the stored words rather than a stale copy.
   *
   * NOTE(review): rb32 is not declared in this snippet. It is presumably
   * uint32_t rb32[4] as in lxv_x.c; confirm against the full test source.
   */
  asm("stxvw4x %x0, 0, %1 \n\t"
      : : "ws"(vrt32.v), "r"(&rb32) : "memory");
  print4x4(rb32);




reply via email to

[Prev in Thread] Current Thread [Next in Thread]