[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 10/18] tcg/i386: add support for vector opcodes
From: |
Alex Bennée |
Subject: |
Re: [Qemu-devel] [PATCH 10/18] tcg/i386: add support for vector opcodes |
Date: |
Fri, 27 Jan 2017 14:51:04 +0000 |
User-agent: |
mu4e 0.9.19; emacs 25.1.91.4 |
Kirill Batuzov <address@hidden> writes:
> To be able to generate vector operations in a TCG backend we need to do
> several things.
>
> 1. We need to tell the register allocator about the vector target's registers.
> In case of x86 we'll use xmm0..xmm7. xmm7 is designated as a scratch
> register, others can be used by the register allocator.
>
> 2. We need a new constraint to indicate where to use vector registers. In
> this commit the 'V' constraint is introduced.
>
> 3. We need to be able to generate bare minimum: load, store and reg-to-reg
> move. MOVDQU is used for loads and stores. MOVDQA is used for reg-to-reg
> moves.
>
> 4. Finally we need to support any other opcodes we want. INDEX_op_add_i32x4
> is the only one for now. The PADDD instruction handles it perfectly.
>
> Signed-off-by: Kirill Batuzov <address@hidden>
This currently fails to apply cleanly to master because of other updates
however I see you have changes to make so I assume you'll re-base then ;-)
> ---
> tcg/i386/tcg-target.h | 24 +++++++++-
> tcg/i386/tcg-target.inc.c | 109
> +++++++++++++++++++++++++++++++++++++++++++---
> 2 files changed, 125 insertions(+), 8 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
> index 524cfc6..974a58b 100644
> --- a/tcg/i386/tcg-target.h
> +++ b/tcg/i386/tcg-target.h
> @@ -29,8 +29,14 @@
> #define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
>
> #ifdef __x86_64__
> -# define TCG_TARGET_REG_BITS 64
> -# define TCG_TARGET_NB_REGS 16
> +# define TCG_TARGET_HAS_REG128 1
> +# ifdef TCG_TARGET_HAS_REG128
> +# define TCG_TARGET_REG_BITS 64
> +# define TCG_TARGET_NB_REGS 24
> +# else
> +# define TCG_TARGET_REG_BITS 64
> +# define TCG_TARGET_NB_REGS 16
> +# endif
> #else
> # define TCG_TARGET_REG_BITS 32
> # define TCG_TARGET_NB_REGS 8
> @@ -56,6 +62,16 @@ typedef enum {
> TCG_REG_R13,
> TCG_REG_R14,
> TCG_REG_R15,
> +#ifdef TCG_TARGET_HAS_REG128
> + TCG_REG_XMM0,
> + TCG_REG_XMM1,
> + TCG_REG_XMM2,
> + TCG_REG_XMM3,
> + TCG_REG_XMM4,
> + TCG_REG_XMM5,
> + TCG_REG_XMM6,
> + TCG_REG_XMM7,
> +#endif
> TCG_REG_RAX = TCG_REG_EAX,
> TCG_REG_RCX = TCG_REG_ECX,
> TCG_REG_RDX = TCG_REG_EDX,
> @@ -133,6 +149,10 @@ extern bool have_bmi1;
> #define TCG_TARGET_HAS_mulsh_i64 0
> #endif
>
> +#ifdef TCG_TARGET_HAS_REG128
> +#define TCG_TARGET_HAS_add_i32x4 1
> +#endif
> +
> #define TCG_TARGET_deposit_i32_valid(ofs, len) \
> (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
> ((ofs) == 0 && (len) == 16))
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index eeb1777..69e3198 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -32,6 +32,9 @@ static const char * const
> tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
> #else
> "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
> #endif
> +#ifdef TCG_TARGET_HAS_REG128
> + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
> +#endif
> };
> #endif
>
> @@ -61,6 +64,16 @@ static const int tcg_target_reg_alloc_order[] = {
> TCG_REG_EDX,
> TCG_REG_EAX,
> #endif
> +#ifdef TCG_TARGET_HAS_REG128
> + TCG_REG_XMM0,
> + TCG_REG_XMM1,
> + TCG_REG_XMM2,
> + TCG_REG_XMM3,
> + TCG_REG_XMM4,
> + TCG_REG_XMM5,
> + TCG_REG_XMM6,
> +/* TCG_REG_XMM7, <- scratch register */
> +#endif
> };
>
> static const int tcg_target_call_iarg_regs[] = {
> @@ -247,6 +260,10 @@ static int target_parse_constraint(TCGArgConstraint *ct,
> const char **pct_str)
> case 'I':
> ct->ct |= TCG_CT_CONST_I32;
> break;
> + case 'V':
> + ct->ct |= TCG_CT_REG;
> + tcg_regset_set32(ct->u.regs, 0, 0xff0000);
> + break;
>
> default:
> return -1;
> @@ -301,6 +318,9 @@ static inline int tcg_target_const_match(tcg_target_long
> val, TCGType type,
> #define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
> #define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
>
> +#define P_SSE_660F (P_DATA16 | P_EXT)
> +#define P_SSE_F30F (P_SIMDF3 | P_EXT)
> +
> #define OPC_ARITH_EvIz (0x81)
> #define OPC_ARITH_EvIb (0x83)
> #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
> @@ -351,6 +371,11 @@ static inline int tcg_target_const_match(tcg_target_long
> val, TCGType type,
> #define OPC_GRP3_Ev (0xf7)
> #define OPC_GRP5 (0xff)
>
> +#define OPC_MOVDQU_M2R (0x6f | P_SSE_F30F) /* load 128-bit value */
> +#define OPC_MOVDQU_R2M (0x7f | P_SSE_F30F) /* store 128-bit value */
> +#define OPC_MOVDQA_R2R (0x6f | P_SSE_660F) /* reg-to-reg 128-bit mov */
> +#define OPC_PADDD (0xfe | P_SSE_660F)
> +
> /* Group 1 opcode extensions for 0x80-0x83.
> These are also used as modifiers for OPC_ARITH. */
> #define ARITH_ADD 0
> @@ -428,6 +453,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r,
> int rm, int x)
> tcg_debug_assert((opc & P_REXW) == 0);
> tcg_out8(s, 0x66);
> }
> + if (opc & P_SIMDF3) {
> + tcg_out8(s, 0xf3);
> + }
> if (opc & P_ADDR32) {
> tcg_out8(s, 0x67);
> }
> @@ -634,9 +662,24 @@ static inline void tgen_arithr(TCGContext *s, int subop,
> int dest, int src)
> static inline void tcg_out_mov(TCGContext *s, TCGType type,
> TCGReg ret, TCGReg arg)
> {
> + int opc;
> if (arg != ret) {
> - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
> - tcg_out_modrm(s, opc, ret, arg);
> + switch (type) {
> +#ifdef TCG_TARGET_HAS_REG128
> + case TCG_TYPE_V128:
> + ret -= TCG_REG_XMM0;
> + arg -= TCG_REG_XMM0;
> + tcg_out_modrm(s, OPC_MOVDQA_R2R, ret, arg);
> + break;
> +#endif
> + case TCG_TYPE_I32:
> + case TCG_TYPE_I64:
> + opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
> + tcg_out_modrm(s, opc, ret, arg);
> + break;
> + default:
> + assert(0);
> + }
> }
> }
>
> @@ -711,15 +754,43 @@ static inline void tcg_out_pop(TCGContext *s, int reg)
> static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
> TCGReg arg1, intptr_t arg2)
> {
> - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
> - tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
> + int opc;
> + switch (type) {
> +#ifdef TCG_TARGET_HAS_REG128
> + case TCG_TYPE_V128:
> + ret -= TCG_REG_XMM0;
> + tcg_out_modrm_offset(s, OPC_MOVDQU_M2R, ret, arg1, arg2);
> + break;
> +#endif
> + case TCG_TYPE_I32:
> + case TCG_TYPE_I64:
> + opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
> + tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
> + break;
> + default:
> + assert(0);
> + }
> }
>
> static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
> TCGReg arg1, intptr_t arg2)
> {
> - int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
> - tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
> + int opc;
> + switch (type) {
> +#ifdef TCG_TARGET_HAS_REG128
> + case TCG_TYPE_V128:
> + arg -= TCG_REG_XMM0;
> + tcg_out_modrm_offset(s, OPC_MOVDQU_R2M, arg, arg1, arg2);
> + break;
> +#endif
> + case TCG_TYPE_I32:
> + case TCG_TYPE_I64:
> + opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
> + tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
> + break;
> + default:
> + assert(0);
> + }
> }
>
> static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
> @@ -1856,6 +1927,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode
> opc,
> case INDEX_op_ld_i32:
> tcg_out_ld(s, TCG_TYPE_I32, args[0], args[1], args[2]);
> break;
> +#ifdef TCG_TARGET_HAS_REG128
> + case INDEX_op_ld_v128:
> + tcg_out_ld(s, TCG_TYPE_V128, args[0], args[1], args[2]);
> + break;
> +#endif
>
> OP_32_64(st8):
> if (const_args[0]) {
> @@ -1888,6 +1964,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode
> opc,
> tcg_out_st(s, TCG_TYPE_I32, args[0], args[1], args[2]);
> }
> break;
> +#ifdef TCG_TARGET_HAS_REG128
> + case INDEX_op_st_v128:
> + tcg_out_st(s, TCG_TYPE_V128, args[0], args[1], args[2]);
> + break;
> +#endif
>
> OP_32_64(add):
> /* For 3-operand addition, use LEA. */
> @@ -2146,6 +2227,13 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode
> opc,
> case INDEX_op_mb:
> tcg_out_mb(s, args[0]);
> break;
> +
> +#ifdef TCG_TARGET_HAS_REG128
> + case INDEX_op_add_i32x4:
> + tcg_out_modrm(s, OPC_PADDD, args[0], args[2]);
> + break;
> +#endif
> +
> case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
> case INDEX_op_mov_i64:
> case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
> @@ -2171,6 +2259,11 @@ static const TCGTargetOpDef x86_op_defs[] = {
> { INDEX_op_st16_i32, { "ri", "r" } },
> { INDEX_op_st_i32, { "ri", "r" } },
>
> +#ifdef TCG_TARGET_HAS_REG128
> + { INDEX_op_ld_v128, { "V", "r" } },
> + { INDEX_op_st_v128, { "V", "r" } },
> +#endif
> +
> { INDEX_op_add_i32, { "r", "r", "ri" } },
> { INDEX_op_sub_i32, { "r", "0", "ri" } },
> { INDEX_op_mul_i32, { "r", "0", "ri" } },
> @@ -2289,6 +2382,10 @@ static const TCGTargetOpDef x86_op_defs[] = {
> { INDEX_op_qemu_ld_i64, { "r", "r", "L", "L" } },
> { INDEX_op_qemu_st_i64, { "L", "L", "L", "L" } },
> #endif
> +
> +#ifdef TCG_TARGET_HAS_REG128
> + { INDEX_op_add_i32x4, { "V", "0", "V" } },
> +#endif
> { -1 },
> };
--
Alex Bennée
- Re: [Qemu-devel] [PATCH 06/18] tcg: allow globals to overlap, (continued)
- [Qemu-devel] [PATCH 08/18] target/arm: support access to vector guest registers as globals, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 09/18] target/arm: use vector opcode to handle vadd.<size> instruction, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 13/18] tcg: do not rely on exact values of MO_BSWAP or MO_SIGN in backend, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 11/18] tcg/i386: support 64-bit vector operations, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 10/18] tcg/i386: add support for vector opcodes, Kirill Batuzov, 2017/01/17
- Re: [Qemu-devel] [PATCH 10/18] tcg/i386: add support for vector opcodes,
Alex Bennée <=
- [Qemu-devel] [PATCH 12/18] tcg/i386: support remaining vector addition operations, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 14/18] tcg: introduce new TCGMemOp - MO_128, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 07/18] tcg: add vector addition operations, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 15/18] tcg: introduce qemu_ld_v128 and qemu_st_v128 opcodes, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 16/18] softmmu: create helpers for vector loads, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 17/18] tcg/i386: add support for qemu_ld_v128/qemu_st_v128 ops, Kirill Batuzov, 2017/01/17
- [Qemu-devel] [PATCH 18/18] target/arm: load two consecutive 64-bits vector regs as a 128-bit vector reg, Kirill Batuzov, 2017/01/17
- Re: [Qemu-devel] [PATCH 00/18] Emulate guest vector operations with host vector operations, Alex Bennée, 2017/01/27