[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 28/37] target/i386: reimplement 0x0f 0x38, add AVX
From: Richard Henderson
Subject: Re: [PATCH 28/37] target/i386: reimplement 0x0f 0x38, add AVX
Date: Tue, 13 Sep 2022 10:31:37 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.11.0
On 9/12/22 00:04, Paolo Bonzini wrote:
+void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+ uint64_t zf = 0, cf = 0;
uint32_t, to match the size of the operation.
+ int i;
+
+ for (i = 0; i < 2 << SHIFT; i++) {
+ zf |= (s->L(i) & d->L(i));
+ cf |= (s->L(i) & ~d->L(i));
+ }
+void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
+ Reg *v, Reg *s, target_ulong a0)
+{
+ int i;
+
+ for (i = 0; i < (2 << SHIFT); i++) {
+ if (v->L(i) >> 31) {
+ cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
+ }
+ }
+}
+
+void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
+ Reg *v, Reg *s, target_ulong a0)
+{
+ int i;
+
+ for (i = 0; i < (1 << SHIFT); i++) {
+ if (v->Q(i) >> 63) {
+ cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
+ }
+ }
+}
Any idea if hw will write incomplete data if the pieces cross page boundaries, and the
second page is invalid? We're not good at that for any other vector sized write, though,
so not critical.
+void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+ int i;
+
+ for (i = 0; i < (2 << SHIFT); i++) {
+ d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0;
+ }
+}
This is tcg_gen_cmpsel_vec(TCG_COND_LT, d, v, zero, s, zero).
+void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
+ Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
+{
+ int i;
+ for (i = 0; i < (2 << SHIFT); i++) {
+ if (v->L(i) >> 31) {
+ target_ulong addr = a0
+ + ((target_ulong)(int32_t)s->L(i) << scale);
+ d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
+ }
+ v->L(i) = 0;
+ }
+}
Better to not modify registers until all potential #GP are raised.
Also, some missing whitespace between functions.
+ [0x2f] = X86_OP_ENTRY3(,x, vex4 cpuid(SSE41) avx2_256 p_66),
Whee! Mailer really chomped down on this series.
@@ -384,8 +484,8 @@ static const X86OpEntry opcodes_0F3A[256] = {
[0x0b] = X86_OP_ENTRY4(VROUNDSD, V,x, H,x, W,sd, vex3 cpuid(SSE41) p_66),
[0x0c] = X86_OP_ENTRY4(VBLENDPS, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66),
[0x0d] = X86_OP_ENTRY4(VBLENDPD, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66),
- [0x0e] = X86_OP_ENTRY4(VPBLENDW, V,x, H,x, W,x, vex4 cpuid(SSE41) p_66),
- [0x0f] = X86_OP_ENTRY4(PALIGNR, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx p_00_66),
+ [0x0e] = X86_OP_ENTRY4(VPBLENDW, V,x, H,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66),
+ [0x0f] = X86_OP_ENTRY4(PALIGNR, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
Squash back.
+ case X86_SPECIAL_AVXExtMov:
+ if (!decode.op[2].has_ea) {
+ decode.op[2].ot = s->vex_l ? MO_128 : MO_256;
+ } else if (s->vex_l) {
+ decode.op[2].ot++;
+ }
Clever.
+BINARY_INT_SSE(VPMINSB, pminsb)
+BINARY_INT_SSE(VPMINUW, pminuw)
+BINARY_INT_SSE(VPMINUD, pminud)
+BINARY_INT_SSE(VPMINSD, pminsd)
+BINARY_INT_SSE(VPMAXSB, pmaxsb)
+BINARY_INT_SSE(VPMAXUW, pmaxuw)
+BINARY_INT_SSE(VPMAXUD, pmaxud)
+BINARY_INT_SSE(VPMAXSD, pmaxsd)
tcg_gen_gvec_{u,s}{min,max}.
+/* Same as above, but with extra arguments to the helper. */
+static inline void gen_vsib_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+ SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
+ SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
+{
+ SSEFunc_0_epppti d = s->vex_l ? d_ymm : d_xmm;
+ SSEFunc_0_epppti q = s->vex_l ? q_ymm : q_xmm;
+ SSEFunc_0_epppti fn = s->rex_w ? q : d;
+ TCGv_i32 scale = tcg_const_i32(decode->mem.scale);
tcg_constant_i32.
+static void gen_VPBROADCASTB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = sse_vec_len(s, decode);
+
+ tcg_gen_ld8u_i32(s->tmp2_i32, s->ptr2, 0);
+ tcg_gen_gvec_dup_i32(MO_8, decode->op[0].offset, vec_len, vec_len, s->tmp2_i32);
+}
This is better done with tcg_gen_gvec_dup_mem, where you pass the cpu_env offset of the
source data. This lets the host use mem->reg broadcast, which turns out to be more
available than reg->reg broadcast.
+static void gen_VPBROADCASTW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VPBROADCASTD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VPBROADCASTQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
Likewise.
+static inline void gen_VBROADCASTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset,
+ decode->op[2].offset, 16, 16);
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset + offsetof(YMMReg, YMM_X(1)),
+ decode->op[2].offset, 16, 16);
+}
tcg_gen_gvec_dup_mem(MO_128, ...);
r~
- Re: [PATCH 25/37] target/i386: reimplement 0x0f 0xd0-0xd7, 0xe0-0xe7, 0xf0-0xf7, add AVX, (continued)
- [PATCH 26/37] target/i386: reimplement 0x0f 0x3a, add AVX, Paolo Bonzini, 2022/09/11
- [PATCH 28/37] target/i386: reimplement 0x0f 0x38, add AVX, Paolo Bonzini, 2022/09/11
- Re: [PATCH 28/37] target/i386: reimplement 0x0f 0x38, add AVX, Richard Henderson <=
[PATCH 29/37] target/i386: reimplement 0x0f 0xc2, 0xc4-0xc6, add AVX, Paolo Bonzini, 2022/09/11
[PATCH 31/37] target/i386: reimplement 0x0f 0x28-0x2f, add AVX, Paolo Bonzini, 2022/09/11
[PATCH 30/37] target/i386: reimplement 0x0f 0x10-0x17, add AVX, Paolo Bonzini, 2022/09/11