[PATCH v3 15/23] i386: Destructive vector helpers for AVX
From: Paolo Bonzini
Subject: [PATCH v3 15/23] i386: Destructive vector helpers for AVX
Date: Thu, 1 Sep 2022 09:48:34 +0200
From: Paul Brook <paul@nowt.org>
These helpers need to take special care to avoid overwriting source values
before the whole result has been calculated. Currently they use a dummy
Reg-typed variable to store the result and then assign the whole register.
With AVX this would cause 128-bit operations to corrupt the upper half of the
register, so replace the dummy register with explicit temporaries and
per-element assignments.
Signed-off-by: Paul Brook <paul@nowt.org>
Message-Id: <20220424220204.2493824-14-paul@nowt.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/ops_sse.h | 556 ++++++++++++++++++++----------------------
1 file changed, 262 insertions(+), 294 deletions(-)
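
[Editor's note: the pattern being replaced versus the one being introduced can be
illustrated with a minimal, self-contained C sketch. This is not the QEMU code
itself; the my_reg_t type and the shuf_* functions below are invented purely for
illustration. The old helpers built the whole result in a dummy Reg and copied it
over the destination, which also rewrites elements the operation should not touch;
the new helpers read into scalar temporaries and store only the elements they
actually produce, which also stays correct when destination and source alias.]

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t L[8]; } my_reg_t;    /* stand-in for a 256-bit Reg */

/* Old pattern: build the result in a dummy register, then copy it all.
 * For an operation that only produces L[0..3], this also overwrites
 * L[4..7] of the destination with uninitialised data. */
void shuf_dummy(my_reg_t *d, const my_reg_t *s, int order)
{
    my_reg_t r;                                /* L[4..7] never initialised */
    for (int i = 0; i < 4; i++) {
        r.L[i] = s->L[(order >> (2 * i)) & 3];
    }
    *d = r;                                    /* clobbers d->L[4..7] too */
}

/* New pattern: read into scalar temporaries, then store only the elements
 * the operation actually produces; d->L[4..7] is left untouched and the
 * helper stays correct even when d and s alias. */
void shuf_elems(my_reg_t *d, const my_reg_t *s, int order)
{
    uint32_t r0 = s->L[(order >> 0) & 3];
    uint32_t r1 = s->L[(order >> 2) & 3];
    uint32_t r2 = s->L[(order >> 4) & 3];
    uint32_t r3 = s->L[(order >> 6) & 3];
    d->L[0] = r0;
    d->L[1] = r1;
    d->L[2] = r2;
    d->L[3] = r3;
}

int main(void)
{
    my_reg_t a = { { 10, 11, 12, 13, 94, 95, 96, 97 } };
    shuf_elems(&a, &a, 0x1B);                  /* reverse the low four elements in place */
    for (int i = 0; i < 8; i++) {
        printf("%u ", (unsigned)a.L[i]);       /* prints: 13 12 11 10 94 95 96 97 */
    }
    printf("\n");
    return 0;
}
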
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 557cc7ce7d..7d48c05693 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -41,6 +41,7 @@
#endif
#define LANE_WIDTH (SHIFT ? 16 : 8)
+#define PACK_WIDTH (LANE_WIDTH / 2)
/*
* Copy the relevant parts of a Reg value around. In the case where
@@ -474,71 +475,81 @@ void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
}
#endif
+#define SHUFFLE4(F, a, b, offset) do { \
+ r0 = a->F((order & 3) + offset); \
+ r1 = a->F(((order >> 2) & 3) + offset); \
+ r2 = b->F(((order >> 4) & 3) + offset); \
+ r3 = b->F(((order >> 6) & 3) + offset); \
+ d->F(offset) = r0; \
+ d->F(offset + 1) = r1; \
+ d->F(offset + 2) = r2; \
+ d->F(offset + 3) = r3; \
+ } while (0)
+
#if SHIFT == 0
void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3;
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- MOVE(*d, r);
+ SHUFFLE4(W, s, s, 0);
}
#else
void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ Reg *v = d;
+ uint32_t r0, r1, r2, r3;
+ int i;
- r.L(0) = d->L(order & 3);
- r.L(1) = d->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- MOVE(*d, r);
+ for (i = 0; i < 2 << SHIFT; i += 4) {
+ SHUFFLE4(L, v, s, i);
+ }
}
void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ Reg *v = d;
+ uint64_t r0, r1;
+ int i;
- r.Q(0) = d->Q(order & 1);
- r.Q(1) = s->Q((order >> 1) & 1);
- MOVE(*d, r);
+ for (i = 0; i < 1 << SHIFT; i += 2) {
+ r0 = v->Q(((order & 1) & 1) + i);
+ r1 = s->Q(((order >> 1) & 1) + i);
+ d->Q(i) = r0;
+ d->Q(i + 1) = r1;
+ order >>= 2;
+ }
}
void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint32_t r0, r1, r2, r3;
+ int i;
- r.L(0) = s->L(order & 3);
- r.L(1) = s->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- MOVE(*d, r);
+ for (i = 0; i < 2 << SHIFT; i += 4) {
+ SHUFFLE4(L, s, s, i);
+ }
}
void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3;
+ int i, j;
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- r.Q(1) = s->Q(1);
- MOVE(*d, r);
+ for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
+ SHUFFLE4(W, s, s, i);
+ d->Q(j) = s->Q(j);
+ }
}
void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3;
+ int i, j;
- r.Q(0) = s->Q(0);
- r.W(4) = s->W(4 + (order & 3));
- r.W(5) = s->W(4 + ((order >> 2) & 3));
- r.W(6) = s->W(4 + ((order >> 4) & 3));
- r.W(7) = s->W(4 + ((order >> 6) & 3));
- MOVE(*d, r);
+ for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
+ d->Q(j) = s->Q(j);
+ SHUFFLE4(W, s, s, i);
+ }
}
#endif
@@ -1091,156 +1102,132 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
return val;
}
-void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.B(0) = satsb((int16_t)d->W(0));
- r.B(1) = satsb((int16_t)d->W(1));
- r.B(2) = satsb((int16_t)d->W(2));
- r.B(3) = satsb((int16_t)d->W(3));
-#if SHIFT == 1
- r.B(4) = satsb((int16_t)d->W(4));
- r.B(5) = satsb((int16_t)d->W(5));
- r.B(6) = satsb((int16_t)d->W(6));
- r.B(7) = satsb((int16_t)d->W(7));
-#endif
- r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
- r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
- r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
- r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
-#if SHIFT == 1
- r.B(12) = satsb((int16_t)s->W(4));
- r.B(13) = satsb((int16_t)s->W(5));
- r.B(14) = satsb((int16_t)s->W(6));
- r.B(15) = satsb((int16_t)s->W(7));
-#endif
- MOVE(*d, r);
+#define PACK_HELPER_B(name, F) \
+void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
+ Reg *d, Reg *s) \
+{ \
+ Reg *v = d; \
+ uint8_t r[PACK_WIDTH * 2]; \
+ int j, k; \
+ for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \
+ for (k = 0; k < PACK_WIDTH; k++) { \
+ r[k] = F((int16_t)v->W(j + k)); \
+ } \
+ for (k = 0; k < PACK_WIDTH; k++) { \
+ r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \
+ } \
+ for (k = 0; k < PACK_WIDTH * 2; k++) { \
+ d->B(2 * j + k) = r[k]; \
+ } \
+ } \
}
-void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.B(0) = satub((int16_t)d->W(0));
- r.B(1) = satub((int16_t)d->W(1));
- r.B(2) = satub((int16_t)d->W(2));
- r.B(3) = satub((int16_t)d->W(3));
-#if SHIFT == 1
- r.B(4) = satub((int16_t)d->W(4));
- r.B(5) = satub((int16_t)d->W(5));
- r.B(6) = satub((int16_t)d->W(6));
- r.B(7) = satub((int16_t)d->W(7));
-#endif
- r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
- r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
- r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
- r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
-#if SHIFT == 1
- r.B(12) = satub((int16_t)s->W(4));
- r.B(13) = satub((int16_t)s->W(5));
- r.B(14) = satub((int16_t)s->W(6));
- r.B(15) = satub((int16_t)s->W(7));
-#endif
- MOVE(*d, r);
-}
+PACK_HELPER_B(sswb, satsb)
+PACK_HELPER_B(uswb, satub)
void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
- Reg r;
+ Reg *v = d;
+ uint16_t r[PACK_WIDTH];
+ int j, k;
- r.W(0) = satsw(d->L(0));
- r.W(1) = satsw(d->L(1));
-#if SHIFT == 1
- r.W(2) = satsw(d->L(2));
- r.W(3) = satsw(d->L(3));
-#endif
- r.W((2 << SHIFT) + 0) = satsw(s->L(0));
- r.W((2 << SHIFT) + 1) = satsw(s->L(1));
-#if SHIFT == 1
- r.W(6) = satsw(s->L(2));
- r.W(7) = satsw(s->L(3));
-#endif
- MOVE(*d, r);
+ for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
+ for (k = 0; k < PACK_WIDTH / 2; k++) {
+ r[k] = satsw(v->L(j + k));
+ }
+ for (k = 0; k < PACK_WIDTH / 2; k++) {
+ r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
+ }
+ for (k = 0; k < PACK_WIDTH; k++) {
+ d->W(2 * j + k) = r[k];
+ }
+ }
}
#define UNPCK_OP(base_name, base) \
\
void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint8_t r[PACK_WIDTH * 2]; \
+ int j, i; \
\
- r.B(0) = d->B((base << (SHIFT + 2)) + 0); \
- r.B(1) = s->B((base << (SHIFT + 2)) + 0); \
- r.B(2) = d->B((base << (SHIFT + 2)) + 1); \
- r.B(3) = s->B((base << (SHIFT + 2)) + 1); \
- r.B(4) = d->B((base << (SHIFT + 2)) + 2); \
- r.B(5) = s->B((base << (SHIFT + 2)) + 2); \
- r.B(6) = d->B((base << (SHIFT + 2)) + 3); \
- r.B(7) = s->B((base << (SHIFT + 2)) + 3); \
- XMM_ONLY( \
- r.B(8) = d->B((base << (SHIFT + 2)) + 4); \
- r.B(9) = s->B((base << (SHIFT + 2)) + 4); \
- r.B(10) = d->B((base << (SHIFT + 2)) + 5); \
- r.B(11) = s->B((base << (SHIFT + 2)) + 5); \
- r.B(12) = d->B((base << (SHIFT + 2)) + 6); \
- r.B(13) = s->B((base << (SHIFT + 2)) + 6); \
- r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
- r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 8 << SHIFT; ) { \
+ int k = j + base * PACK_WIDTH; \
+ for (i = 0; i < PACK_WIDTH; i++) { \
+ r[2 * i] = v->B(k + i); \
+ r[2 * i + 1] = s->B(k + i); \
+ } \
+ for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \
+ d->B(j) = r[i]; \
+ } \
+ } \
} \
\
void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint16_t r[PACK_WIDTH]; \
+ int j, i; \
\
- r.W(0) = d->W((base << (SHIFT + 1)) + 0); \
- r.W(1) = s->W((base << (SHIFT + 1)) + 0); \
- r.W(2) = d->W((base << (SHIFT + 1)) + 1); \
- r.W(3) = s->W((base << (SHIFT + 1)) + 1); \
- XMM_ONLY( \
- r.W(4) = d->W((base << (SHIFT + 1)) + 2); \
- r.W(5) = s->W((base << (SHIFT + 1)) + 2); \
- r.W(6) = d->W((base << (SHIFT + 1)) + 3); \
- r.W(7) = s->W((base << (SHIFT + 1)) + 3); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 4 << SHIFT; ) { \
+ int k = j + base * PACK_WIDTH / 2; \
+ for (i = 0; i < PACK_WIDTH / 2; i++) { \
+ r[2 * i] = v->W(k + i); \
+ r[2 * i + 1] = s->W(k + i); \
+ } \
+ for (i = 0; i < PACK_WIDTH; i++, j++) { \
+ d->W(j) = r[i]; \
+ } \
+ } \
} \
\
void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint32_t r[PACK_WIDTH / 2]; \
+ int j, i; \
\
- r.L(0) = d->L((base << SHIFT) + 0); \
- r.L(1) = s->L((base << SHIFT) + 0); \
- XMM_ONLY( \
- r.L(2) = d->L((base << SHIFT) + 1); \
- r.L(3) = s->L((base << SHIFT) + 1); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 2 << SHIFT; ) { \
+ int k = j + base * PACK_WIDTH / 4; \
+ for (i = 0; i < PACK_WIDTH / 4; i++) { \
+ r[2 * i] = v->L(k + i); \
+ r[2 * i + 1] = s->L(k + i); \
+ } \
+ for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \
+ d->L(j) = r[i]; \
+ } \
+ } \
} \
\
XMM_ONLY( \
void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \
+ CPUX86State *env, Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint64_t r[2]; \
+ int i; \
\
- r.Q(0) = d->Q(base); \
- r.Q(1) = s->Q(base); \
- MOVE(*d, r); \
+ for (i = 0; i < 1 << SHIFT; i += 2) { \
+ r[0] = v->Q(base + i); \
+ r[1] = s->Q(base + i); \
+ d->Q(i) = r[0]; \
+ d->Q(i + 1) = r[1]; \
+ } \
} \
)
UNPCK_OP(l, 0)
UNPCK_OP(h, 1)
+#undef PACK_WIDTH
+#undef PACK_HELPER_B
+#undef UNPCK_OP
+
+
/* 3DNow! float ops */
#if SHIFT == 0
void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
@@ -1393,122 +1380,86 @@ void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
/* SSSE3 op helpers */
void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
+ Reg *v = d;
int i;
- Reg r;
+#if SHIFT == 0
+ uint8_t r[8];
- for (i = 0; i < (8 << SHIFT); i++) {
- r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
+ for (i = 0; i < 8; i++) {
+ r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
}
+ for (i = 0; i < 8; i++) {
+ d->B(i) = r[i];
+ }
+#else
+ uint8_t r[8 << SHIFT];
- MOVE(*d, r);
-}
-
-void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
-
- Reg r;
-
- r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
- r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
- XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
- XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
- r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
- r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
- XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
- XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
-
- MOVE(*d, r);
-}
-
-void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
- XMM_ONLY(r.L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
- r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
- XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
-
- MOVE(*d, r);
-}
-
-void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
- r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
- XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
- XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
- r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
- r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
- XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
- XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
-
- MOVE(*d, r);
-}
-
-void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) +
- (int8_t)s->B(1) * (uint8_t)d->B(1));
- d->W(1) = satsw((int8_t)s->B(2) * (uint8_t)d->B(2) +
- (int8_t)s->B(3) * (uint8_t)d->B(3));
- d->W(2) = satsw((int8_t)s->B(4) * (uint8_t)d->B(4) +
- (int8_t)s->B(5) * (uint8_t)d->B(5));
- d->W(3) = satsw((int8_t)s->B(6) * (uint8_t)d->B(6) +
- (int8_t)s->B(7) * (uint8_t)d->B(7));
-#if SHIFT == 1
- d->W(4) = satsw((int8_t)s->B(8) * (uint8_t)d->B(8) +
- (int8_t)s->B(9) * (uint8_t)d->B(9));
- d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
- (int8_t)s->B(11) * (uint8_t)d->B(11));
- d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
- (int8_t)s->B(13) * (uint8_t)d->B(13));
- d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
- (int8_t)s->B(15) * (uint8_t)d->B(15));
+ for (i = 0; i < 8 << SHIFT; i++) {
+ int j = i & ~0xf;
+ r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
+ }
+ for (i = 0; i < 8 << SHIFT; i++) {
+ d->B(i) = r[i];
+ }
#endif
}
-void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
- r.W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
- XMM_ONLY(r.W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
- XMM_ONLY(r.W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
- r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
- r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
- XMM_ONLY(r.W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
- XMM_ONLY(r.W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
- MOVE(*d, r);
+#define SSE_HELPER_HW(name, F) \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+{ \
+ Reg *v = d; \
+ uint16_t r[4 << SHIFT]; \
+ int i, j, k; \
+ for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \
+ for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
+ r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \
+ } \
+ for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
+ r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \
+ } \
+ } \
+ for (i = 0; i < 4 << SHIFT; i++) { \
+ d->W(i) = r[i]; \
+ } \
}
-void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
- XMM_ONLY(r.L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
- r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
- XMM_ONLY(r.L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
- MOVE(*d, r);
+#define SSE_HELPER_HL(name, F) \
+void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+{ \
+ Reg *v = d; \
+ uint32_t r[2 << SHIFT]; \
+ int i, j, k; \
+ for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
+ for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
+ r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \
+ } \
+ for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
+ r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \
+ } \
+ } \
+ for (i = 0; i < 2 << SHIFT; i++) { \
+ d->L(i) = r[i]; \
+ } \
}
-void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
+SSE_HELPER_HW(phaddw, FADD)
+SSE_HELPER_HW(phsubw, FSUB)
+SSE_HELPER_HW(phaddsw, FADDSW)
+SSE_HELPER_HW(phsubsw, FSUBSW)
+SSE_HELPER_HL(phaddd, FADD)
+SSE_HELPER_HL(phsubd, FSUB)
- r.W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
- r.W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
- XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
- XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
- r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
- r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
- XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
- XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
- MOVE(*d, r);
+#undef SSE_HELPER_HW
+#undef SSE_HELPER_HL
+
+void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+ Reg *v = d;
+ int i;
+ for (i = 0; i < 4 << SHIFT; i++) {
+ d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
+ (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
+ }
}
#define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x)
@@ -1531,32 +1482,38 @@ SSE_HELPER_L(helper_psignd, FSIGNL)
void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
int32_t shift)
{
- Reg r;
+ Reg *v = d;
+ int i;
/* XXX could be checked during translation */
- if (shift >= (16 << SHIFT)) {
- r.Q(0) = 0;
- XMM_ONLY(r.Q(1) = 0);
+ if (shift >= (SHIFT ? 32 : 16)) {
+ for (i = 0; i < (1 << SHIFT); i++) {
+ d->Q(i) = 0;
+ }
} else {
shift <<= 3;
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0
- r.Q(0) = SHR(s->Q(0), shift - 0) |
- SHR(d->Q(0), shift - 64);
+ d->Q(0) = SHR(s->Q(0), shift - 0) |
+ SHR(v->Q(0), shift - 64);
#else
- r.Q(0) = SHR(s->Q(0), shift - 0) |
- SHR(s->Q(1), shift - 64) |
- SHR(d->Q(0), shift - 128) |
- SHR(d->Q(1), shift - 192);
- r.Q(1) = SHR(s->Q(0), shift + 64) |
- SHR(s->Q(1), shift - 0) |
- SHR(d->Q(0), shift - 64) |
- SHR(d->Q(1), shift - 128);
+ for (i = 0; i < (1 << SHIFT); i += 2) {
+ uint64_t r0, r1;
+
+ r0 = SHR(s->Q(i), shift - 0) |
+ SHR(s->Q(i + 1), shift - 64) |
+ SHR(v->Q(i), shift - 128) |
+ SHR(v->Q(i + 1), shift - 192);
+ r1 = SHR(s->Q(i), shift + 64) |
+ SHR(s->Q(i + 1), shift - 0) |
+ SHR(v->Q(i), shift - 64) |
+ SHR(v->Q(i + 1), shift - 128);
+ d->Q(i) = r0;
+ d->Q(i + 1) = r1;
+ }
#endif
#undef SHR
}
-
- MOVE(*d, r);
}
#define XMM0 (env->xmm_regs[0])
@@ -1681,17 +1638,23 @@ SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
- Reg r;
+ Reg *v = d;
+ uint16_t r[8];
+ int i, j, k;
- r.W(0) = satuw((int32_t) d->L(0));
- r.W(1) = satuw((int32_t) d->L(1));
- r.W(2) = satuw((int32_t) d->L(2));
- r.W(3) = satuw((int32_t) d->L(3));
- r.W(4) = satuw((int32_t) s->L(0));
- r.W(5) = satuw((int32_t) s->L(1));
- r.W(6) = satuw((int32_t) s->L(2));
- r.W(7) = satuw((int32_t) s->L(3));
- MOVE(*d, r);
+ for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
+ r[0] = satuw(v->L(j));
+ r[1] = satuw(v->L(j + 1));
+ r[2] = satuw(v->L(j + 2));
+ r[3] = satuw(v->L(j + 3));
+ r[4] = satuw(s->L(j));
+ r[5] = satuw(s->L(j + 1));
+ r[6] = satuw(s->L(j + 2));
+ r[7] = satuw(s->L(j + 3));
+ for (k = 0; k < 8; k++) {
+ d->W(i + k) = r[k];
+ }
+ }
}
#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
@@ -1947,20 +1910,25 @@ void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
uint32_t offset)
{
- int s0 = (offset & 3) << 2;
- int d0 = (offset & 4) << 0;
- int i;
- Reg r;
+ Reg *v = d;
+ int i, j;
+ uint16_t r[8];
- for (i = 0; i < 8; i++, d0++) {
- r.W(i) = 0;
- r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
- r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
- r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
- r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
+ for (j = 0; j < 4 << SHIFT; ) {
+ int s0 = (j * 2) + ((offset & 3) << 2);
+ int d0 = (j * 2) + ((offset & 4) << 0);
+ for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
+ r[i] = 0;
+ r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
+ r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
+ r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
+ r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
+ }
+ for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
+ d->W(j) = r[i];
+ }
+ offset >>= 3;
}
-
- MOVE(*d, r);
}
/* SSE4.2 op helpers */
--
2.37.1