qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 2/2] target/mips: Optimize ILVEV.<B|H|W|D> MSA instr


From: Mateja Marjanovic
Subject: [Qemu-devel] [PATCH 2/2] target/mips: Optimize ILVEV.<B|H|W|D> MSA instructions
Date: Fri, 15 Mar 2019 13:02:48 +0100

From: Mateja Marjanovic <address@hidden>

Optimize the set of MSA instructions ILVEV by operating directly
on TCG registers and performing the logic on them instead of
using helpers.
Performance measurement is done by executing the
instructions a large number of times on a computer
with an Intel Core i7-3770 CPU @ 3.40GHz×8.

 instruction ||    before    ||    after   ||
==============================================
 ilvev.b     ||    74.38 ms  ||  38.85 ms  ||
 ilvev.h     ||    46.78 ms  ||  33.98 ms  ||
 ilvev.w     ||    45.50 ms  ||  28.93 ms  ||
 ilvev.d     ||    37.67 ms  ||  23.09 ms  ||

Signed-off-by: Mateja Marjanovic <address@hidden>
---
 target/mips/helper.h     |   1 -
 target/mips/msa_helper.c |  52 ---------------------
 target/mips/translate.c  | 117 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 116 insertions(+), 54 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index d162836..2f23b0d 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -864,7 +864,6 @@ DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index cbcfd57..421dced 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1311,58 +1311,6 @@ void helper_msa_pckev_df(CPUMIPSState *env, uint32_t df, 
uint32_t wd,
     }
 }
 
-
-void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
-                         uint32_t ws, uint32_t wt)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
-    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
-
-    switch (df) {
-    case DF_BYTE:
-        pwd->b[15] = pws->b[14];
-        pwd->b[14] = pwt->b[14];
-        pwd->b[13] = pws->b[12];
-        pwd->b[12] = pwt->b[12];
-        pwd->b[11] = pws->b[10];
-        pwd->b[10] = pwt->b[10];
-        pwd->b[9]  = pws->b[8];
-        pwd->b[8]  = pwt->b[8];
-        pwd->b[7]  = pws->b[6];
-        pwd->b[6]  = pwt->b[6];
-        pwd->b[5]  = pws->b[4];
-        pwd->b[4]  = pwt->b[4];
-        pwd->b[3]  = pws->b[2];
-        pwd->b[2]  = pwt->b[2];
-        pwd->b[1]  = pws->b[0];
-        pwd->b[0]  = pwt->b[0];
-        break;
-    case DF_HALF:
-        pwd->h[7] = pws->h[6];
-        pwd->h[6] = pwt->h[6];
-        pwd->h[5] = pws->h[4];
-        pwd->h[4] = pwt->h[4];
-        pwd->h[3] = pws->h[2];
-        pwd->h[2] = pwt->h[2];
-        pwd->h[1] = pws->h[0];
-        pwd->h[0] = pwt->h[0];
-        break;
-    case DF_WORD:
-        pwd->w[3] = pws->w[2];
-        pwd->w[2] = pwt->w[2];
-        pwd->w[1] = pws->w[0];
-        pwd->w[0] = pwt->w[0];
-        break;
-    case DF_DOUBLE:
-        pwd->d[1] = pws->d[0];
-        pwd->d[0] = pwt->d[0];
-        break;
-    default:
-        assert(0);
-    }
-}
-
 void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
                         uint32_t ws, uint32_t wt)
 {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 101d2de..1526d24 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28991,6 +28991,106 @@ static inline void gen_ilvod_d(CPUMIPSState *env, 
uint32_t wd,
     tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
 }
 
+/*
+ * ILVEV.B: interleave the even-indexed bytes of ws and wt, i.e.
+ * wd.b[2i] = wt.b[2i] and wd.b[2i + 1] = ws.b[2i], handled one
+ * 64-bit half of the vector at a time.
+ */
+static inline void gen_ilvev_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    /* Selects bytes 0, 2, 4 and 6 of a 64-bit value. */
+    const uint64_t mask = 0x00ff00ff00ff00ffULL;
+
+    /* Low half: both sources are read before wd is written. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 8);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t0, t1);
+
+    /* High half, same pattern. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_shli_i64(t1, t1, 8);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t0, t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+/*
+ * ILVEV.H: interleave the even-indexed halfwords of ws and wt, i.e.
+ * wd.h[2i] = wt.h[2i] and wd.h[2i + 1] = ws.h[2i], handled one
+ * 64-bit half of the vector at a time.
+ */
+static inline void gen_ilvev_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    /* Selects halfwords 0 and 2 of a 64-bit value. */
+    const uint64_t mask = 0x0000ffff0000ffffULL;
+
+    /* Low half: both sources are read before wd is written. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t0, t1);
+
+    /* High half, same pattern. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t0, t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+/*
+ * ILVEV.W: interleave the even-indexed words of ws and wt, i.e.
+ * wd.w[2i] = wt.w[2i] and wd.w[2i + 1] = ws.w[2i], handled one
+ * 64-bit half of the vector at a time.
+ */
+static inline void gen_ilvev_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    /* Selects the low word of a 64-bit value. */
+    const uint64_t mask = 0x00000000ffffffffULL;
+
+    /* Low half: both sources are read before wd is written. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 32);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t0, t1);
+
+    /* High half, same pattern. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_shli_i64(t1, t1, 32);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t0, t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+/*
+ * ILVEV.D: wd.d[0] = wt.d[0], wd.d[1] = ws.d[0].
+ *
+ * The upper doubleword must be written first: when wd == ws, storing
+ * wd.d[0] first would overwrite ws.d[0] before it has been copied into
+ * wd.d[1].  Writing wd.d[1] first is safe for every register overlap,
+ * since neither source reads element 1.
+ */
+static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
+}
+
 static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
 {
 #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -29147,7 +29247,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext 
*ctx)
         gen_helper_msa_mod_s_df(cpu_env, tdf, twd, tws, twt);
         break;
     case OPC_ILVEV_df:
-        gen_helper_msa_ilvev_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_ilvev_b(env, wd, ws, wt);
+            break;
+        case DF_HALF:
+            gen_ilvev_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvev_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvev_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
         break;
     case OPC_BINSR_df:
         gen_helper_msa_binsr_df(cpu_env, tdf, twd, tws, twt);
-- 
2.7.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]