qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA i


From: Mateja Marjanovic
Subject: Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions
Date: Mon, 18 Mar 2019 18:21:59 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.5.1

On 15.3.19. 17:46, Aleksandar Markovic wrote:
From: Mateja Marjanovic <address@hidden>
Subject: [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions

From: Mateja Marjanovic <address@hidden>

Optimize set of MSA instructions ILVOD, using directly
tcg registers and performing logic on them insted of
using helpers.
insted -> instead
Fix in v2.
Performance measurement is done by executing the
instructions large number of times on a computer
with Intel Core i7-3770 CPU @ 3.40GHz×8.

  instruction ||    before    ||    after   ||
==============================================
  ilvod.b:        ||   66.97 ms   ||  26.34 ms  ||
  ilvod.h:        ||   44.75 ms   ||  25.17 ms  ||
  ilvod.w:        ||   41.27 ms   ||  24.37 ms  ||
  ilvod.d:        ||   41.75 ms   ||  20.50 ms  ||

Alignment looks wrong.
Fix in v2.
Signed-off-by: Mateja Marjanovic <address@hidden>
---
  target/mips/helper.h     |   1 -
  target/mips/msa_helper.c |  51 --------------------
  target/mips/translate.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++-
  3 files changed, 118 insertions(+), 53 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index a6d687e..d162836 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 9d9dafe..cbcfd57 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, 
uint32_t > wd,
      }
  }

-void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
-                         uint32_t ws, uint32_t wt)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
-    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
-
-    switch (df) {
-    case DF_BYTE:
-        pwd->b[0]  = pwt->b[1];
-        pwd->b[1]  = pws->b[1];
-        pwd->b[2]  = pwt->b[3];
-        pwd->b[3]  = pws->b[3];
-        pwd->b[4]  = pwt->b[5];
-        pwd->b[5]  = pws->b[5];
-        pwd->b[6]  = pwt->b[7];
-        pwd->b[7]  = pws->b[7];
-        pwd->b[8]  = pwt->b[9];
-        pwd->b[9]  = pws->b[9];
-        pwd->b[10] = pwt->b[11];
-        pwd->b[11] = pws->b[11];
-        pwd->b[12] = pwt->b[13];
-        pwd->b[13] = pws->b[13];
-        pwd->b[14] = pwt->b[15];
-        pwd->b[15] = pws->b[15];
-        break;
-    case DF_HALF:
-        pwd->h[0] = pwt->h[1];
-        pwd->h[1] = pws->h[1];
-        pwd->h[2] = pwt->h[3];
-        pwd->h[3] = pws->h[3];
-        pwd->h[4] = pwt->h[5];
-        pwd->h[5] = pws->h[5];
-        pwd->h[6] = pwt->h[7];
-        pwd->h[7] = pws->h[7];
-        break;
-    case DF_WORD:
-        pwd->w[0] = pwt->w[1];
-        pwd->w[1] = pws->w[1];
-        pwd->w[2] = pwt->w[3];
-        pwd->w[3] = pws->w[3];
-        break;
-    case DF_DOUBLE:
-        pwd->d[0] = pwt->d[1];
-        pwd->d[1] = pws->d[1];
-        break;
-    default:
-        assert(0);
-    }
-}
-
  void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
                          uint32_t ws, uint32_t wt)
  {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index b4a1103..101d2de 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, 
DisasContext *ctx)
      tcg_temp_free_i32(tws);
  }

+static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
Please insert a brief comment before each handler, like in this example:

/*
  * [MSA] ILVOD.B wd, ws, wt
  *
  *   Vector Interleave Odd (byte data elements)
  *
  */
Will do in v2.
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 8) - 1;
There shouldn't be a blank line between declarations. The blank line
should be after the last declaration.
You are right.
+    mask |= mask << 16;
+    mask |= mask << 32;
+    mask <<= 8;
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 8);
+    tcg_gen_or_i64(t1, t1, t0);
If t1 is already initialized to 0, what is the purpose of OR-ing?

+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 8);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 16) - 1;
+    mask |= mask << 32;
+    mask <<= 16;
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 16);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 16);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 32) - 1;
+    tcg_gen_movi_i64(t1, 0);
+
+    mask <<= 32;
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 32);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 32);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
+}
+
  static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
  {
  #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -29060,7 +29162,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext 
*ctx)
          gen_helper_msa_mod_u_df(cpu_env, tdf, twd, tws, twt);
          break;
      case OPC_ILVOD_df:
-        gen_helper_msa_ilvod_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_ilvod_b(env, wd, ws, wt);
+            break;
+        case DF_HALF:
+            gen_ilvod_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvod_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvod_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
          break;

      case OPC_DOTP_S_df:
--
2.7.4


Good idea, but the patches needs improvement.
v2 has much better perfomances than v1, thanks to your advice about unnecessary OR-ing and MOV-ing.
Also, I noticed you based your patches assuming that your previous patches
are applied. You don't do that, unless your previous patches are accepted
to the upstream tree, and they are not. Please base your patches on the
main upstream tree. Besides, the performance numbers you mentioned in the
commit message should use upstream performance as the reference, not the
performance of the your previous version of related patches.

The same comments as above applies to all versions og ILVEV and ILVOD.
My mistake.
Thanks for these efforts! I expect v2 soon.

Aleksandar

Thanks for all the comments.

Mateja




reply via email to

[Prev in Thread] Current Thread [Next in Thread]