qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA i


From: Aleksandar Markovic
Subject: Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions
Date: Tue, 19 Mar 2019 13:19:45 +0000

> From: Mateja Marjanovic <address@hidden>
> Sent: Tuesday, March 19, 2019 12:28 PM
> To: address@hidden
> Cc: address@hidden; Aleksandar Markovic; Aleksandar Rikalo
> Subject: [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions
> 
> From: Mateja Marjanovic <address@hidden>
> 
> Optimize set of MSA instructions ILVOD, using directly\

Use full instructions names.

> tcg registers and performing logic on them instead of
> using helpers.
> 
>  instr    ||   before    ||   after
> ======================================
>  ilvod.b  ||  117.50 ms  ||  24.99 ms
>  ilvod.h  ||   93.16 ms  ||  24.27 ms
>  ilvod.w  ||  119.90 ms  ||  24.81 ms
>  ilvod.d  ||   43.01 ms  ||  20.69 ms
> 

How are these numbers obtained? You should even insert the
source code of the test program you are using, complete with
the compilation command line and the execution command line,
and basic data on the host.

How do you interpret the strange pattern of numbers in
"before" column?

> Signed-off-by: Mateja Marjanovic <address@hidden>
> ---
>  target/mips/helper.h     |   1 -
>  target/mips/msa_helper.c |   7 ---
>  target/mips/translate.c  | 111 
> ++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 110 insertions(+), 9 deletions(-)
> 
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index a6d687e..d162836 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index c74e3cd..9e52a31 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1206,13 +1206,6 @@ MSA_FN_DF(ilvr_df)
>  MSA_FN_DF(ilvev_df)
>  #undef MSA_DO
> 
> -#define MSA_DO(DF)                          \
> -    do {                                    \
> -        pwx->DF[2*i]   = pwt->DF[2*i+1];    \
> -        pwx->DF[2*i+1] = pws->DF[2*i+1];    \
> -    } while (0)
> -MSA_FN_DF(ilvod_df)
> -#undef MSA_DO
>  #undef MSA_LOOP_COND
> 
>  #define MSA_LOOP_COND(DF) \
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index 5848f7a..10c5c55 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28884,6 +28884,100 @@ static void gen_msa_bit(CPUMIPSState *env, 
> DisasContext *ctx)
>      tcg_temp_free_i32(tws);
>  }
> 
> +/*
> + * [MSA] ILVOD.B wd, ws, wt
> + *
> + *   Vector Interleave Odd (byte data elements)
> + *
> + */
> +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt)
> +{
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    const uint64_t mask = 0xff00ff00ff00ff00ULL;
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t1, t1, 8);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
> +

It is very odd (and confusing for future readers) that you
use t1 first in this function. If variables are named t0, t1,
etc., than their first uses in the function in question should
be in the same order.

(the same comment applies to other similar cases in this
series)

> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 8);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
> +
> +    tcg_temp_free_i64(t0);
> +    tcg_temp_free_i64(t1);
> +}
> +
> +/*
> + * [MSA] ILVOD.H wd, ws, wt
> + *
> + *   Vector Interleave Odd (halfword data elements)
> + *
> + */
> +static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt)
> +{
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    const uint64_t mask = 0xffff0000ffff0000ULL;
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
> +
> +    tcg_temp_free_i64(t0);
> +    tcg_temp_free_i64(t1);
> +}
> +
> +/*
> + * [MSA] ILVOD.W wd, ws, wt
> + *
> + *   Vector Interleave Odd (word data elements)
> + *
> + */
> +static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt)
> +{
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    const uint64_t mask = 0xffffffff00000000ULL;
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t1, t1, 32);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 32);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
> +
> +    tcg_temp_free_i64(t0);
> +    tcg_temp_free_i64(t1);
> +}
> +
> +/*
> + * [MSA] ILVOD.D wd, ws, wt
> + *
> + *   Vector Interleave Odd (double data elements)

double -> doubleword

Double suggest FP operation.

> + *
> + */
> +static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt)
> +{
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
> +}
> +
>  static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
>  {
>  #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
> @@ -29055,7 +29149,22 @@ static void gen_msa_3r(CPUMIPSState *env, 
> DisasContext *ctx)
>          gen_helper_msa_mod_u_df(cpu_env, tdf, twd, tws, twt);
>          break;
>      case OPC_ILVOD_df:
> -        gen_helper_msa_ilvod_df(cpu_env, tdf, twd, tws, twt);
> +        switch (df) {
> +        case DF_BYTE:
> +            gen_ilvod_b(env, wd, ws, wt);
> +            break;
> +        case DF_HALF:
> +            gen_ilvod_h(env, wd, ws, wt);
> +            break;
> +        case DF_WORD:
> +            gen_ilvod_w(env, wd, ws, wt);
> +            break;
> +        case DF_DOUBLE:
> +            gen_ilvod_d(env, wd, ws, wt);
> +            break;
> +        default:
> +            assert(0);
> +        }
>          break;
> 
>      case OPC_DOTP_S_df:

All comments apply to the second patch too.

Also, the second patch should come first, since "ILVEV" is before
"ILVOD" in alphabethical order.

Thanks for this great performance improvement!

Aleksandar



reply via email to

[Prev in Thread] Current Thread [Next in Thread]