qemu-arm
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-arm] [PATCH 03/10] target/arm: optimize cross-page block chaining in softmmu


From: Emilio G. Cota
Subject: [Qemu-arm] [PATCH 03/10] target/arm: optimize cross-page block chaining in softmmu
Date: Tue, 11 Apr 2017 21:17:23 -0400

Instead of unconditionally exiting to the exec loop, add a helper to
check whether the target TB is valid. As long as the hit rate in
tb_jmp_cache remains high, this improves performance.

Measurements:

- Boot time of ARM debian jessie on Intel host:

| setup              | ARM debian boot+shutdown time | stddev |
|--------------------+-------------------------------+--------|
| master             |                  10.050247057 | 0.0361 |
| +cross             |                  10.311265443 | 0.0721 |

That is a 2.58% slowdown when booting. This is reasonable given that
tb_jmp_cache's hit rate when booting is expected to be low.

-                NBench, arm-softmmu. Host: Intel i7-4790K @ 4.00GHz
                        (y axis: Speedup over 95b31d70)

    1.3x+-+--------------------------------------------------------------+-+
        |                                           cross+noinline $$$     |
        |                                           cross+inline   %%%     |
        |                   $$$%%                                          |
    1.2x+-+.................$.$.%.......$$$..............................+-+
        |                   $ $ %       $ $%                               |
        |                   $ $ %       $ $%                               |
    1.1x+-+.................$.$.%.......$.$%.............................+-+
        |             $$$%% $ $ %       $ $%                               |
        |             $ $ % $ $ %       $ $% $$$%%             $$$%% $$$%% |
        | $$$%% $$$%% $ $ % $ $ % $$$%% $ $% $ $ %   %%%       $ $ % $ $ % |
      1x+-$.$B%R$R$A%G$A$H%T$M$_%P$L$i%l$n$%.$.$.%...%.%.$$$%%.$.$.%.$.$.%-+
        | $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ %   % % $ $ % $ $ % $ $ % |
        | $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ %   % % $ $ % $ $ % $ $ % |
    0.9x+-$.$.%.$.$.%.$.$.%.$.$.%.$.$.%.$.$%.$.$.%...%.%.$.$.%.$.$.%.$.$.%-+
        | $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ %   % % $ $ % $ $ % $ $ % |
        | $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ % $$$ % $ $ % $ $ % $ $ % |
        | $ $ % $ $ % $ $ % $ $ % $ $ % $ $% $ $ % $ $ % $ $ % $ $ % $ $ % |
    0.8x+-$$$%%-$$$%%-$$$%%-$$$%%-$$$%%-$$$%-$$$%%-$$$%%-$$$%%-$$$%%-$$$%%-+
       ASSIGNMBITFIELFOUFP_EMULATHUFFMALU_DECOMPNEURANUMERICSTRING_SOhmean

  png: http://imgur.com/1rmYSaF

That is, a 4.04% hmean perf improvement over master with tb_from_jmp_cache
not inlined, and a 5.82% hmean perf improvement over master with
tb_from_jmp_cache inlined (i.e. this commit). The largest improvement is
21% for the FP_EMULATION benchmark.

Signed-off-by: Emilio G. Cota <address@hidden>
---
 target/arm/helper.c    |  5 +++++
 target/arm/helper.h    |  2 ++
 target/arm/translate.c | 12 ++++++++++++
 3 files changed, 19 insertions(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 8cb7a94..10b8807 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -9922,3 +9922,8 @@ uint32_t HELPER(crc32c)(uint32_t acc, uint32_t val, uint32_t bytes)
     /* Linux crc32c converts the output to one's complement.  */
     return crc32c(acc, buf, bytes) ^ 0xffffffff;
 }
+
+uint32_t HELPER(cross_page_check)(CPUARMState *env, target_ulong vaddr)
+{
+    return !!tb_from_jmp_cache(env, vaddr);
+}
diff --git a/target/arm/helper.h b/target/arm/helper.h
index df86bf7..d4b779b 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -1,6 +1,8 @@
 DEF_HELPER_FLAGS_1(sxtb16, TCG_CALL_NO_RWG_SE, i32, i32)
 DEF_HELPER_FLAGS_1(uxtb16, TCG_CALL_NO_RWG_SE, i32, i32)
 
+DEF_HELPER_2(cross_page_check, i32, env, tl)
+
 DEF_HELPER_3(add_setq, i32, env, i32, i32)
 DEF_HELPER_3(add_saturate, i32, env, i32, i32)
 DEF_HELPER_3(sub_saturate, i32, env, i32, i32)
diff --git a/target/arm/translate.c b/target/arm/translate.c
index e32e38c..ce97d0c 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4085,6 +4085,18 @@ static inline void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
         gen_set_pc_im(s, dest);
         tcg_gen_exit_tb((uintptr_t)s->tb + n);
     } else {
+        TCGv vaddr = tcg_const_tl(dest);
+        TCGv_i32 valid = tcg_temp_new_i32();
+        TCGLabel *label = gen_new_label();
+
+        gen_helper_cross_page_check(valid, cpu_env, vaddr);
+        tcg_temp_free(vaddr);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, valid, 0, label);
+        tcg_temp_free_i32(valid);
+        tcg_gen_goto_tb(n);
+        gen_set_pc_im(s, dest);
+        tcg_gen_exit_tb((uintptr_t)s->tb + n);
+        gen_set_label(label);
         gen_set_pc_im(s, dest);
         tcg_gen_exit_tb(0);
     }
-- 
2.7.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]