[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v4 4/7] accel: replacing part of CONFIG_PROFILER
From: |
Alex Bennée |
Subject: |
Re: [Qemu-devel] [PATCH v4 4/7] accel: replacing part of CONFIG_PROFILER with TBStats |
Date: |
Fri, 26 Jul 2019 16:25:44 +0100 |
User-agent: |
mu4e 1.3.3; emacs 27.0.50 |
vandersonmr <address@hidden> writes:
> We add some of the statistics collected in the TCGProfiler
> into the TBStats, having the statistics not only for the whole
> emulation but for each TB. Then, we removed these stats
> from TCGProfiler and reconstruct the information for the
> "info jit" using the sum of all TBStats statistics.
>
> The goal is to have one unique and better way of collecting
> emulation statistics. Moreover, checking dynamiclly if the
> profiling is enabled showed to have an insignificant impact
> on the performance:
> https://wiki.qemu.org/Internships/ProjectIdeas/TCGCodeQuality#Overheads.
>
> Signed-off-by: Vanderson M. do Rosario <address@hidden>
> ---
> accel/tcg/Makefile.objs | 2 +-
> accel/tcg/tb-stats.c | 107 ++++++++++++++++++++++++++++++++++++++
> accel/tcg/translate-all.c | 10 ++--
> include/exec/tb-stats.h | 9 ++++
> tcg/tcg.c | 93 ++++-----------------------------
> tcg/tcg.h | 10 ----
> 6 files changed, 131 insertions(+), 100 deletions(-)
> create mode 100644 accel/tcg/tb-stats.c
>
> diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
> index d381a02f34..49ffe81b5d 100644
> --- a/accel/tcg/Makefile.objs
> +++ b/accel/tcg/Makefile.objs
> @@ -2,7 +2,7 @@ obj-$(CONFIG_SOFTMMU) += tcg-all.o
> obj-$(CONFIG_SOFTMMU) += cputlb.o
> obj-y += tcg-runtime.o tcg-runtime-gvec.o
> obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
> -obj-y += translator.o
> +obj-y += translator.o tb-stats.o
>
> obj-$(CONFIG_USER_ONLY) += user-exec.o
> obj-$(call lnot,$(CONFIG_SOFTMMU)) += user-exec-stub.o
> diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
> new file mode 100644
> index 0000000000..44497d4f9b
> --- /dev/null
> +++ b/accel/tcg/tb-stats.c
> @@ -0,0 +1,107 @@
> +#include "qemu/osdep.h"
> +
> +#include "disas/disas.h"
> +#include "exec/exec-all.h"
> +#include "tcg.h"
> +
> +#include "qemu/qemu-print.h"
> +
> +struct jit_profile_info {
> + uint64_t translations;
> + uint64_t aborted;
> + uint64_t ops;
> + unsigned ops_max;
> + uint64_t del_ops;
> + uint64_t temps;
> + unsigned temps_max;
> + uint64_t host;
> + uint64_t guest;
> + uint64_t host_ins;
> + uint64_t search_data;
> +};
> +
> +/* accumulate the statistics from all TBs */
> +static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
> +{
> + struct jit_profile_info *jpi = userp;
> + TBStatistics *tbs = p;
> +
> + jpi->translations += tbs->translations.total;
> + jpi->ops += tbs->code.num_tcg_ops;
> + if (tbs->translations.total && tbs->code.num_tcg_ops /
> tbs->translations.total
> + > jpi->ops_max) {
> + jpi->ops_max = tbs->code.num_tcg_ops / tbs->translations.total;
> + }
> + jpi->del_ops += tbs->code.deleted_ops;
> + jpi->temps += tbs->code.temps;
> + if (tbs->translations.total && tbs->code.temps / tbs->translations.total
> >
> + jpi->temps_max) {
> + jpi->temps_max = tbs->code.temps / tbs->translations.total;
> + }
> + jpi->host += tbs->code.out_len;
> + jpi->guest += tbs->code.in_len;
> + jpi->host_ins += tbs->code.num_host_inst;
> + jpi->search_data += tbs->code.search_out_len;
> +}
> +
> +/* dump JIT statisticis using TCGProfile and TBStats */
> +void dump_jit_profile_info(TCGProfile *s)
> +{
> + if (!tb_stats_collection_enabled()) {
> + return;
> + }
> +
> + struct jit_profile_info *jpi = g_new0(struct jit_profile_info,
> 1);
This leaks and needs to be freed after we are done.
> +
> + qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, jpi);
> +
> + if (jpi->translations) {
> + qemu_printf("translated TBs %" PRId64 "\n", jpi->translations);
> + qemu_printf("avg ops/TB %0.1f max=%d\n",
> + jpi->ops / (double) jpi->translations, jpi->ops_max);
> + qemu_printf("deleted ops/TB %0.2f\n",
> + jpi->del_ops / (double) jpi->translations);
> + qemu_printf("avg temps/TB %0.2f max=%d\n",
> + jpi->temps / (double) jpi->translations, jpi->temps_max);
> + qemu_printf("avg host code/TB %0.1f\n",
> + jpi->host / (double) jpi->translations);
> + qemu_printf("avg host ins/TB %0.1f\n",
> + jpi->host_ins / (double) jpi->translations);
> + qemu_printf("avg search data/TB %0.1f\n",
> + jpi->search_data / (double) jpi->translations);
> +
> + if (s) {
> + int64_t tot = s->interm_time + s->code_time;
> + qemu_printf("JIT cycles %" PRId64 " (%0.3f s at 2.4
> GHz)\n",
> + tot, tot / 2.4e9);
> + qemu_printf("cycles/op %0.1f\n",
> + jpi->ops ? (double)tot / jpi->ops : 0);
> + qemu_printf("cycles/in byte %0.1f\n",
> + jpi->guest ? (double)tot / jpi->guest : 0);
> + qemu_printf("cycles/out byte %0.1f\n",
> + jpi->host ? (double)tot / jpi->host : 0);
> + qemu_printf("cycles/out inst %0.1f\n",
> + jpi->host_ins ? (double)tot / jpi->host_ins : 0);
> + qemu_printf("cycles/search byte %0.1f\n",
> + jpi->search_data ? (double)tot / jpi->search_data :
> 0);
> + if (tot == 0) {
> + tot = 1;
> + }
> + qemu_printf(" gen_interm time %0.1f%%\n",
> + (double)s->interm_time / tot * 100.0);
> + qemu_printf(" gen_code time %0.1f%%\n",
> + (double)s->code_time / tot * 100.0);
> + qemu_printf("optim./code time %0.1f%%\n",
> + (double)s->opt_time / (s->code_time ? s->code_time :
> 1)
> + * 100.0);
> + qemu_printf("liveness/code time %0.1f%%\n",
> + (double)s->la_time / (s->code_time ? s->code_time : 1) *
> 100.0);
> + qemu_printf("cpu_restore count %" PRId64 "\n",
> + s->restore_count);
> + qemu_printf(" avg cycles %0.1f\n",
> + s->restore_count ? (double)s->restore_time /
> s->restore_count : 0);
> + }
> + }
> +}
> +
> +
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index 3a47ac6f2c..bbdb0f23a4 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1773,8 +1773,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> tb_overflow:
>
> #ifdef CONFIG_PROFILER
> - /* includes aborted translations because of exceptions */
> - atomic_set(&prof->tb_count1, prof->tb_count1 + 1);
> ti = profile_getclock();
> #endif
I assume from the conversation on IRC we can add the profile clocks here
as well. I wonder if execution time could also be added (under a new
tbstats flag)? The data might be too noisy given the time to collect
timestamps vs the relatively short block length.
>
> @@ -1822,7 +1820,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> }
>
> #ifdef CONFIG_PROFILER
> - atomic_set(&prof->tb_count, prof->tb_count + 1);
> atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() -
> ti);
> ti = profile_getclock();
> #endif
> @@ -1869,12 +1866,13 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>
> #ifdef CONFIG_PROFILER
> atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
> - atomic_set(&prof->code_in_len, prof->code_in_len + tb->size);
> - atomic_set(&prof->code_out_len, prof->code_out_len + gen_code_size);
> - atomic_set(&prof->search_out_len, prof->search_out_len + search_size);
> #endif
>
> if (tb_stats_enabled(tb, TB_JIT_STATS)) {
> + atomic_add(&tb->tb_stats->code.in_len, tb->size);
> + atomic_add(&tb->tb_stats->code.out_len, gen_code_size);
If we are switching num_host_insns to num_host_bytes we don't need a
duplicate here.
> + atomic_add(&tb->tb_stats->code.search_out_len, search_size);
> +
> size_t code_size = gen_code_size;
> if (tcg_ctx->data_gen_ptr) {
> code_size = tcg_ctx->data_gen_ptr - tb->tc.ptr;
> diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
> index 6584b78c4a..7d775f2c0d 100644
> --- a/include/exec/tb-stats.h
> +++ b/include/exec/tb-stats.h
> @@ -40,6 +40,13 @@ struct TBStatistics {
> unsigned num_tcg_ops;
> unsigned num_tcg_ops_opt;
> unsigned spills;
> +
> + /* CONFIG_PROFILE */
> + unsigned temps;
> + unsigned deleted_ops;
> + unsigned in_len;
> + unsigned out_len;
> + unsigned search_out_len;
> } code;
>
> struct {
> @@ -51,4 +58,6 @@ struct TBStatistics {
>
> bool tb_stats_cmp(const void *ap, const void *bp);
>
> +void dump_jit_profile_info(TCGProfile *s);
> +
> #endif
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 446e3d1708..46b31a2f68 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -2295,9 +2295,9 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
> QTAILQ_INSERT_TAIL(&s->free_ops, op, link);
> s->nb_ops--;
>
> -#ifdef CONFIG_PROFILER
> - atomic_set(&s->prof.del_op_count, s->prof.del_op_count + 1);
> -#endif
> + if (tb_stats_enabled(s->current_tb, TB_JIT_STATS)) {
> + atomic_inc(&s->current_tb->tb_stats->code.deleted_ops);
> + }
> }
>
> static TCGOp *tcg_op_alloc(TCGOpcode opc)
> @@ -3914,16 +3914,6 @@ void tcg_profile_snapshot(TCGProfile *prof, bool
> counters, bool table)
>
> if (counters) {
> PROF_ADD(prof, orig, cpu_exec_time);
> - PROF_ADD(prof, orig, tb_count1);
> - PROF_ADD(prof, orig, tb_count);
> - PROF_ADD(prof, orig, op_count);
> - PROF_MAX(prof, orig, op_count_max);
> - PROF_ADD(prof, orig, temp_count);
> - PROF_MAX(prof, orig, temp_count_max);
> - PROF_ADD(prof, orig, del_op_count);
> - PROF_ADD(prof, orig, code_in_len);
> - PROF_ADD(prof, orig, code_out_len);
> - PROF_ADD(prof, orig, search_out_len);
> PROF_ADD(prof, orig, interm_time);
> PROF_ADD(prof, orig, code_time);
> PROF_ADD(prof, orig, la_time);
> @@ -4003,26 +3993,17 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
> TCGOp *op;
>
> s->current_tb = tb;
> -
> -#ifdef CONFIG_PROFILER
> - {
> + if (tb_stats_enabled(tb, TB_JIT_STATS)) {
> int n = 0;
>
> QTAILQ_FOREACH(op, &s->ops, link) {
> n++;
> }
> - atomic_set(&prof->op_count, prof->op_count + n);
> - if (n > prof->op_count_max) {
> - atomic_set(&prof->op_count_max, n);
> - }
> + atomic_add(&tb->tb_stats->code.num_tcg_ops, n);
>
> n = s->nb_temps;
> - atomic_set(&prof->temp_count, prof->temp_count + n);
> - if (n > prof->temp_count_max) {
> - atomic_set(&prof->temp_count_max, n);
> - }
> + atomic_add(&tb->tb_stats->code.temps, n);
> }
> -#endif
>
> #ifdef DEBUG_DISAS
> if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
> @@ -4229,70 +4210,16 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
> return tcg_current_code_size(s);
> }
>
> -#ifdef CONFIG_PROFILER
> void tcg_dump_info(void)
> {
> + TCGProfile *s = NULL;
> +#ifdef CONFIG_PROFILER
> TCGProfile prof = {};
> - const TCGProfile *s;
> - int64_t tb_count;
> - int64_t tb_div_count;
> - int64_t tot;
> -
> tcg_profile_snapshot_counters(&prof);
> s = &prof;
> - tb_count = s->tb_count;
> - tb_div_count = tb_count ? tb_count : 1;
> - tot = s->interm_time + s->code_time;
> -
> - qemu_printf("JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n",
> - tot, tot / 2.4e9);
> - qemu_printf("translated TBs %" PRId64 " (aborted=%" PRId64
> - " %0.1f%%)\n",
> - tb_count, s->tb_count1 - tb_count,
> - (double)(s->tb_count1 - s->tb_count)
> - / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
> - qemu_printf("avg ops/TB %0.1f max=%d\n",
> - (double)s->op_count / tb_div_count, s->op_count_max);
> - qemu_printf("deleted ops/TB %0.2f\n",
> - (double)s->del_op_count / tb_div_count);
> - qemu_printf("avg temps/TB %0.2f max=%d\n",
> - (double)s->temp_count / tb_div_count, s->temp_count_max);
> - qemu_printf("avg host code/TB %0.1f\n",
> - (double)s->code_out_len / tb_div_count);
> - qemu_printf("avg search data/TB %0.1f\n",
> - (double)s->search_out_len / tb_div_count);
> -
> - qemu_printf("cycles/op %0.1f\n",
> - s->op_count ? (double)tot / s->op_count : 0);
> - qemu_printf("cycles/in byte %0.1f\n",
> - s->code_in_len ? (double)tot / s->code_in_len : 0);
> - qemu_printf("cycles/out byte %0.1f\n",
> - s->code_out_len ? (double)tot / s->code_out_len : 0);
> - qemu_printf("cycles/search byte %0.1f\n",
> - s->search_out_len ? (double)tot / s->search_out_len : 0);
> - if (tot == 0) {
> - tot = 1;
> - }
> - qemu_printf(" gen_interm time %0.1f%%\n",
> - (double)s->interm_time / tot * 100.0);
> - qemu_printf(" gen_code time %0.1f%%\n",
> - (double)s->code_time / tot * 100.0);
> - qemu_printf("optim./code time %0.1f%%\n",
> - (double)s->opt_time / (s->code_time ? s->code_time : 1)
> - * 100.0);
> - qemu_printf("liveness/code time %0.1f%%\n",
> - (double)s->la_time / (s->code_time ? s->code_time : 1) *
> 100.0);
> - qemu_printf("cpu_restore count %" PRId64 "\n",
> - s->restore_count);
> - qemu_printf(" avg cycles %0.1f\n",
> - s->restore_count ? (double)s->restore_time /
> s->restore_count : 0);
> -}
> -#else
> -void tcg_dump_info(void)
> -{
> - qemu_printf("[TCG profiler not compiled]\n");
> -}
> #endif
> + dump_jit_profile_info(s);
> +}
>
> #ifdef ELF_HOST_MACHINE
> /* In order to use this feature, the backend needs to do three things:
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index bf6f3bcba3..026a066b9a 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -644,16 +644,6 @@ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
>
> typedef struct TCGProfile {
> int64_t cpu_exec_time;
> - int64_t tb_count1;
> - int64_t tb_count;
> - int64_t op_count; /* total insn count */
> - int op_count_max; /* max insn per TB */
> - int temp_count_max;
> - int64_t temp_count;
> - int64_t del_op_count;
> - int64_t code_in_len;
> - int64_t code_out_len;
> - int64_t search_out_len;
> int64_t interm_time;
> int64_t code_time;
> int64_t la_time;
--
Alex Bennée
- [Qemu-devel] [PATCH v4 0/7] Measure Tiny Code Generation Quality, vandersonmr, 2019/07/19
- [Qemu-devel] [PATCH v4 1/7] accel: introducing TBStatistics structure, vandersonmr, 2019/07/19
- [Qemu-devel] [PATCH v4 2/7] accel: collecting TB execution count, vandersonmr, 2019/07/19
- [Qemu-devel] [PATCH v4 3/7] accel: collecting JIT statistics, vandersonmr, 2019/07/19
- [Qemu-devel] [PATCH v4 4/7] accel: replacing part of CONFIG_PROFILER with TBStats, vandersonmr, 2019/07/19
- Re: [Qemu-devel] [PATCH v4 4/7] accel: replacing part of CONFIG_PROFILER with TBStats,
Alex Bennée <=
- [Qemu-devel] [PATCH v4 5/7] log: adding -d tb_stats to control tbstats, vandersonmr, 2019/07/19
- [Qemu-devel] [PATCH v4 6/7] monitor: adding tb_stats hmp command, vandersonmr, 2019/07/19
- [Qemu-devel] [PATCH v4 7/7] monitor: adding info tbs, tb, and coverset, vandersonmr, 2019/07/19
- Re: [Qemu-devel] [PATCH v4 0/7] Measure Tiny Code Generation Quality, Alex Bennée, 2019/07/26