[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH for-4.0 v2 08/37] tcg/i386: Force qemu_ld/st arg
From: |
Alex Bennée |
Subject: |
Re: [Qemu-devel] [PATCH for-4.0 v2 08/37] tcg/i386: Force qemu_ld/st arguments into fixed registers |
Date: |
Fri, 30 Nov 2018 16:16:00 +0000 |
User-agent: |
mu4e 1.1.0; emacs 26.1.90 |
Richard Henderson <address@hidden> writes:
> This is an incremental step toward moving the qemu_ld/st
> code sequence out of line.
>
> Signed-off-by: Richard Henderson <address@hidden>
> ---
> tcg/i386/tcg-target.inc.c | 203 +++++++++++++++++++++++++++++++-------
> 1 file changed, 169 insertions(+), 34 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 07df4b2b12..50e5dc31b3 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -171,6 +171,80 @@ static bool have_lzcnt;
>
> static tcg_insn_unit *tb_ret_addr;
>
> +typedef enum {
> + ARG_ADDR,
> + ARG_STVAL,
> + ARG_LDVAL,
> +} QemuMemArgType;
> +
> +#ifdef CONFIG_SOFTMMU
> +/*
> + * Constraint to choose a particular register. This is used for softmmu
> + * loads and stores. Registers with no assignment get an empty string.
> + */
> +static const char * const one_reg_constraint[TCG_TARGET_NB_REGS] = {
> + [TCG_REG_EAX] = "a",
> + [TCG_REG_EBX] = "b",
> + [TCG_REG_ECX] = "c",
> + [TCG_REG_EDX] = "d",
> + [TCG_REG_ESI] = "S",
> + [TCG_REG_EDI] = "D",
> +#if TCG_TARGET_REG_BITS == 64
> + [TCG_REG_R8] = "E",
> + [TCG_REG_R9] = "N",
> +#endif
> +};
> +
> +/*
> + * Calling convention for the softmmu load and store thunks.
> + *
> + * For 64-bit, we mostly use the host calling convention, therefore the
> + * real first argument is reserved for the ENV parameter that is passed
> + * on to the slow path helpers.
> + *
> + * For 32-bit, the host calling convention is stack based; we invent a
> + * private convention that uses 4 of the 6 available host registers.
> + * We reserve EAX and EDX as temporaries for use by the thunk, we require
> + * INDEX_op_qemu_st_i32 to have a 'q' register from which to store, and
> + * further complicate this last by wanting a call-clobbered for that store.
> + * The 'q' requirement allows MO_8 stores at all; the call-clobbered part
> + * allows bswap to operate in-place, clobbering the input.
> + */
> +static TCGReg softmmu_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> + switch (type) {
> + case ARG_ADDR:
> + tcg_debug_assert(!hi || TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
> + if (TCG_TARGET_REG_BITS == 64) {
> + return tcg_target_call_iarg_regs[1];
> + } else {
> + return hi ? TCG_REG_EDI : TCG_REG_ESI;
> + }
> + case ARG_STVAL:
> + tcg_debug_assert(!hi || (TCG_TARGET_REG_BITS == 32 && is_64));
> + if (TCG_TARGET_REG_BITS == 64) {
> + return tcg_target_call_iarg_regs[2];
> + } else {
> + return hi ? TCG_REG_EBX : TCG_REG_ECX;
> + }
> + case ARG_LDVAL:
> + tcg_debug_assert(!hi || (TCG_TARGET_REG_BITS == 32 && is_64));
> + return tcg_target_call_oarg_regs[hi];
> + }
> + g_assert_not_reached();
> +}
> +
> +static const char *constrain_memop_arg(QemuMemArgType type, bool is_64, int
> hi)
> +{
> + return one_reg_constraint[softmmu_arg(type, is_64, hi)];
> +}
> +#else
> +static const char *constrain_memop_arg(QemuMemArgType type, bool is_64, int
> hi)
> +{
> + return "L";
> +}
> +#endif /* CONFIG_SOFTMMU */
> +
> static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
> intptr_t value, intptr_t addend)
> {
> @@ -1680,11 +1754,15 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg
> addrlo, TCGReg addrhi,
> copies the entire guest address for the slow path, while truncation
> for the 32-bit host happens with the fastpath ADDL below. */
> if (TCG_TARGET_REG_BITS == 64) {
> - base = tcg_target_call_iarg_regs[1];
> + tcg_debug_assert(addrlo == tcg_target_call_iarg_regs[1]);
> + if (TARGET_LONG_BITS == 32) {
> + tcg_out_ext32u(s, addrlo, addrlo);
> + }
> + base = addrlo;
> } else {
> base = r1;
> + tcg_out_mov(s, ttype, base, addrlo);
> }
> - tcg_out_mov(s, ttype, base, addrlo);
>
> /* jne slow_path */
> tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
> @@ -2009,16 +2087,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> common. */
> static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
> {
> - TCGReg datalo, datahi, addrlo;
> - TCGReg addrhi __attribute__((unused));
> + TCGReg datalo, addrlo;
> + TCGReg datahi __attribute__((unused)) = -1;
> + TCGReg addrhi __attribute__((unused)) = -1;
> TCGMemOpIdx oi;
> TCGMemOp opc;
> + int i = -1;
>
> - datalo = *args++;
> - datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
> - addrlo = *args++;
> - addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
> - oi = *args++;
> + datalo = args[++i];
Swapping to i = -1 and pre-indexes seems a little unnatural here
compared to a more normal 0 and i++ unless there was a specific reason
to have i in the range of 2-4?
> + if (TCG_TARGET_REG_BITS == 32 && is64) {
> + datahi = args[++i];
> + }
> + addrlo = args[++i];
> + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> + addrhi = args[++i];
> + }
> + oi = args[++i];
> opc = get_memop(oi);
>
> #if defined(CONFIG_SOFTMMU)
> @@ -2027,6 +2111,15 @@ static void tcg_out_qemu_ld(TCGContext *s, const
> TCGArg *args, bool is64)
> tcg_insn_unit *label_ptr[2];
> TCGReg base;
>
> + tcg_debug_assert(datalo == softmmu_arg(ARG_LDVAL, is64, 0));
> + if (TCG_TARGET_REG_BITS == 32 && is64) {
> + tcg_debug_assert(datahi == softmmu_arg(ARG_LDVAL, is64, 1));
> + }
> + tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
> + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> + tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
> + }
> +
> base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
> label_ptr, offsetof(CPUTLBEntry, addr_read));
>
> @@ -2149,16 +2242,22 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>
> static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
> {
> - TCGReg datalo, datahi, addrlo;
> - TCGReg addrhi __attribute__((unused));
> + TCGReg datalo, addrlo;
> + TCGReg datahi __attribute__((unused)) = -1;
> + TCGReg addrhi __attribute__((unused)) = -1;
> TCGMemOpIdx oi;
> TCGMemOp opc;
> + int i = -1;
And again here
>
> - datalo = *args++;
> - datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
> - addrlo = *args++;
> - addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
> - oi = *args++;
> + datalo = args[++i];
> + if (TCG_TARGET_REG_BITS == 32 && is64) {
> + datahi = args[++i];
> + }
> + addrlo = args[++i];
> + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> + addrhi = args[++i];
> + }
> + oi = args[++i];
> opc = get_memop(oi);
>
> #if defined(CONFIG_SOFTMMU)
> @@ -2167,6 +2266,15 @@ static void tcg_out_qemu_st(TCGContext *s, const
> TCGArg *args, bool is64)
> tcg_insn_unit *label_ptr[2];
> TCGReg base;
>
> + tcg_debug_assert(datalo == softmmu_arg(ARG_STVAL, is64, 0));
> + if (TCG_TARGET_REG_BITS == 32 && is64) {
> + tcg_debug_assert(datahi == softmmu_arg(ARG_STVAL, is64, 1));
> + }
> + tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
> + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> + tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
> + }
> +
> base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
> label_ptr, offsetof(CPUTLBEntry,
> addr_write));
>
> @@ -2836,15 +2944,6 @@ static const TCGTargetOpDef
> *tcg_target_op_def(TCGOpcode op)
> static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" }
> };
> static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" }
> };
> static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" }
> };
> - static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
> - static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
> - static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
> - static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
> - static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
> - static const TCGTargetOpDef r_r_L_L
> - = { .args_ct_str = { "r", "r", "L", "L" } };
> - static const TCGTargetOpDef L_L_L_L
> - = { .args_ct_str = { "L", "L", "L", "L" } };
> static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
> static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
> static const TCGTargetOpDef x_x_x_x
> @@ -3026,17 +3125,53 @@ static const TCGTargetOpDef
> *tcg_target_op_def(TCGOpcode op)
> }
>
> case INDEX_op_qemu_ld_i32:
> - return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
> - case INDEX_op_qemu_st_i32:
> - return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
> + {
> + static TCGTargetOpDef ld32;
> + int i;
> +
> + ld32.args_ct_str[0] = constrain_memop_arg(ARG_LDVAL, 0, 0);
> + for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS;
> ++i) {
This formulation is a bit weird to follow, at first I thought it was
going to overflow TCG_MAX_OP_ARGS until I realised what it was doing.
Maybe a helper function would be a little clearer:
for (src_reg = 0; src_reg <
tcg_target_regs_for(TARGET_LONG_BITS), ++i)
Same comment applies bellow.
> + ld32.args_ct_str[i + 1] = constrain_memop_arg(ARG_ADDR, 0,
> i);
> + }
> + return &ld32;
> + }
> case INDEX_op_qemu_ld_i64:
> - return (TCG_TARGET_REG_BITS == 64 ? &r_L
> - : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
> - : &r_r_L_L);
> + {
> + static TCGTargetOpDef ld64;
> + int i, j = 0;
> +
> + for (i = 0; i * TCG_TARGET_REG_BITS < 64; ++i) {
> + ld64.args_ct_str[j++] = constrain_memop_arg(ARG_LDVAL, 1, i);
> + }
> + for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> + ld64.args_ct_str[j++] = constrain_memop_arg(ARG_ADDR, 0, i);
> + }
> + return &ld64;
> + }
> + case INDEX_op_qemu_st_i32:
> + {
> + static TCGTargetOpDef st32;
> + int i;
> +
> + st32.args_ct_str[0] = constrain_memop_arg(ARG_STVAL, 0, 0);
> + for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> + st32.args_ct_str[i + 1] = constrain_memop_arg(ARG_ADDR, 0,
> i);
> + }
> + return &st32;
> + }
> case INDEX_op_qemu_st_i64:
> - return (TCG_TARGET_REG_BITS == 64 ? &L_L
> - : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
> - : &L_L_L_L);
> + {
> + static TCGTargetOpDef st64;
> + int i, j = 0;
> +
> + for (i = 0; i * TCG_TARGET_REG_BITS < 64; ++i) {
> + st64.args_ct_str[j++] = constrain_memop_arg(ARG_STVAL, 1, i);
> + }
> + for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> + st64.args_ct_str[j++] = constrain_memop_arg(ARG_ADDR, 0, i);
> + }
> + return &st64;
> + }
>
> case INDEX_op_brcond2_i32:
> {
--
Alex Bennée
- [Qemu-devel] [PATCH for-4.0 v2 05/37] tcg/i386: Add constraints for r8 and r9, (continued)
- [Qemu-devel] [PATCH for-4.0 v2 05/37] tcg/i386: Add constraints for r8 and r9, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 04/37] tcg: Add TCG_TARGET_NEED_LDST_OOL_LABELS, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 10/37] tcg/aarch64: Add constraints for x0, x1, x2, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 11/37] tcg/aarch64: Parameterize the temps for tcg_out_tlb_read, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 12/37] tcg/aarch64: Parameterize the temp for tcg_out_goto_long, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 13/37] tcg/aarch64: Use B not BL for tcg_out_goto_long, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 09/37] tcg/i386: Use TCG_TARGET_NEED_LDST_OOL_LABELS, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 08/37] tcg/i386: Force qemu_ld/st arguments into fixed registers, Richard Henderson, 2018/11/23
- Re: [Qemu-devel] [PATCH for-4.0 v2 08/37] tcg/i386: Force qemu_ld/st arguments into fixed registers,
Alex Bennée <=
- [Qemu-devel] [PATCH for-4.0 v2 14/37] tcg/aarch64: Use TCG_TARGET_NEED_LDST_OOL_LABELS, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 16/37] tcg/arm: Add constraints for R0-R5, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 15/37] tcg/arm: Parameterize the temps for tcg_out_tlb_read, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 29/37] tcg: Add TCG_TARGET_HAS_MEMORY_BSWAP, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 27/37] tcg: Clean up generic bswap64, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 24/37] tcg/ppc: Force qemu_ld/st arguments into fixed registers, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 18/37] tcg/arm: Force qemu_ld/st arguments into fixed registers, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 17/37] tcg/arm: Reduce the number of temps for tcg_out_tlb_read, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 22/37] tcg/ppc: Add constraints for R7-R8, Richard Henderson, 2018/11/23
- [Qemu-devel] [PATCH for-4.0 v2 20/37] tcg/ppc: Parameterize the temps for tcg_out_tlb_read, Richard Henderson, 2018/11/23