From: Richard Henderson
Subject: [Qemu-devel] [PULL 21/23] tcg/ppc: Change TCG_REG_RA to TCG_REG_TB
Date: Thu, 7 Sep 2017 15:40:49 -0700

From: Richard Henderson <address@hidden>

At this point the conversion is a wash.  Loading of TB+ofs is
smaller, but the actual return address from exit_tb is larger.
There are a few more insns required to transition between TBs.

But the expectation is that accesses to the constant pool will
on the whole be smaller.
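
For illustration only (not part of the patch): a standalone C sketch of
the insn-count tradeoff that tcg_out_movi_int implements below.  The
counting mirrors the new emission logic; the sample addresses are made
up.

    #include <stdint.h>
    #include <stdio.h>

    /* Insns to build a 32-bit constant: addi, or addis [+ ori].  */
    static int insns_movi32(int32_t arg)
    {
        if (arg == (int16_t)arg) {
            return 1;                        /* addi          */
        }
        return 1 + ((arg & 0xffff) != 0);    /* addis [+ ori] */
    }

    /* Insns to build a 64-bit constant from scratch, following the
       addi/addis/sldi/oris/ori cascade in tcg_out_movi_int.  */
    static int insns_movi64(int64_t arg)
    {
        if (arg == (int32_t)arg) {
            return insns_movi32((int32_t)arg);
        }
        if (arg == (uint32_t)arg && !(arg & 0x8000)) {
            return 2;                        /* addi + oris   */
        }
        int32_t high = arg >> 31 >> 1;
        int n = insns_movi32(high);
        if (high) {
            n += 1;                          /* sldi 32       */
        }
        if (arg & 0xffff0000) {
            n += 1;                          /* oris          */
        }
        if (arg & 0xffff) {
            n += 1;                          /* ori           */
        }
        return n;                            /* worst case 5  */
    }

    /* With TCG_REG_TB live, anything within 2GB of the TB needs only
       a 16-bit (1 insn) or 32-bit (2 insn) displacement.  */
    static int insns_movi_tb(int64_t arg, int64_t tb)
    {
        int64_t tb_diff = arg - tb;
        if (tb_diff == (int16_t)tb_diff) {
            return 1;                        /* addi ret,tb,d */
        }
        if (tb_diff == (int32_t)tb_diff) {
            return 2;                        /* addis + addi  */
        }
        return insns_movi64(arg);            /* out of range  */
    }

    int main(void)
    {
        int64_t tb = 0x7f0012340000;         /* pretend TB address */
        int64_t pool = tb + 0x1f0;           /* nearby pool entry  */
        printf("from scratch: %d insns, TB-relative: %d insn(s)\n",
               insns_movi64(pool), insns_movi_tb(pool, tb));
        return 0;
    }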

Signed-off-by: Richard Henderson <address@hidden>
---
 tcg/ppc/tcg-target.inc.c | 273 +++++++++++++++++++++--------------------------
 1 file changed, 122 insertions(+), 151 deletions(-)
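
(Note for reviewers, illustration only; this is not QEMU code.  A
sketch of how the 64-bit tb_target_set_jmp_target below packs the two
rewritten insns into a single aligned 8-byte store, so linking a TB is
atomic with respect to a cpu executing the jump.  The opcode and field
macros are local stand-ins for the tcg-target ones.)

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define OPCD(x)       ((uint32_t)(x) << 26)
    #define ADDI          OPCD(14)
    #define ADDIS         OPCD(15)
    #define B             OPCD(18)
    #define TAI(t, a, i)  (((t) << 21) | ((a) << 16) | ((i) & 0xffff))

    enum { REG_TB = 31 };  /* r31 holds the current TB pointer */

    /* When the displacement from the old TB fits in 16 bits, one addi
       retargets REG_TB and a direct branch does the jump; otherwise
       both slots adjust REG_TB by a 32-bit amount and the mtctr/bctr
       emitted after the pair perform the jump.  */
    static uint64_t make_jmp_pair(intptr_t tc_ptr, intptr_t jmp_addr,
                                  intptr_t addr, int big_endian)
    {
        intptr_t tb_diff = addr - tc_ptr;
        intptr_t br_diff = addr - (jmp_addr + 4);
        uint32_t i1, i2;

        if (tb_diff == (int16_t)tb_diff) {
            i1 = ADDI | TAI(REG_TB, REG_TB, tb_diff);
            i2 = B | (br_diff & 0x3fffffc);
        } else {
            intptr_t lo = (int16_t)tb_diff;
            intptr_t hi = (int32_t)(tb_diff - lo);
            assert(tb_diff == hi + lo);
            i1 = ADDIS | TAI(REG_TB, REG_TB, hi >> 16);
            i2 = ADDI | TAI(REG_TB, REG_TB, lo);
        }
        /* Memory order of the pair depends on host endianness.  */
        return big_endian ? (uint64_t)i1 << 32 | i2
                          : (uint64_t)i2 << 32 | i1;
    }

    int main(void)
    {
        /* Made-up addresses: TB at 0x1000, patch site 0x1040,
           new target 0x2000, little-endian host.  */
        printf("%016llx\n", (unsigned long long)
               make_jmp_pair(0x1000, 0x1040, 0x2000, 0));
        return 0;
    }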

diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index d772faf7be..bc14d2c9c6 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -39,29 +39,8 @@
 # define TCG_REG_TMP1   TCG_REG_R12
 #endif
 
-/* For the 64-bit target, we don't like the 5 insn sequence needed to build
-   full 64-bit addresses.  Better to have a base register to which we can
-   apply a 32-bit displacement.
-
-   There are generally three items of interest:
-   (1) helper functions in the main executable,
-   (2) TranslationBlock data structures,
-   (3) the return address in the epilogue.
-
-   For user-only, we USE_STATIC_CODE_GEN_BUFFER, so the code_gen_buffer
-   will be inside the main executable, and thus near enough to make a
-   pointer to the epilogue be within 2GB of all helper functions.
-
-   For softmmu, we'll let the kernel choose the address of code_gen_buffer,
-   and odds are it'll be somewhere close to the main malloc arena, and so
-   a pointer to the epilogue will be within 2GB of the TranslationBlocks.
-
-   For --enable-pie, everything will be kinda near everything else,
-   somewhere in high memory.
-
-   Thus we choose to keep the return address in a call-saved register.  */
-#define TCG_REG_RA     TCG_REG_R31
-#define USE_REG_RA     (TCG_TARGET_REG_BITS == 64)
+#define TCG_REG_TB     TCG_REG_R31
+#define USE_REG_TB     (TCG_TARGET_REG_BITS == 64)
 
 /* Shorthand for size of a pointer.  Avoid promotion to unsigned.  */
 #define SZP  ((int)sizeof(void *))
@@ -614,50 +593,68 @@ static inline void tcg_out_shri64(TCGContext *s, TCGReg dst, TCGReg src, int c)
     tcg_out_rld(s, RLDICL, dst, src, 64 - c, c);
 }
 
-static void tcg_out_movi32(TCGContext *s, TCGReg ret, int32_t arg)
+static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
+                             tcg_target_long arg, bool in_prologue)
 {
-    if (arg == (int16_t) arg) {
+    intptr_t tb_diff;
+    int32_t high;
+
+    tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || type == TCG_TYPE_I32);
+
+    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I32) {
+        arg = (int32_t)arg;
+    }
+
+    /* Load 16-bit immediates with one insn.  */
+    if (arg == (int16_t)arg) {
         tcg_out32(s, ADDI | TAI(ret, 0, arg));
-    } else {
+        return;
+    }
+
+    /* Load addresses within the TB with one insn.  */
+    tb_diff = arg - (intptr_t)s->code_gen_ptr;
+    if (!in_prologue && USE_REG_TB && tb_diff == (int16_t)tb_diff) {
+        tcg_out32(s, ADDI | TAI(ret, TCG_REG_TB, tb_diff));
+        return;
+    }
+
+    /* Load 32-bit immediates with two insns.  */
+    if (TCG_TARGET_REG_BITS == 32 || arg == (int32_t)arg) {
         tcg_out32(s, ADDIS | TAI(ret, 0, arg >> 16));
         if (arg & 0xffff) {
             tcg_out32(s, ORI | SAI(ret, ret, arg));
         }
+        return;
     }
-}
-
-static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret,
-                         tcg_target_long arg)
-{
-    tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || type == TCG_TYPE_I32);
-    if (type == TCG_TYPE_I32 || arg == (int32_t)arg) {
-        tcg_out_movi32(s, ret, arg);
-    } else if (arg == (uint32_t)arg && !(arg & 0x8000)) {
+    if (arg == (uint32_t)arg && !(arg & 0x8000)) {
         tcg_out32(s, ADDI | TAI(ret, 0, arg));
         tcg_out32(s, ORIS | SAI(ret, ret, arg >> 16));
-    } else {
-        int32_t high;
+        return;
+    }
 
-        if (USE_REG_RA) {
-            intptr_t diff = arg - (intptr_t)tb_ret_addr;
-            if (diff == (int32_t)diff) {
-                tcg_out_mem_long(s, ADDI, ADD, ret, TCG_REG_RA, diff);
-                return;
-            }
-        }
+    /* Load addresses within 2GB of TB with 2 (or rarely 3) insns.  */
+    if (!in_prologue && USE_REG_TB && tb_diff == (int32_t)tb_diff) {
+        tcg_out_mem_long(s, ADDI, ADD, ret, TCG_REG_TB, tb_diff);
+        return;
+    }
 
-        high = arg >> 31 >> 1;
-        tcg_out_movi32(s, ret, high);
-        if (high) {
-            tcg_out_shli64(s, ret, ret, 32);
-        }
-        if (arg & 0xffff0000) {
-            tcg_out32(s, ORIS | SAI(ret, ret, arg >> 16));
-        }
-        if (arg & 0xffff) {
-            tcg_out32(s, ORI | SAI(ret, ret, arg));
-        }
+    high = arg >> 31 >> 1;
+    tcg_out_movi(s, TCG_TYPE_I32, ret, high);
+    if (high) {
+        tcg_out_shli64(s, ret, ret, 32);
     }
+    if (arg & 0xffff0000) {
+        tcg_out32(s, ORIS | SAI(ret, ret, arg >> 16));
+    }
+    if (arg & 0xffff) {
+        tcg_out32(s, ORI | SAI(ret, ret, arg));
+    }
+}
+
+static inline void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret,
+                                tcg_target_long arg)
+{
+    tcg_out_movi_int(s, type, ret, arg, false);
 }
 
 static bool mask_operand(uint32_t c, int *mb, int *me)
@@ -1293,49 +1290,43 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
     tcg_out32(s, insn);
 }
 
-#ifdef __powerpc64__
 void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_addr,
                               uintptr_t addr)
 {
-    tcg_insn_unit i1, i2;
-    uint64_t pair;
-    intptr_t diff = addr - jmp_addr;
-
-    if (in_range_b(diff)) {
-        i1 = B | (diff & 0x3fffffc);
-        i2 = NOP;
-    } else if (USE_REG_RA) {
-        intptr_t lo, hi;
-        diff = addr - (uintptr_t)tb_ret_addr;
-        lo = (int16_t)diff;
-        hi = (int32_t)(diff - lo);
-        tcg_debug_assert(diff == hi + lo);
-        i1 = ADDIS | TAI(TCG_REG_TMP1, TCG_REG_RA, hi >> 16);
-        i2 = ADDI | TAI(TCG_REG_TMP1, TCG_REG_TMP1, lo);
-    } else {
-        tcg_debug_assert(TCG_TARGET_REG_BITS == 32 || addr == (int32_t)addr);
-        i1 = ADDIS | TAI(TCG_REG_TMP1, 0, addr >> 16);
-        i2 = ORI | SAI(TCG_REG_TMP1, TCG_REG_TMP1, addr);
-    }
+    if (TCG_TARGET_REG_BITS == 64) {
+        tcg_insn_unit i1, i2;
+        intptr_t tb_diff = addr - tc_ptr;
+        intptr_t br_diff = addr - (jmp_addr + 4);
+        uint64_t pair;
+
+        /* This does not exercise the range of the branch, but we do
+           still need to be able to load the new value of TCG_REG_TB.
+           But this does still happen quite often.  */
+        if (tb_diff == (int16_t)tb_diff) {
+            i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
+            i2 = B | (br_diff & 0x3fffffc);
+        } else {
+            intptr_t lo = (int16_t)tb_diff;
+            intptr_t hi = (int32_t)(tb_diff - lo);
+            assert(tb_diff == hi + lo);
+            i1 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
+            i2 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
+        }
 #ifdef HOST_WORDS_BIGENDIAN
-    pair = (uint64_t)i1 << 32 | i2;
+        pair = (uint64_t)i1 << 32 | i2;
 #else
-    pair = (uint64_t)i2 << 32 | i1;
+        pair = (uint64_t)i2 << 32 | i1;
 #endif
 
-    atomic_set((uint64_t *)jmp_addr, pair);
-    flush_icache_range(jmp_addr, jmp_addr + 8);
-}
-#else
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_addr,
-                              uintptr_t addr)
-{
-    intptr_t diff = addr - jmp_addr;
-    tcg_debug_assert(in_range_b(diff));
-    atomic_set((uint32_t *)jmp_addr, B | (diff & 0x3fffffc));
-    flush_icache_range(jmp_addr, jmp_addr + 4);
+        atomic_set((uint64_t *)jmp_addr, pair);
+        flush_icache_range(jmp_addr, jmp_addr + 8);
+    } else {
+        intptr_t diff = addr - jmp_addr;
+        tcg_debug_assert(in_range_b(diff));
+        atomic_set((uint32_t *)jmp_addr, B | (diff & 0x3fffffc));
+        flush_icache_range(jmp_addr, jmp_addr + 4);
+    }
 }
-#endif
 
 static void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
 {
@@ -1897,44 +1888,20 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
 #ifndef CONFIG_SOFTMMU
     if (guest_base) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base);
+        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true);
         tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
     }
 #endif
 
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
     tcg_out32(s, MTSPR | RS(tcg_target_call_iarg_regs[1]) | CTR);
-
-    if (USE_REG_RA) {
-#ifdef _CALL_AIX
-        /* Make the caller load the value as the TOC into R2.  */
-        tb_ret_addr = s->code_ptr + 2;
-        desc[1] = tb_ret_addr;
-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_RA, TCG_REG_R2);
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-#elif defined(_CALL_ELF) && _CALL_ELF == 2
-        /* Compute from the incoming R12 value.  */
-        tb_ret_addr = s->code_ptr + 2;
-        tcg_out32(s, ADDI | TAI(TCG_REG_RA, TCG_REG_R12,
-                                tcg_ptr_byte_diff(tb_ret_addr, s->code_buf)));
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-#else
-        /* Reserve max 5 insns for the constant load.  */
-        tb_ret_addr = s->code_ptr + 6;
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (intptr_t)tb_ret_addr);
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-        while (s->code_ptr < tb_ret_addr) {
-            tcg_out32(s, NOP);
-        }
-#endif
-    } else {
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-        tb_ret_addr = s->code_ptr;
+    if (USE_REG_TB) {
+        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, tcg_target_call_iarg_regs[1]);
     }
+    tcg_out32(s, BCCTR | BO_ALWAYS);
 
     /* Epilogue */
-    tcg_debug_assert(tb_ret_addr == s->code_ptr);
-    s->code_gen_epilogue = tb_ret_addr;
+    s->code_gen_epilogue = tb_ret_addr = s->code_ptr;
 
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_REG_R1, FRAME_SIZE+LR_OFFSET);
     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); ++i) {
@@ -1954,44 +1921,48 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
 
     switch (opc) {
     case INDEX_op_exit_tb:
-        if (USE_REG_RA) {
-            ptrdiff_t disp = tcg_pcrel_diff(s, tb_ret_addr);
-
-            /* Use a direct branch if we can, otherwise use the value in RA.
-               Note that the direct branch is always backward, thus we need
-               to account for the possibility of 5 insns from the movi.  */
-            if (!in_range_b(disp - 20)) {
-                tcg_out32(s, MTSPR | RS(TCG_REG_RA) | CTR);
-                tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, args[0]);
-                tcg_out32(s, BCCTR | BO_ALWAYS);
-                break;
-            }
-        }
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, args[0]);
         tcg_out_b(s, 0, tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
-        tcg_debug_assert(s->tb_jmp_insn_offset);
-        /* Direct jump. */
-#ifdef __powerpc64__
-        /* Ensure the next insns are 8-byte aligned. */
-        if ((uintptr_t)s->code_ptr & 7) {
-            tcg_out32(s, NOP);
-        }
-        s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
-        /* To be replaced by either a branch+nop or a load into TMP1.  */
-        s->code_ptr += 2;
-        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
+        if (s->tb_jmp_insn_offset) {
+            /* Direct jump. */
+            if (TCG_TARGET_REG_BITS == 64) {
+                /* Ensure the next insns are 8-byte aligned. */
+                if ((uintptr_t)s->code_ptr & 7) {
+                    tcg_out32(s, NOP);
+                }
+                s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
+                tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
+                tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
+            } else {
+                s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
+                tcg_out32(s, B);
+                s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
+                break;
+            }
+        } else {
+            /* Indirect jump. */
+            tcg_debug_assert(s->tb_jmp_insn_offset == NULL);
+            tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TB, 0,
+                   (intptr_t)(s->tb_jmp_target_addr + args[0]));
+        }
+        tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
         tcg_out32(s, BCCTR | BO_ALWAYS);
-#else
-        /* To be replaced by a branch.  */
-        s->code_ptr++;
-#endif
-        s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
+        s->tb_jmp_reset_offset[args[0]] = c = tcg_current_code_size(s);
+        if (USE_REG_TB) {
+            /* For the unlinked case, need to reset TCG_REG_TB.  */
+            c = -c;
+            assert(c == (int16_t)c);
+            tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, c));
+        }
         break;
     case INDEX_op_goto_ptr:
         tcg_out32(s, MTSPR | RS(args[0]) | CTR);
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, 0);
+        if (USE_REG_TB) {
+            tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, args[0]);
+        }
+        tcg_out32(s, ADDI | TAI(TCG_REG_R3, 0, 0));
         tcg_out32(s, BCCTR | BO_ALWAYS);
         break;
     case INDEX_op_br:
@@ -2761,8 +2732,8 @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_R13); /* thread pointer */
 #endif
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1); /* mem temp */
-    if (USE_REG_RA) {
-        tcg_regset_set_reg(s->reserved_regs, TCG_REG_RA);  /* return addr */
+    if (USE_REG_TB) {
+        tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);  /* tb->tc_ptr */
     }
 }
 
-- 
2.13.5



