
Re: [Qemu-devel] [PATCH 3/3] tcg-alpha: New TCG target.


From: Aurelien Jarno
Subject: Re: [Qemu-devel] [PATCH 3/3] tcg-alpha: New TCG target.
Date: Mon, 22 Oct 2012 23:39:46 +0200
User-agent: Mutt/1.5.21 (2010-09-15)

On Sat, Sep 29, 2012 at 11:10:39AM -0700, Richard Henderson wrote:
> This began with a patch from Dong Weiyu <address@hidden>,
> and was modified to fix problems and to adapt to changes in TCG.
> 
> Signed-off-by: Richard Henderson <address@hidden>
> ---
>  configure              |   17 +-
>  exec-all.h             |    7 +-
>  qemu-common.h          |    4 +-
>  tcg/alpha/tcg-target.c | 1860 ++++++++++++++++++++++++++++++++++++++++++++++++
>  tcg/alpha/tcg-target.h |  142 ++++
>  5 files changed, 2018 insertions(+), 12 deletions(-)
>  create mode 100644 tcg/alpha/tcg-target.c
>  create mode 100644 tcg/alpha/tcg-target.h
> 
> diff --git a/configure b/configure
> index 8f99b7b..85e5efa 100755
> --- a/configure
> +++ b/configure
> @@ -352,6 +352,8 @@ elif check_define __arm__ ; then
>    cpu="arm"
>  elif check_define __hppa__ ; then
>    cpu="hppa"
> +elif check_define __alpha__ ; then
> +  cpu="alpha"
>  else
>    cpu=`uname -m`
>  fi
> @@ -381,6 +383,9 @@ case "$cpu" in
>    sparc|sun4[cdmuv])
>      cpu="sparc"
>    ;;
> +  alpha*)
> +    cpu="alpha"
> +  ;;
>    *)
>      # This will result in either an error or falling back to TCI later
>      ARCH=unknown
> @@ -895,6 +900,11 @@ case "$cpu" in
>             cc_i386='$(CC) -m32'
>             host_guest_base="yes"
>             ;;
> +    alpha)
> +           QEMU_CFLAGS="-msmall-data $QEMU_CFLAGS"
> +           LDFLAGS="-Wl,--warn-multiple-gp $LDFLAGS"
> +           host_guest_base="yes"
> +           ;;

The host_guest_base part should now be removed, though that's only
because I took so long to review the patch.

>      arm*)
>             host_guest_base="yes"
>             ;;
> @@ -4048,13 +4058,6 @@ if test "$tcg_interpreter" = "yes" ; then
>    echo "CONFIG_TCI_DIS=y"  >> $libdis_config_mak
>  fi
>  
> -case "$ARCH" in
> -alpha)
> -  # Ensure there's only a single GP
> -  cflags="-msmall-data $cflags"
> -;;
> -esac
> -
>  if test "$target_softmmu" = "yes" ; then
>    case "$TARGET_BASE_ARCH" in
>    arm)
> diff --git a/exec-all.h b/exec-all.h
> index 6516da0..4e2f2e8 100644
> --- a/exec-all.h
> +++ b/exec-all.h
> @@ -132,9 +132,8 @@ static inline void tlb_flush(CPUArchState *env, int flush_global)
>  #define CODE_GEN_AVG_BLOCK_SIZE 64
>  #endif
>  
> -#if defined(__arm__) || defined(_ARCH_PPC) \
> -    || defined(__x86_64__) || defined(__i386__) \
> -    || defined(__sparc__) \
> +#if defined(__alpha__) || defined(__arm__) || defined(_ARCH_PPC) \
> +    || defined(__x86_64__) || defined(__i386__) || defined(__sparc__) \
>      || defined(CONFIG_TCG_INTERPRETER)
>  #define USE_DIRECT_JUMP
>  #endif
> @@ -245,7 +244,7 @@ static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
>      __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg));
>  #endif
>  }
> -#elif defined(__sparc__)
> +#elif defined(__alpha__) || defined(__sparc__)
>  void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr);
>  #else
>  #error tb_set_jmp_target1 is missing
> diff --git a/qemu-common.h b/qemu-common.h
> index 15d9e4e..b46a9b0 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -6,7 +6,9 @@
>  #include "compiler.h"
>  #include "config-host.h"
>  
> -#if defined(__arm__) || defined(__sparc__) || defined(__mips__) || defined(__hppa__) || defined(__ia64__)
> +#if defined(__alpha__) || defined(__arm__) \
> +    || defined(__sparc__) || defined(__mips__) \
> +    || defined(__hppa__) || defined(__ia64__)
>  #define WORDS_ALIGNED
>  #endif
>  
> diff --git a/tcg/alpha/tcg-target.c b/tcg/alpha/tcg-target.c
> new file mode 100644
> index 0000000..3a9a354
> --- /dev/null
> +++ b/tcg/alpha/tcg-target.c
> @@ -0,0 +1,1860 @@
> +/*
> + * Tiny Code Generator for QEMU on ALPHA platform.
> + *
> + * Permission is hereby granted, free of charge, to any person
> + * obtaining a copy of this software and associated documentation
> + * files (the "Software"), to deal in the Software without
> + * restriction, including without limitation the rights to use, copy,
> + * modify, merge, publish, distribute, sublicense, and/or sell copies
> + * of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> + * included in all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef NDEBUG
> +static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
> +    [TCG_REG_V0] = "v0",
> +    [TCG_REG_T0] = "t0",
> +    [TCG_REG_T1] = "t1",
> +    [TCG_REG_T2] = "t2",
> +    [TCG_REG_T3] = "t3",
> +    [TCG_REG_T4] = "t4",
> +    [TCG_REG_T5] = "t5",
> +    [TCG_REG_T6] = "t6",
> +    [TCG_REG_T7] = "t7",
> +    [TCG_REG_T8] = "t8",
> +    [TCG_REG_T9] = "t9",
> +    [TCG_REG_T10] = "t10",
> +    [TCG_REG_T11] = "t11",
> +    [TCG_REG_S0] = "s0",
> +    [TCG_REG_S1] = "s1",
> +    [TCG_REG_S2] = "s2",
> +    [TCG_REG_S3] = "s3",
> +    [TCG_REG_S4] = "s4",
> +    [TCG_REG_S5] = "s5",
> +    [TCG_REG_S6] = "s6",
> +    [TCG_REG_A0] = "a0",
> +    [TCG_REG_A1] = "a1",
> +    [TCG_REG_A2] = "a2",
> +    [TCG_REG_A3] = "a3",
> +    [TCG_REG_A4] = "a4",
> +    [TCG_REG_A5] = "a5",
> +    [TCG_REG_RA] = "ra",
> +    [TCG_REG_PV] = "pv",
> +    [TCG_REG_AT] = "at",
> +    [TCG_REG_GP] = "gp",
> +    [TCG_REG_SP] = "sp",
> +    [TCG_REG_ZERO] = "zero",
> +};
> +#endif
> +
> +/*
> + * $29 is the global pointer,
> + * $30 is the stack pointer,
> + * $31 is the zero register,
> + */
> +static const int tcg_target_reg_alloc_order[] = {
> +    /* Call-saved registers.  */
> +    TCG_REG_S0,
> +    TCG_REG_S1,
> +    TCG_REG_S2,
> +    TCG_REG_S3,
> +    TCG_REG_S4,
> +    TCG_REG_S5,
> +    TCG_REG_S6,
> +    /* Call-clobbered temporaries.  */
> +    TCG_REG_T0,
> +    TCG_REG_T1,
> +    TCG_REG_T2,
> +    TCG_REG_T3,
> +    TCG_REG_T4,
> +    TCG_REG_T5,
> +    TCG_REG_T6,
> +    TCG_REG_T7,
> +    TCG_REG_T8,
> +    TCG_REG_T9,
> +    TCG_REG_T10,
> +    TCG_REG_T11,
> +    TCG_REG_RA,
> +    TCG_REG_PV,
> +    TCG_REG_AT,
> +    /* Call-clobbered argument and return registers.  */
> +    TCG_REG_V0,
> +    TCG_REG_A0,
> +    TCG_REG_A1,
> +    TCG_REG_A2,
> +    TCG_REG_A3,
> +    TCG_REG_A4,
> +    TCG_REG_A5,
> +};
> +
> +/*
> + * According to alpha calling convention, these 6 registers are used for
> + * function parameter passing. if function has more than 6 parameters,
> + * remaining arguments are stored on the stack.
> + */
> +static const int tcg_target_call_iarg_regs[6] = {
> +    TCG_REG_A0,
> +    TCG_REG_A1,
> +    TCG_REG_A2,
> +    TCG_REG_A3,
> +    TCG_REG_A4,
> +    TCG_REG_A5,
> +};
> +
> +/*
> + * According to alpha calling convention, $0 is used for returning function
> + * result.
> + */
> +static const int tcg_target_call_oarg_regs[1] = {
> +    TCG_REG_V0
> +};
> +
> +/*
> + * Temporary registers used within this translator.  Note that T9 is
> + * selected because it is the division return address register.
> + */
> +#define TMP_REG1 TCG_REG_AT
> +#define TMP_REG2 TCG_REG_T9
> +
> +/*
> + * Save the address of TB's epilogue.
> + */
> +#define TB_RET_OFS \
> +    (TCG_STATIC_CALL_ARGS_SIZE + CPU_TEMP_BUF_NLONGS * sizeof(long))
> +
> +/*
> + * If the guest base gets placed in high memory, it's more efficient
> + * to use a register to hold the address.
> + */
> +#ifndef CONFIG_USE_GUEST_BASE
> +#define GUEST_BASE 0
> +#endif

Ditto here, CONFIG_USE_GUEST_BASE has now been removed.

> +#define USE_GUEST_BASE_REG (GUEST_BASE > 0x7fff0000)
> +#define TCG_GUEST_BASE_REG TCG_REG_S5
> +
> +/*
> + * Constant constraint mask values.
> + */
> +#define TCG_CT_CONST_U8     0x100
> +#define TCG_CT_CONST_ZERO   0x200
> +#define TCG_CT_CONST_ANDI   0x400
> +#define TCG_CT_CONST_PN255  0x800
> +
> +#define INSN_OP(x)     (((x) & 0x3f) << 26)
> +#define INSN_FUNC1(x)  (((x) & 0x3) << 14)
> +#define INSN_FUNC2(x)  (((x) & 0x7f) << 5)
> +#define INSN_RA(x)     (TCG_TO_HW_REGNO(x) << 21)
> +#define INSN_RB(x)     (TCG_TO_HW_REGNO(x) << 16)
> +#define INSN_RC(x)     (TCG_TO_HW_REGNO(x))
> +#define INSN_LIT(x)    (((x) & 0xff) << 13)
> +#define INSN_DISP16(x) ((x) & 0xffff)
> +#define INSN_DISP21(x) ((x) & 0x1fffff)
> +#define INSN_RSVED(x)  ((x) & 0x3fff)
> +
> +typedef enum AlphaOpcode {
> +    INSN_ADDL       = INSN_OP(0x10) | INSN_FUNC2(0x00),
> +    INSN_ADDQ       = INSN_OP(0x10) | INSN_FUNC2(0x20),
> +    INSN_AND        = INSN_OP(0x11) | INSN_FUNC2(0x00),
> +    INSN_BEQ        = INSN_OP(0x39),
> +    INSN_BGE        = INSN_OP(0x3e),
> +    INSN_BGT        = INSN_OP(0x3f),
> +    INSN_BIC        = INSN_OP(0x11) | INSN_FUNC2(0x08),
> +    INSN_BIS        = INSN_OP(0x11) | INSN_FUNC2(0x20),
> +    INSN_BLE        = INSN_OP(0x3b),
> +    INSN_BLT        = INSN_OP(0x3a),
> +    INSN_BNE        = INSN_OP(0x3d),
> +    INSN_BR         = INSN_OP(0x30),
> +    INSN_BSR        = INSN_OP(0x34),
> +    INSN_CMOVEQ     = INSN_OP(0x11) | INSN_FUNC2(0x24),
> +    INSN_CMOVGE     = INSN_OP(0x11) | INSN_FUNC2(0x46),
> +    INSN_CMOVGT     = INSN_OP(0x11) | INSN_FUNC2(0x66),
> +    INSN_CMOVLE     = INSN_OP(0x11) | INSN_FUNC2(0x64),
> +    INSN_CMOVLT     = INSN_OP(0x11) | INSN_FUNC2(0x44),
> +    INSN_CMOVNE     = INSN_OP(0x11) | INSN_FUNC2(0x26),
> +    INSN_CMPEQ      = INSN_OP(0x10) | INSN_FUNC2(0x2d),
> +    INSN_CMPLE      = INSN_OP(0x10) | INSN_FUNC2(0x6d),
> +    INSN_CMPLT      = INSN_OP(0x10) | INSN_FUNC2(0x4d),
> +    INSN_CMPULE     = INSN_OP(0x10) | INSN_FUNC2(0x3d),
> +    INSN_CMPULT     = INSN_OP(0x10) | INSN_FUNC2(0x1d),
> +    INSN_EQV        = INSN_OP(0x11) | INSN_FUNC2(0x48),
> +    INSN_EXTBL      = INSN_OP(0x12) | INSN_FUNC2(0x06),
> +    INSN_EXTWH      = INSN_OP(0x12) | INSN_FUNC2(0x5a),
> +    INSN_EXTWL      = INSN_OP(0x12) | INSN_FUNC2(0x16),
> +    INSN_INSBL      = INSN_OP(0x12) | INSN_FUNC2(0x0b),
> +    INSN_INSLH      = INSN_OP(0x12) | INSN_FUNC2(0x67),
> +    INSN_INSLL      = INSN_OP(0x12) | INSN_FUNC2(0x2b),
> +    INSN_INSWL      = INSN_OP(0x12) | INSN_FUNC2(0x1b),
> +    INSN_JMP        = INSN_OP(0x1a) | INSN_FUNC1(0),
> +    INSN_JSR        = INSN_OP(0x1a) | INSN_FUNC1(1),
> +    INSN_LDA        = INSN_OP(0x08),
> +    INSN_LDAH       = INSN_OP(0x09),
> +    INSN_LDBU       = INSN_OP(0x0a),
> +    INSN_LDL        = INSN_OP(0x28),
> +    INSN_LDQ        = INSN_OP(0x29),
> +    INSN_LDWU       = INSN_OP(0x0c),
> +    INSN_MSKBL      = INSN_OP(0x12) | INSN_FUNC2(0x02),
> +    INSN_MSKLL      = INSN_OP(0x12) | INSN_FUNC2(0x22),
> +    INSN_MSKWL      = INSN_OP(0x12) | INSN_FUNC2(0x12),
> +    INSN_MULL       = INSN_OP(0x13) | INSN_FUNC2(0x00),
> +    INSN_MULQ       = INSN_OP(0x13) | INSN_FUNC2(0x20),
> +    INSN_ORNOT      = INSN_OP(0x11) | INSN_FUNC2(0x28),
> +    INSN_RET        = INSN_OP(0x1a) | INSN_FUNC1(2),
> +    INSN_S4ADDL     = INSN_OP(0x10) | INSN_FUNC2(0x02),
> +    INSN_S8ADDL     = INSN_OP(0x10) | INSN_FUNC2(0x12),
> +    INSN_SEXTB      = INSN_OP(0x1c) | INSN_FUNC2(0x00),
> +    INSN_SEXTW      = INSN_OP(0x1c) | INSN_FUNC2(0x01),
> +    INSN_SLL        = INSN_OP(0x12) | INSN_FUNC2(0x39),
> +    INSN_SRA        = INSN_OP(0x12) | INSN_FUNC2(0x3c),
> +    INSN_SRL        = INSN_OP(0x12) | INSN_FUNC2(0x34),
> +    INSN_STB        = INSN_OP(0x0e),
> +    INSN_STL        = INSN_OP(0x2c),
> +    INSN_STQ        = INSN_OP(0x2d),
> +    INSN_STW        = INSN_OP(0x0d),
> +    INSN_SUBL       = INSN_OP(0x10) | INSN_FUNC2(0x09),
> +    INSN_SUBQ       = INSN_OP(0x10) | INSN_FUNC2(0x29),
> +    INSN_XOR        = INSN_OP(0x11) | INSN_FUNC2(0x40),
> +    INSN_ZAPNOT     = INSN_OP(0x12) | INSN_FUNC2(0x31),
> +
> +    INSN_BUGCHK     = INSN_OP(0x00) | INSN_DISP16(0x81),
> +
> +    INSN_NOP        = INSN_BIS
> +                      | INSN_RA(TCG_REG_ZERO)
> +                      | INSN_RB(TCG_REG_ZERO)
> +                      | INSN_RC(TCG_REG_ZERO),
> +} AlphaOpcode;
> +
> +/*
> + * Given a constraint, fill in the available register set or constant range.
> + */
> +static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
> +{
> +    const char *ct_str = *pct_str;
> +
> +    switch (ct_str[0]) {
> +    case 'r':
> +        /* Constraint 'r' means any register is okay.  */
> +        ct->ct |= TCG_CT_REG;
> +        tcg_regset_set32(ct->u.regs, 0, 0xffffffffu);
> +        break;
> +
> +    case 'a':
> +        /* Constraint 'a' means $24, one of the division inputs.  */
> +        ct->ct |= TCG_CT_REG;
> +        tcg_regset_clear(ct->u.regs);
> +        tcg_regset_set_reg(ct->u.regs, TCG_REG_T10);
> +        break;
> +
> +    case 'b':
> +        /* Constraint 'b' means $25, one of the division inputs.  */
> +        ct->ct |= TCG_CT_REG;
> +        tcg_regset_clear(ct->u.regs);
> +        tcg_regset_set_reg(ct->u.regs, TCG_REG_T11);
> +        break;
> +
> +    case 'c':
> +        /* Constraint 'c' means $27, the call procedure vector,
> +           as well as the division output.  */
> +        ct->ct |= TCG_CT_REG;
> +        tcg_regset_clear(ct->u.regs);
> +        tcg_regset_set_reg(ct->u.regs, TCG_REG_PV);
> +        break;
> +
> +    case 'L':
> +        /* Constraint for qemu_ld/st.  The extra reserved registers are
> +           used for passing the parameters to the helper function.  */
> +        ct->ct |= TCG_CT_REG;
> +        tcg_regset_set32(ct->u.regs, 0, 0xffffffffu);
> +        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
> +        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A1);
> +        break;
> +
> +    case 'I':
> +        /* Constraint 'I' means an immediate 0 ... 255.  */
> +        ct->ct |= TCG_CT_CONST_U8;
> +        break;
> +
> +    case 'J':
> +        /* Constraint 'J' means the immediate 0.  */
> +        ct->ct |= TCG_CT_CONST_ZERO;
> +        break;
> +
> +    case 'K':
> +        /* Constraint 'K' means an immediate -255..255.  */
> +        ct->ct |= TCG_CT_CONST_PN255;

I9 might be easier to read/remember.

> +        break;
> +
> +    case 'M':
> +        /* Constraint 'M' means constants used with AND/BIC/ZAPNOT.  */
> +        ct->ct |= TCG_CT_CONST_ANDI;
> +        break;
> +

Do these names correspond to something besides alphabetical order? They
might be difficult to remember; something like 'Z' for the zero
register, 'U' for an unsigned constant and 'I' for a signed constant
might be better.
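
Something along these lines, as an untested sketch that just renames the
letters while keeping the existing constraint masks:

    case 'Z':
        /* Constraint 'Z': the constant zero.  */
        ct->ct |= TCG_CT_CONST_ZERO;
        break;

    case 'U':
        /* Constraint 'U': unsigned 8-bit immediate, as accepted by the
           operate-format instructions.  */
        ct->ct |= TCG_CT_CONST_U8;
        break;

    case 'I':
        /* Constraint 'I': signed immediate in the range -255..255.  */
        ct->ct |= TCG_CT_CONST_PN255;
        break;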

> +    default:
> +        return -1;
> +    }
> +
> +    ct_str++;
> +    *pct_str = ct_str;
> +    return 0;
> +}
> +
> +static int tcg_match_zapnot(tcg_target_long val)
> +{
> +    tcg_target_long mask0, maskff;
> +
> +    /* Since we know this is an alpha host, speed the check by using
> +       cmpbge to compare 8 bytes at once, and incidentally also
> +       produce the zapnot mask.  */
> +    /* ??? This builtin was implemented sometime in 2002,
> +       perhaps in the GCC 3.1 timeframe.  */
> +    mask0 = __builtin_alpha_cmpbge(0, val);
> +    maskff = __builtin_alpha_cmpbge(val, -1);
> +
> +    /* Here, mask0 contains the bytes that are 0, maskff contains
> +       the bytes that are 0xff; that should cover the entire word.  */
> +    if ((mask0 | maskff) == 0xff) {
> +        return maskff;
> +    }
> +    return 0;
> +}
> +
> +static int tcg_match_andi(tcg_target_long val)
> +{
> +    if (val == (val & 0xff)) {
> +        return 1;  /* and */
> +    } else if (~val == (~val & 0xff)) {
> +        return 1;  /* bic */
> +    } else {
> +        return tcg_match_zapnot(val) != 0;
> +    }
> +}
> +
> +static inline int tcg_target_const_match(tcg_target_long val,
> +                                         const TCGArgConstraint *arg_ct)
> +{
> +    int ct = arg_ct->ct;
> +    if (ct & TCG_CT_CONST) {
> +        return 1;
> +    } else if (ct & TCG_CT_CONST_U8) {
> +        return val == (uint8_t)val;
> +    } else if (ct & TCG_CT_CONST_ZERO) {
> +        return val == 0;
> +    } else if (ct & TCG_CT_CONST_ANDI) {
> +        return tcg_match_andi(val);
> +    } else if (ct & TCG_CT_CONST_PN255) {
> +        return val >= -255 && val <= 255;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static inline void tcg_out_fmt_br(TCGContext *s, AlphaOpcode opc,
> +                                  TCGReg ra, int disp)
> +{
> +    tcg_out32(s, opc | INSN_RA(ra) | INSN_DISP21(disp));
> +}
> +
> +static inline void tcg_out_fmt_mem(TCGContext *s, AlphaOpcode opc,
> +                                   TCGReg ra, TCGReg rb, int disp)
> +{
> +    assert(disp != (int16_t)disp);
> +    tcg_out32(s, opc | INSN_RA(ra) | INSN_RB(rb) | INSN_DISP16(disp));
> +}
> +
> +static inline void tcg_out_fmt_jmp(TCGContext *s, AlphaOpcode opc,
> +                                   TCGReg ra, TCGReg rb, int rsved)
> +{
> +    tcg_out32(s, opc | INSN_RA(ra) | INSN_RB(rb) | INSN_RSVED(rsved));
> +}
> +
> +static inline void tcg_out_fmt_opr(TCGContext *s, AlphaOpcode opc,
> +                                   TCGReg ra, TCGReg rb, TCGReg rc)
> +{
> +    tcg_out32(s, opc | INSN_RA(ra) | INSN_RB(rb) | INSN_RC(rc));
> +}
> +
> +static inline void tcg_out_fmt_opi(TCGContext *s, AlphaOpcode opc,
> +                                   TCGReg ra, tcg_target_ulong lit, TCGReg rc)
> +{
> +    assert(lit <= 0xff);
> +    tcg_out32(s, opc | INSN_RA(ra) | INSN_LIT(lit) | INSN_RC(rc) | (1<<12));
> +}
> +
> +/*
> + * Move from one reg to another.  This is called from tcg.c.
> + */
> +static inline void tcg_out_mov(TCGContext *s, TCGType type,
> +                               TCGReg rc, TCGReg rb)
> +{
> +    if (type == TCG_TYPE_I32) {
> +        /* Also used for 64->32 bit truncation, so don't elide copies.  */
> +        tcg_out_fmt_opr(s, INSN_ADDL, TCG_REG_ZERO, rb, rc);
> +    } else if (rb != rc) {
> +        tcg_out_fmt_opr(s, INSN_BIS, TCG_REG_ZERO, rb, rc);
> +    }
> +}
> +
> +/*
> + * Helper function to emit a memory format operation with a displacement
> + * that may be larger than the 16 bits accepted by the real instruction.
> + */
> +static void tcg_out_mem_long(TCGContext *s, AlphaOpcode opc, TCGReg ra,
> +                             TCGReg rb, tcg_target_long orig)
> +{
> +    tcg_target_long l0, l1, extra = 0, val = orig;
> +    TCGReg rs;
> +
> +    /* Pick a scratch register.  Use the output register, if possible.  */
> +    switch (opc) {
> +    default:
> +        if (ra != rb) {
> +            rs = ra;
> +            break;
> +        }
> +        /* FALLTHRU */
> +
> +    case INSN_STB:
> +    case INSN_STW:
> +    case INSN_STL:
> +    case INSN_STQ:
> +        assert(ra != TMP_REG1);
> +        rs = TMP_REG1;
> +        break;
> +    }

Are there cases where the scratch register can't be TMP_REG1 or
TMP_REG2? This part of the code seems fragile, despite the assert().

> +
> +    /* See if we can turn a large absolute address into an offset from $gp.
> +       Note that we assert via -msmall-data and --warn-multiple-gp that
> +       the $gp value is constant everywhere.  Which means that the translated
> +       code shares the same value as we have loaded right now.  */
> +    if (rb == TCG_REG_ZERO && orig != (int32_t)orig) {
> +        register tcg_target_long gp __asm__("$29");
> +        tcg_target_long gprel = orig - gp;
> +
> +        if (gprel == (int32_t)gprel) {
> +            orig = val = gprel;
> +            rb = TCG_REG_GP;
> +        }
> +    }

Does this really match a lot of cases? AFAIU most of the loads/stores
are going to be relative to the cpu_env variable. Also it means a 16-bit
value that could be loaded through a 32-bit offset to gprel might be
loaded in 2 instructions instead of one.

> +
> +    l0 = (int16_t)val;
> +    val = (val - l0) >> 16;
> +    l1 = (int16_t)val;
> +
> +    if (orig == (int32_t)orig) {
> +        if (l1 < 0 && orig >= 0) {
> +            extra = 0x4000;
> +            l1 = (int16_t)(val - 0x4000);
> +        }

I don't get the use case of this. AFAIU a 32-bit signed value can be
loaded with LDAH + LDA. Here it seems that it ends up being loaded with
LDAH + LDAH + LDA.

> +    } else {
> +        tcg_target_long l2, l3;
> +        int rh = TCG_REG_ZERO;
> +
> +        val = (val - l1) >> 16;
> +        l2 = (int16_t)val;
> +        val = (val - l2) >> 16;
> +        l3 = (int16_t)val;
> +
> +        if (l3) {
> +            tcg_out_fmt_mem(s, INSN_LDAH, rs, rh, l3);
> +            rh = rs;
> +        }
> +        if (l2) {
> +            tcg_out_fmt_mem(s, INSN_LDA, rs, rh, l2);
> +            rh = rs;
> +        }
> +        tcg_out_fmt_opi(s, INSN_SLL, rh, 32, rs);
> +
> +        if (rb != TCG_REG_ZERO) {
> +            tcg_out_fmt_opr(s, INSN_ADDQ, rs, rb, rs);
> +        }
> +        rb = rs;
> +    }
> +
> +    if (l1) {
> +        tcg_out_fmt_mem(s, INSN_LDAH, rs, rb, l1);
> +        rb = rs;
> +    }
> +    if (extra) {
> +        tcg_out_fmt_mem(s, INSN_LDAH, rs, rb, extra);
> +        rb = rs;
> +    }
> +
> +    if (opc != INSN_LDA || rb != ra || l0 != 0) {
> +        tcg_out_fmt_mem(s, opc, ra, rb, l0);
> +    }
> +}
> +
> +static inline void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ra,
> +                                tcg_target_long val)
> +{
> +    if (type == TCG_TYPE_I32) {
> +        val = (int32_t)val;
> +    }
> +    tcg_out_mem_long(s, INSN_LDA, ra, TCG_REG_ZERO, val);
> +}
> +
> +static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ra,
> +                              TCGReg rb, tcg_target_long disp)
> +{
> +    tcg_out_mem_long(s, type == TCG_TYPE_I32 ? INSN_LDL : INSN_LDQ,
> +                     ra, rb, disp);
> +}
> +
> +static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg ra,
> +                              TCGReg rb, tcg_target_long disp)
> +{
> +    tcg_out_mem_long(s, type == TCG_TYPE_I32 ? INSN_STL : INSN_STQ,
> +                     ra, rb, disp);
> +}
> +
> +static void tgen_andi(TCGContext *s, TCGReg ra, tcg_target_long val, TCGReg rc)
> +{
> +    if (val == (val & 0xff)) {
> +        tcg_out_fmt_opi(s, INSN_AND, ra, val, rc);
> +    } else if (~val == (~val & 0xff)) {
> +        tcg_out_fmt_opi(s, INSN_BIC, ra, ~val, rc);
> +    } else {
> +        int mask = tcg_match_zapnot(val);
> +        assert(mask != 0);
> +        tcg_out_fmt_opi(s, INSN_ZAPNOT, ra, mask, rc);
> +    }
> +}
> +
> +static inline void tgen_ext8u(TCGContext *s, TCGReg ra, TCGReg rc)
> +{
> +    tcg_out_fmt_opi(s, INSN_AND, ra, 0xff, rc);
> +}
> +
> +static inline void tgen_ext8s(TCGContext *s, TCGReg ra, TCGReg rc)
> +{
> +    tcg_out_fmt_opr(s, INSN_SEXTB, TCG_REG_ZERO, ra, rc);
> +}
> +
> +static inline void tgen_ext16u(TCGContext *s, TCGReg ra, TCGReg rc)
> +{
> +    tcg_out_fmt_opi(s, INSN_ZAPNOT, ra, 0x03, rc);
> +}
> +
> +static inline void tgen_ext16s(TCGContext *s, TCGReg ra, TCGReg rc)
> +{
> +    tcg_out_fmt_opr(s, INSN_SEXTW, TCG_REG_ZERO, ra, rc);
> +}
> +
> +static inline void tgen_ext32u(TCGContext *s, TCGReg ra, TCGReg rc)
> +{
> +    tcg_out_fmt_opi(s, INSN_ZAPNOT, ra, 0x0f, rc);
> +}
> +
> +static inline void tgen_ext32s(TCGContext *s, TCGReg ra, TCGReg rc)
> +{
> +    tcg_out_fmt_opr(s, INSN_ADDL, TCG_REG_ZERO, ra, rc);
> +}
> +
> +static void tgen_extend(TCGContext *s, int sizeop, TCGReg ra, TCGReg rc)
> +{
> +    switch (sizeop) {
> +    case 0:
> +        tgen_ext8u(s, ra, rc);
> +        break;
> +    case 0 | 4:
> +        tgen_ext8s(s, ra, rc);
> +        break;
> +    case 1:
> +        tgen_ext16u(s, ra, rc);
> +        break;
> +    case 1 | 4:
> +        tgen_ext16s(s, ra, rc);
> +        break;
> +    case 2:
> +        tgen_ext32u(s, ra, rc);
> +        break;
> +    case 2 | 4:
> +        tgen_ext32s(s, ra, rc);
> +        break;
> +    case 3:
> +        tcg_out_mov(s, TCG_TYPE_I64, ra, rc);
> +        break;
> +    default:
> +        tcg_abort();
> +    }
> +}
> +
> +static void tgen_bswap(TCGContext *s, int sizeop, TCGReg ra, TCGReg rc)
> +{
> +    const TCGReg t0 = TMP_REG1, t1 = TMP_REG2;
> +
> +    switch (sizeop) {
> +    case 1:     /* 16-bit swap, unsigned result */
> +    case 1 | 4: /* 16-bit swap, signed result */
> +        /* input value =                                   xxxx xxAB */
> +        tcg_out_fmt_opi(s, INSN_EXTWH, ra, 7, t0);      /* .... ..B. */
> +        tcg_out_fmt_opi(s, INSN_EXTBL, ra, 1, rc);      /* .... ...A */
> +        tcg_out_fmt_opr(s, INSN_BIS, rc, t0, rc);       /* .... ..BA */
> +        if (sizeop & 4) {
> +            tcg_out_fmt_opr(s, INSN_SEXTW, TCG_REG_ZERO, rc, rc);
> +        }
> +        break;
> +
> +    case 2:     /* 32-bit swap, unsigned result */
> +    case 2 | 4: /* 32-bit swap, signed result */
> +        /* input value =                                   xxxx ABCD */
> +        tcg_out_fmt_opi(s, INSN_INSLH, ra, 7, t0);      /* .... .ABC */
> +        tcg_out_fmt_opi(s, INSN_INSWL, ra, 3, rc);      /* ...C D... */
> +        tcg_out_fmt_opr(s, INSN_BIS, t0, rc, rc);       /* ...C DABC */
> +        tcg_out_fmt_opi(s, INSN_SRL, rc, 16, t0);       /* .... .CDA */
> +        tcg_out_fmt_opi(s, INSN_ZAPNOT, rc, 0x0A, rc);  /* .... D.B. */
> +        tcg_out_fmt_opi(s, INSN_ZAPNOT, t0, 0x05, t0);  /* .... .C.A */
> +        tcg_out_fmt_opr(s, (sizeop & 4 ? INSN_ADDL : INSN_BIS), t0, rc, rc);
> +        break;
> +
> +    case 3:     /* 64-bit swap */
> +        /* input value =                                   ABCD EFGH */
> +        tcg_out_fmt_opi(s, INSN_SRL, ra, 24, t0);       /* ...A BCDE */
> +        tcg_out_fmt_opi(s, INSN_SLL, ra, 24, t1);       /* DEFG H... */
> +        tcg_out_fmt_opi(s, INSN_ZAPNOT, t0, 0x11, t0);  /* ...A ...E */
> +        tcg_out_fmt_opi(s, INSN_ZAPNOT, t1, 0x88, t1);  /* D... H... */
> +        tcg_out_fmt_opr(s, INSN_BIS, t0, t1, t1);       /* D..A H..E */
> +        tcg_out_fmt_opi(s, INSN_SRL, ra, 8, t0);        /* .ABC DEFG */
> +        tcg_out_fmt_opi(s, INSN_ZAPNOT, t0, 0x22, t0);  /* ..B. ..F. */
> +        tcg_out_fmt_opr(s, INSN_BIS, t0, t1, t1);       /* D.BA H.FE */
> +        tcg_out_fmt_opi(s, INSN_SLL, ra, 8, t0);        /* BCDE FGH. */
> +        tcg_out_fmt_opi(s, INSN_ZAPNOT, t1, 0x44, t0);  /* .C.. .G.. */
> +        tcg_out_fmt_opr(s, INSN_BIS, t1, t1, t1);       /* DCBA HGFE */
> +        tcg_out_fmt_opi(s, INSN_SRL, t1, 32, t0);       /* .... DCBA */
> +        tcg_out_fmt_opi(s, INSN_SLL, t1, 32, t1);       /* HGFE .... */
> +        tcg_out_fmt_opr(s, INSN_BIS, t0, t1, rc);       /* HGFE DCBA */
> +        break;
> +
> +    default:
> +        tcg_abort();
> +    }
> +}
> +
> +static void tcg_out_ld_sz(TCGContext *s, int sizeop, TCGReg ra, TCGReg rb,
> +                          tcg_target_long disp)
> +{
> +    static const AlphaOpcode ld_opc[4] = {
> +        INSN_LDBU, INSN_LDWU, INSN_LDL, INSN_LDQ
> +    };
> +
> +    tcg_out_mem_long(s, ld_opc[sizeop & 3], ra, rb, disp);
> +
> +    switch (sizeop) {
> +    case 0 | 4 | 8:
> +    case 0 | 4:
> +    case 1 | 4:
> +    case 2:
> +        tgen_extend(s, sizeop & 7, ra, ra);
> +        break;
> +
> +    case 0:
> +    case 0 | 8:
> +    case 1:
> +    case 2 | 4:
> +    case 3:
> +        break;

As far as I understand, bit 2 (i.e. the value 4) means sign extension.
I find it strange to see "2" sign-extended and "2 | 4" not
sign-extended.

> +
> +    case 1 | 8:
> +    case 1 | 4 | 8:
> +    case 2 | 8:
> +    case 2 | 4 | 8:
> +    case 3 | 8:
> +        tgen_bswap(s, sizeop & 7, ra, ra);
> +        break;
> +
> +    default:
> +        tcg_abort();
> +    }
> +}
> +
> +static void tcg_out_st_sz(TCGContext *s, int sizeop, TCGReg ra, TCGReg rb,
> +                          tcg_target_long disp)
> +{
> +    static const AlphaOpcode st_opc[4] = {
> +        INSN_STB, INSN_STW, INSN_STL, INSN_STQ
> +    };
> +
> +    tcg_out_mem_long(s, st_opc[sizeop & 3], ra, rb, disp);
> +}

This is technically correct, but is there any reason why the bswap can
be done in tcg_out_ld_sz() while tcg_out_st_sz() doesn't support it?

> +
> +static void patch_reloc(uint8_t *x_ptr, int type,
> +                        tcg_target_long value, tcg_target_long addend)
> +{
> +    uint32_t *code_ptr = (uint32_t *)x_ptr;
> +    uint32_t insn = *code_ptr;
> +
> +    value += addend;
> +    switch (type) {
> +    case R_ALPHA_BRADDR:
> +        value -= (tcg_target_long)x_ptr + 4;
> +        if ((value & 3) || value < -0x400000 || value >= 0x400000) {
> +            tcg_abort();
> +        }
> +        *code_ptr = (insn & ~0x1fffff) | INSN_DISP21(value >> 2);
> +        break;
> +
> +    default:
> +        tcg_abort();
> +    }
> +}
> +
> +static void tcg_out_br(TCGContext *s, int opc, TCGReg ra, int label_index)
> +{
> +    TCGLabel *l = &s->labels[label_index];
> +    tcg_target_long value;
> +
> +    if (l->has_value) {
> +        value = l->u.value;
> +        value -= (tcg_target_long)s->code_ptr + 4;
> +        if ((value & 3) || value < -0x400000 || value >= 0x400000) {
> +            tcg_abort();
> +        }
> +        value >>= 2;
> +    } else {
> +        tcg_out_reloc(s, s->code_ptr, R_ALPHA_BRADDR, label_index, 0);
> +        /* We need to keep the offset unchanged for retranslation.
> +           The field loaded here will be masked in tcg_out_fmt_br.  */
> +        value = *(uint32_t *) s->code_ptr;
> +    }
> +    tcg_out_fmt_br(s, opc, ra, value);
> +}
> +
> +static void tcg_out_const_call(TCGContext *s, tcg_target_long dest)
> +{
> +    const uint16_t *check = (const uint16_t *) dest;
> +    tcg_target_long disp;
> +
> +    /* ??? Ideally we'd have access to Elf64_Sym.st_other, which
> +       would tell us definitively whether the target function uses
> +       the incoming PV value.  Make a simplifying assumption here
> +       that all of the compiler-generated code that we're calling
> +       either computes the GP from the PV in the first two insns
> +       or it doesn't use the PV at all.  This assumption holds in
> +       general for just about anything except some hand-written
> +       assembly, which we're not calling into.  */
> +
> +    /* Note we access the insn stream as 16-bit units to avoid having
> +       to mask out the offsets of the ldah and lda insns.  */
> +    if (check[1] == 0x27bb && check[3] == 0x23bd) {
> +        /* Skip the GP computation.  We can do this even if the
> +           direct branch is out of range.  */
> +        dest += 8;
> +    }
> +
> +    disp = dest - ((tcg_target_long)s->code_ptr + 4);
> +    if (disp >= -0x400000 && disp < 0x400000) {
> +        tcg_out_fmt_br(s, INSN_BSR, TCG_REG_RA, disp >> 2);
> +    } else {
> +        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_PV, dest);
> +        tcg_out_fmt_jmp(s, INSN_JSR, TCG_REG_RA, TCG_REG_PV, dest);
> +    }

I don't fully understand what you are trying to do. Is this optimization
really worthwhile? Skipping two instructions in the case of a call,
which is an expensive operation anyway, should not make any visible
difference.

> +}
> +
> +static void tcg_out_deposit(TCGContext *s, TCGReg dest, TCGReg arg1,
> +                            TCGReg arg2, int ofs, int len, int is_64)
> +{
> +    AlphaOpcode ins_opc, msk_opc;
> +
> +    switch (len) {
> +    case 8:
> +        ins_opc = INSN_INSBL;
> +        msk_opc = INSN_MSKBL;
> +        break;
> +    case 16:
> +        ins_opc = INSN_INSWL;
> +        msk_opc = INSN_MSKWL;
> +        break;
> +    case 32:
> +        ins_opc = INSN_INSLL;
> +        msk_opc = INSN_MSKLL;
> +    default:
> +        tcg_abort();
> +    }
> +
> +    /* Convert the bit offset to a byte offset.  */
> +    ofs >>= 3;
> +
> +    if (arg1 == TCG_REG_ZERO) {
> +        tcg_out_fmt_opi(s, ins_opc, arg2, ofs, dest);
> +        if (!is_64 && len + ofs * 8 == 32) {
> +            tgen_ext32s(s, dest, dest);
> +        }
> +    } else if (arg2 == TCG_REG_ZERO) {
> +        tcg_out_fmt_opi(s, msk_opc, arg1, ofs, dest);
> +    } else {
> +        tcg_out_fmt_opi(s, ins_opc, arg2, ofs, TMP_REG1);
> +        tcg_out_fmt_opi(s, msk_opc, arg1, ofs, dest);
> +        tcg_out_fmt_opr(s, is_64 ? INSN_BIS : INSN_ADDL, dest, TMP_REG1, dest);
> +    }
> +}
> +
> +/* The low bit of these entries indicates that the result of
> +   the comparison must be inverted.  This bit should not be
> +   output with the rest of the instruction.  */
> +static const int cmp_opc[] = {
> +    [TCG_COND_EQ] = INSN_CMPEQ,
> +    [TCG_COND_NE] = INSN_CMPEQ | 1,
> +    [TCG_COND_LT] = INSN_CMPLT,
> +    [TCG_COND_GE] = INSN_CMPLT | 1,
> +    [TCG_COND_LE] = INSN_CMPLE,
> +    [TCG_COND_GT] = INSN_CMPLE | 1,
> +    [TCG_COND_LTU] = INSN_CMPULT,
> +    [TCG_COND_GEU] = INSN_CMPULT | 1,
> +    [TCG_COND_LEU] = INSN_CMPULE,
> +    [TCG_COND_GTU] = INSN_CMPULE | 1
> +};
> +
> +static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg dest,
> +                            TCGReg c1, TCGArg c2, int c2const)
> +{
> +    AlphaOpcode opc = cmp_opc[cond] & ~1;
> +
> +    if (c2const) {
> +        tcg_out_fmt_opi(s, opc, c1, c2, dest);
> +    } else {
> +        tcg_out_fmt_opr(s, opc, c1, c2, dest);
> +    }
> +
> +    if (cmp_opc[cond] & 1) {
> +        tcg_out_fmt_opi(s, INSN_XOR, dest, 1, dest);
> +    }
> +}
> +
> +static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg dest,
> +                            TCGReg c1, TCGArg c2, int c2const,
> +                            TCGArg v1, int v1const)
> +{
> +    /* Note that unsigned comparisons are not present here, which means
> +       that their entries will contain zeros.  */
> +    static const AlphaOpcode cmov_opc[] = {
> +        [TCG_COND_EQ] = INSN_CMOVEQ,
> +        [TCG_COND_NE] = INSN_CMOVNE,
> +        [TCG_COND_LT] = INSN_CMOVLT,
> +        [TCG_COND_GE] = INSN_CMOVGE,
> +        [TCG_COND_LE] = INSN_CMOVLE,
> +        [TCG_COND_GT] = INSN_CMOVGT
> +    };
> +
> +    AlphaOpcode opc = 0;
> +
> +    /* Notice signed comparisons vs zero.  These are handled by the
> +       cmov instructions directly.  */
> +    if (c2 == 0) {
> +        opc = cmov_opc[cond];
> +    }
> +
> +    /* Otherwise, generate a comparison into a temporary.  */
> +    if (opc == 0) {
> +        opc = cmp_opc[cond] & ~1;
> +        if (c2const) {
> +            tcg_out_fmt_opi(s, opc, c1, c2, TMP_REG1);
> +        } else {
> +            tcg_out_fmt_opr(s, opc, c1, c2, TMP_REG1);
> +        }
> +
> +        opc = (cmp_opc[cond] & 1 ? INSN_CMOVEQ : INSN_CMOVNE);
> +        c1 = TMP_REG1;
> +    }
> +
> +    if (v1const) {
> +        tcg_out_fmt_opi(s, opc, c1, v1, dest);
> +    } else {
> +        tcg_out_fmt_opr(s, opc, c1, v1, dest);
> +    }
> +}
> +
> +static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGReg arg1,
> +                           TCGArg arg2, int const_arg2, int label_index)
> +{
> +    /* Note that unsigned comparisons are not present here, which means
> +       that their entries will contain zeros.  */
> +    static const AlphaOpcode br_opc[] = {
> +        [TCG_COND_EQ] = INSN_BEQ,
> +        [TCG_COND_NE] = INSN_BNE,
> +        [TCG_COND_LT] = INSN_BLT,
> +        [TCG_COND_GE] = INSN_BGE,
> +        [TCG_COND_LE] = INSN_BLE,
> +        [TCG_COND_GT] = INSN_BGT
> +    };
> +
> +    AlphaOpcode opc = 0;
> +
> +    /* Notice signed comparisons vs zero.  These are handled by the
> +       branch instructions directly.  */
> +    if (arg2 == 0) {
> +        opc = br_opc[cond];
> +    }
> +
> +    /* Otherwise, generate a comparison into a temporary.  */
> +    if (opc == 0) {
> +        opc = cmp_opc[cond] & ~1;
> +        if (const_arg2) {
> +            tcg_out_fmt_opi(s, opc, arg1, arg2, TMP_REG1);
> +        } else {
> +            tcg_out_fmt_opr(s, opc, arg1, arg2, TMP_REG1);
> +        }
> +
> +        opc = (cmp_opc[cond] & 1 ? INSN_BEQ : INSN_BNE);
> +        arg1 = TMP_REG1;
> +    }
> +
> +    tcg_out_br(s, opc, arg1, label_index);
> +}
> +
> +/* Note that these functions don't have normal C calling conventions.  */
> +typedef long divfn(long, long);
> +extern divfn __divl, __divlu, __reml, __remlu;
> +extern divfn __divq, __divqu, __remq, __remqu;
> +
> +static void tcg_out_div(TCGContext *s, int sizeop)
> +{
> +    static divfn * const libc_div[16] = {
> +        [2] = __divlu,
> +        [2 | 8] = __remlu,
> +        [2 | 4] = __divl,
> +        [2 | 4 | 8] = __reml,
> +
> +        [3] = __divqu,
> +        [3 | 8] = __remqu,
> +        [3 | 4] = __divq,
> +        [3 | 4 | 8] = __remq,
> +    };
> +
> +    tcg_target_long val, disp;
> +
> +    val = (tcg_target_long) libc_div[sizeop];
> +    assert(val != 0);
> +
> +    disp = val - ((tcg_target_long)s->code_ptr + 4);
> +    if (disp >= -0x400000 && disp < 0x400000) {
> +        tcg_out_fmt_br(s, INSN_BSR, TCG_REG_T9, disp >> 2);
> +    } else {
> +        tcg_out_movi(s, TCG_TYPE_PTR, TMP_REG1, val);
> +        tcg_out_fmt_jmp(s, INSN_JSR, TCG_REG_T9, TMP_REG1, val);
> +    }
> +}

For host architectures which don't provide a direct way to do a
division, TCG provides helpers, so that these architectures do not have
to re-invent the wheel.

For that, just define TCG_TARGET_HAS_div_i{32,64} to 0.
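
I.e. something like this in tcg/alpha/tcg-target.h (sketch):

    /* No inline division; let TCG call its generic division helpers
       instead of jumping to __divq and friends.  */
    #define TCG_TARGET_HAS_div_i32  0
    #define TCG_TARGET_HAS_div_i64  0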


> +#if defined(CONFIG_SOFTMMU)
> +
> +#include "../../softmmu_defs.h"
> +
> +static void *qemu_ld_helpers[4] = {
> +    helper_ldb_mmu,
> +    helper_ldw_mmu,
> +    helper_ldl_mmu,
> +    helper_ldq_mmu,
> +};
> +
> +static void *qemu_st_helpers[4] = {
> +    helper_stb_mmu,
> +    helper_stw_mmu,
> +    helper_stl_mmu,
> +    helper_stq_mmu,
> +};
> +
> +static void tgen_andi_tmp(TCGContext *s, TCGReg ra, uint64_t val, TCGReg rc)
> +{
> +    if (!tcg_match_andi(val)) {
> +        tcg_out_movi(s, TCG_TYPE_I64, TMP_REG1, val);
> +        tcg_out_fmt_opr(s, INSN_AND, ra, TMP_REG1, rc);
> +    } else {
> +        tgen_andi(s, ra, val, rc);
> +    }
> +}
> +
> +static void tcg_out_tlb_cmp(TCGContext *s, int sizeop, TCGReg r0, TCGReg r1,
> +                            TCGReg addr_reg, int label1, long tlb_offset)
> +{
> +    int addrsizeop = TARGET_LONG_BITS == 32 ? 2 : 3;
> +    unsigned long val;
> +
> +    /* Mask the page, plus the low bits of the access, into TMP3.  Note
> +       that the low bits are added in order to catch unaligned accesses,
> +       as those bits won't be set in the TLB entry.  For 32-bit targets,
> +       force the high bits of the mask to be zero, as the high bits of
> +       the input register are garbage.  */
> +    val = TARGET_PAGE_MASK | ((1 << (sizeop & 3)) - 1);
> +    if (TARGET_LONG_BITS == 32) {
> +        val &= 0xfffffffful;
> +    }
> +    tgen_andi_tmp(s, addr_reg, val, TMP_REG1);
> +
> +    /* Compute the index into the TLB into R1.  Again, note that the
> +       high bits of a 32-bit address must be cleared.  */
> +    tcg_out_fmt_opi(s, INSN_SRL, addr_reg,
> +                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS, r1);
> +
> +    val = (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS;
> +    if (TARGET_LONG_BITS == 32) {
> +        val &= 0xfffffffful >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
> +    }
> +    tgen_andi_tmp(s, r1, val, r1);
> +
> +    /* Load the word at (R1 + CPU_ENV + TLB_OFFSET).  Note that we
> +       arrange for a 32-bit load to be zero-extended.  */
> +    tcg_out_fmt_opr(s, INSN_ADDQ, r1, TCG_AREG0, r1);
> +    tcg_out_ld_sz(s, addrsizeop, TMP_REG2, r1, tlb_offset);
> +
> +    /* Copy the original address into R0.  This is needed on the
> +       slow path through the helper function.  */
> +    tgen_extend(s, addrsizeop, addr_reg, r0);
> +
> +    /* Compare TMP1 with the value loaded from the TLB.  */
> +    tcg_out_brcond(s, TCG_COND_NE, TMP_REG2, TMP_REG1, 0, label1);
> +}
> +#endif /* SOFTMMU */
> +
> +static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int sizeop)
> +{
> +    TCGReg addr_reg, data_reg, r0;
> +    long ofs;
> +    int bswap;
> +#if defined(CONFIG_SOFTMMU)
> +    TCGReg r1;
> +    int label1, label2, mem_index;
> +#endif
> +
> +    data_reg = *args++;
> +    addr_reg = *args++;
> +
> +#if defined(CONFIG_SOFTMMU)
> +    mem_index = *args;
> +    r0 = TCG_REG_A1;
> +    r1 = TCG_REG_A0;
> +
> +    label1 = gen_new_label();
> +    label2 = gen_new_label();
> +
> +    tcg_out_tlb_cmp(s, sizeop, r0, r1, addr_reg, label1,
> +                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
> +
> +    /* TLB Hit.  Note that Alpha statically predicts forward branch as
> +       not taken, so arrange the fallthru as the common case.
> +
> +       R0 contains the guest address, and R1 contains the pointer
> +       to CPU_ENV plus the TLB entry offset.  */
> +
> +    tcg_out_ld(s, TCG_TYPE_I64, r1, r1,
> +               offsetof(CPUArchState, tlb_table[mem_index][0].addend));
> +    tcg_out_fmt_opr(s, INSN_ADDQ, r0, r1, r0);
> +    ofs = 0;
> +#else
> +    if (TARGET_LONG_BITS == 32) {
> +        r0 = TCG_REG_A1;
> +        tgen_ext32u(s, addr_reg, r0);
> +    } else {
> +        r0 = addr_reg;
> +    }
> +    if (USE_GUEST_BASE_REG) {
> +        tcg_out_fmt_opr(s, INSN_ADDQ, r0, TCG_GUEST_BASE_REG, TCG_REG_A1);
> +        r0 = TCG_REG_A1;
> +        ofs = 0;
> +    } else {
> +        ofs = GUEST_BASE;
> +    }
> +#endif
> +
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    /* Signal byte swap necessary.  */
> +    bswap = 8;
> +#else
> +    bswap = 0;
> +#endif
> +
> +    /* Perform the actual load.  */
> +    tcg_out_ld_sz(s, sizeop | bswap, data_reg, r0, ofs);
> +
> +#if defined(CONFIG_SOFTMMU)
> +    tcg_out_br(s, INSN_BR, TCG_REG_ZERO, label2);
> +
> +    /* TLB miss.  Call the helper function.  */
> +    tcg_out_label(s, label1, s->code_ptr);
> +    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
> +    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_A2, mem_index);
> +
> +    tcg_out_const_call(s, (tcg_target_long)qemu_ld_helpers[sizeop & 3]);
> +
> +    /* The helper routines have no defined data extension.
> +       Properly extend the result to whatever data type we need.  */
> +    tgen_extend(s, sizeop, TCG_REG_V0, data_reg);
> +
> +    tcg_out_label(s, label2, s->code_ptr);
> +#endif
> +}
> +
> +static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int sizeop)
> +{
> +    TCGReg addr_reg, data_reg, r0;
> +    long ofs;
> +#if defined(CONFIG_SOFTMMU)
> +    TCGReg r1;
> +    int label1, label2, mem_index;
> +#endif
> +
> +    data_reg = *args++;
> +    addr_reg = *args++;
> +
> +#if defined(CONFIG_SOFTMMU)
> +    mem_index = *args;
> +    r0 = TCG_REG_A1;
> +    r1 = TCG_REG_A0;
> +
> +    label1 = gen_new_label();
> +    label2 = gen_new_label();
> +
> +    tcg_out_tlb_cmp(s, sizeop, r0, r1, addr_reg, label1,
> +                    offsetof(CPUArchState,
> +                             tlb_table[mem_index][0].addr_write));
> +
> +    /* TLB Hit.  Note that Alpha statically predicts forward branch as
> +       not taken, so arrange the fallthru as the common case.
> +
> +       R0 contains the guest address, and R1 contains the pointer
> +       to CPU_ENV plus the TLB entry offset.  */
> +
> +    tcg_out_ld(s, TCG_TYPE_I64, r1, r1,
> +               offsetof(CPUArchState, tlb_table[mem_index][0].addend));
> +    tcg_out_fmt_opr(s, INSN_ADDQ, r0, r1, r0);
> +    ofs = 0;
> +#else
> +    if (TARGET_LONG_BITS == 32) {
> +        r0 = TCG_REG_A1;
> +        tgen_ext32u(s, addr_reg, r0);
> +    } else {
> +        r0 = addr_reg;
> +    }
> +    if (USE_GUEST_BASE_REG) {
> +        tcg_out_fmt_opr(s, INSN_ADDQ, r0, TCG_GUEST_BASE_REG, TCG_REG_A1);
> +        r0 = TCG_REG_A1;
> +        ofs = 0;
> +    } else {
> +        ofs = GUEST_BASE;
> +    }
> +#endif
> +
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    /* Byte swap if necessary.  */
> +    if ((sizeop & 3) > 0) {
> +        tgen_bswap(s, sizeop & 3, data_reg, TCG_REG_A0);
> +        data_reg = TCG_REG_A0;
> +    }
> +#endif
> +
> +    /* Perform the actual store.  */
> +    tcg_out_st_sz(s, sizeop, data_reg, r0, ofs);
> +
> +#if defined(CONFIG_SOFTMMU)
> +    tcg_out_br(s, INSN_BR, TCG_REG_ZERO, label2);
> +
> +    /* TLB miss.  Call the helper function.  */
> +    tcg_out_label(s, label1, s->code_ptr);
> +    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_A0, TCG_AREG0);
> +    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_A2, data_reg);
> +    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_A3, mem_index);
> +
> +    tcg_out_const_call(s, (tcg_target_long)qemu_st_helpers[sizeop & 3]);
> +
> +    tcg_out_label(s, label2, s->code_ptr);
> +#endif
> +}
> +
> +static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
> +                              const TCGArg *args, const int *const_args)
> +{
> +    TCGArg arg0, arg1, arg2;
> +    AlphaOpcode insn;
> +    int c;
> +
> +    arg0 = args[0];
> +    arg1 = args[1];
> +    arg2 = args[2];
> +
> +    switch (opc) {
> +    case INDEX_op_exit_tb:
> +        tcg_out_ld(s, TCG_TYPE_PTR, TMP_REG1, TCG_REG_SP, TB_RET_OFS);
> +        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_V0, arg0);
> +        tcg_out_fmt_jmp(s, INSN_RET, TCG_REG_ZERO, TMP_REG1, 0);
> +        break;
> +
> +    case INDEX_op_goto_tb:
> +        if (s->tb_jmp_offset) {
> +            /* Direct jump method.  In the general case we output:
> +                 br   $at,.+4
> +                 ldah $at,hi($at)
> +                 lda  $at,lo($at)
> +                 jmp  $31,($at),0
> +               We need to modify two instructions to set the link.
> +               We want that modification to be atomic, so we arrange
> +               for the ldah+lda pair to be 8-byte aligned.  Which
> +               means that the first branch should be 4 mod 8.  */
> +            if (((uintptr_t)s->code_ptr & 7) == 0) {
> +                tcg_out32(s, INSN_NOP);
> +            }
> +            tcg_out_fmt_br(s, INSN_BR, TMP_REG1, 0);
> +            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
> +            s->code_ptr += 8;
> +        } else {
> +            /* Indirect jump method.  */
> +            tcg_out_ld(s, TCG_TYPE_PTR, TMP_REG1, TCG_REG_ZERO,
> +                       (tcg_target_long)(s->tb_next + arg0));
> +        }
> +        tcg_out_fmt_jmp(s, INSN_JMP, TCG_REG_ZERO, TMP_REG1, 0);
> +        s->tb_next_offset[arg0] = s->code_ptr - s->code_buf;
> +        break;
> +
> +    case INDEX_op_call:
> +        if (const_args[0]) {
> +            tcg_out_const_call(s, arg0);
> +        } else {
> +            tcg_out_fmt_jmp(s, INSN_JSR, TCG_REG_RA, TCG_REG_PV, 0);
> +        }
> +        break;
> +
> +    case INDEX_op_jmp:
> +        tcg_out_fmt_jmp(s, INSN_JMP, TCG_REG_ZERO, arg0, 0);
> +        break;
> +
> +    case INDEX_op_br:
> +        tcg_out_br(s, INSN_BR, TCG_REG_ZERO, arg0);
> +        break;
> +
> +    case INDEX_op_ld8u_i32:
> +    case INDEX_op_ld8u_i64:
> +        c = 0;
> +        goto do_load;
> +    case INDEX_op_ld8s_i32:
> +    case INDEX_op_ld8s_i64:
> +        c = 0 | 4;
> +        goto do_load;
> +    case INDEX_op_ld16u_i32:
> +    case INDEX_op_ld16u_i64:
> +        c = 1;
> +        goto do_load;
> +    case INDEX_op_ld16s_i32:
> +    case INDEX_op_ld16s_i64:
> +        c = 1 | 4;
> +        goto do_load;
> +    case INDEX_op_ld32u_i64:
> +        c = 2;
> +        goto do_load;
> +    case INDEX_op_ld_i32:
> +    case INDEX_op_ld32s_i64:
> +        c = 2 | 4;
> +        goto do_load;
> +    case INDEX_op_ld_i64:
> +        c = 3;
> +    do_load:
> +        tcg_out_ld_sz(s, c, arg0, arg1, arg2);
> +        break;
> +

Minor nitpick: given that do_load contains a single line, I am not sure
it's actually better or more readable to stash one of its parameters in
a variable and do a goto.
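
I.e. simply (sketch):

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
        tcg_out_ld_sz(s, 0, arg0, arg1, arg2);
        break;
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
        tcg_out_ld_sz(s, 0 | 4, arg0, arg1, arg2);
        break;
    /* ...and likewise for the remaining sizes.  */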

> +    case INDEX_op_st8_i32:
> +    case INDEX_op_st8_i64:
> +        c = 0;
> +        goto do_store;
> +    case INDEX_op_st16_i32:
> +    case INDEX_op_st16_i64:
> +        c = 1;
> +        goto do_store;
> +    case INDEX_op_st_i32:
> +    case INDEX_op_st32_i64:
> +        c = 2;
> +        goto do_store;
> +    case INDEX_op_st_i64:
> +        c = 3;
> +    do_store:
> +        tcg_out_st_sz(s, c, arg0, arg1, arg2);
> +        break;
> +

Same there.

> +    case INDEX_op_sub_i32:
> +        if (const_args[2]) {
> +            arg2 = -arg2;
> +        } else {
> +            insn = INSN_SUBL;
> +            goto do_arith;
> +        }
> +        /* FALLTHRU */
> +
> +    case INDEX_op_add_i32:
> +        if (const_args[2]) {
> +            if ((int32_t)arg2 >= 0) {
> +                tcg_out_fmt_opi(s, INSN_ADDL, arg1, (int32_t)arg2, arg0);
> +            } else {
> +                tcg_out_fmt_opi(s, INSN_SUBL, arg1, -(int32_t)arg2, arg0);
> +            }
> +        } else {
> +            insn = INSN_ADDL;
> +            goto do_arith;
> +        }
> +        break;
> +
> +    case INDEX_op_sub_i64:
> +        if (const_args[2]) {
> +            arg2 = -arg2;
> +        } else {
> +            insn = INSN_SUBQ;
> +            goto do_arith;
> +        }
> +        /* FALLTHRU */
> +
> +    case INDEX_op_add_i64:
> +        if (const_args[2]) {
> +            tcg_out_mem_long(s, INSN_LDA, arg0, arg1, arg2);
> +        } else {
> +            insn = INSN_ADDQ;
> +            goto do_arith;
> +        }
> +        break;
> +
> +    case INDEX_op_mul_i32:
> +        insn = INSN_MULL;
> +        goto do_arith;
> +
> +    case INDEX_op_mul_i64:
> +        insn = INSN_MULQ;
> +        goto do_arith;
> +
> +    case INDEX_op_and_i32:
> +    case INDEX_op_and_i64:
> +        if (const_args[2]) {
> +            if (opc == INDEX_op_and_i32) {
> +                arg2 = (int32_t)arg2;
> +            }
> +            tgen_andi(s, arg1, arg2, arg0);
> +            break;
> +        }
> +        insn = INSN_AND;
> +        goto do_arith;
> +
> +    case INDEX_op_andc_i32:
> +    case INDEX_op_andc_i64:
> +        if (const_args[2]) {
> +            if (opc == INDEX_op_andc_i32) {
> +                arg2 = (int32_t)arg2;
> +            }
> +            tgen_andi(s, arg1, ~arg2, arg0);
> +            break;
> +        }
> +        insn = INSN_BIC;
> +        goto do_arith;
> +
> +    case INDEX_op_or_i32:
> +    case INDEX_op_or_i64:
> +        insn = INSN_BIS;
> +        goto do_arith;
> +
> +    case INDEX_op_orc_i32:
> +    case INDEX_op_orc_i64:
> +        insn = INSN_ORNOT;
> +        goto do_arith;
> +
> +    case INDEX_op_xor_i32:
> +    case INDEX_op_xor_i64:
> +        insn = INSN_XOR;
> +        goto do_arith;
> +
> +    case INDEX_op_eqv_i32:
> +    case INDEX_op_eqv_i64:
> +        insn = INSN_EQV;
> +        goto do_arith;
> +
> +    case INDEX_op_shl_i32:
> +        /* Make sure to preserve the sign-extension in the result.
> +           Thus the special casing of shifts by 1, 2 and 3.  */
> +        if (const_args[2]) {
> +            arg2 &= 31;
> +            switch (arg2) {
> +            case 0:
> +                tcg_out_mov(s, TCG_TYPE_I32, arg0, arg1);
> +                break;
> +            case 1:
> +                tcg_out_fmt_opr(s, INSN_ADDL, arg1, arg1, arg0);
> +                break;
> +            case 2:
> +                tcg_out_fmt_opr(s, INSN_S4ADDL, arg1, TCG_REG_ZERO, arg0);
> +                break;
> +            case 3:
> +                tcg_out_fmt_opr(s, INSN_S8ADDL, arg1, TCG_REG_ZERO, arg0);
> +                break;
> +            default:
> +                tcg_out_fmt_opi(s, INSN_SLL, arg1, arg2, arg0);
> +                tgen_ext32s(s, arg0, arg0);
> +                break;
> +            }
> +        } else {
> +            /* ??? TCG has no requirement to truncate the shift yet.  */
> +            tcg_out_fmt_opr(s, INSN_SLL, arg1, arg2, arg0);
> +            tgen_ext32s(s, arg0, arg0);
> +        }
> +        break;
> +
> +    case INDEX_op_shl_i64:
> +        insn = INSN_SLL;
> +        goto do_arith;
> +
> +    case INDEX_op_shr_i32:
> +        /* Recall that the input is sign-extended, which means that we
> +           need to mask the high bits that we'll be shifting in.  There
> +           are three common cases that can perform the shift+mask in
> +           one instruction.  Otherwise, we'll need a separate mask.  */
> +        if (const_args[2]) {
> +            arg2 &= 31;
> +            switch (arg2) {
> +            case 0:
> +                tcg_out_mov(s, TCG_TYPE_I32, arg0, arg1);
> +                break;
> +            case 8:
> +                tcg_out_fmt_opi(s, INSN_INSLH, arg1, 7, arg0);
> +                break;
> +            case 16:
> +                tcg_out_fmt_opi(s, INSN_EXTWL, arg1, 2, arg0);
> +                break;
> +            case 24:
> +                tcg_out_fmt_opi(s, INSN_EXTBL, arg1, 3, arg0);
> +                break;
> +            case 25 ... 31:
> +                tcg_out_fmt_opi(s, INSN_SRL, arg1, arg2, arg0);
> +                tcg_out_fmt_opi(s, INSN_AND, arg0,
> +                                (1 << (32 - arg2)) - 1, arg0);
> +                break;
> +            default:
> +                tgen_ext32u(s, arg1, arg0);
> +                tcg_out_fmt_opi(s, INSN_SRL, arg0, arg2, arg0);
> +                break;
> +            }
> +        } else {
> +            /* Here we need to be careful about a shift of zero,
> +               for which we'd need to re-sign-extend the output.  */
> +            tgen_ext32u(s, arg1, TMP_REG1);
> +            tcg_out_fmt_opr(s, INSN_SRL, TMP_REG1, arg2, arg0);
> +            tgen_ext32s(s, arg0, arg0);
> +        }
> +        break;
> +
> +    case INDEX_op_shr_i64:
> +        insn = INSN_SRL;
> +        goto do_arith;
> +
> +    case INDEX_op_sar_i32:
> +        /* Note that since the input is already sign-extended,
> +           we need not do anything special here.  */
> +    case INDEX_op_sar_i64:
> +        insn = INSN_SRA;
> +        goto do_arith;
> +
> +    do_arith:
> +        if (const_args[2]) {
> +            tcg_out_fmt_opi(s, insn, arg1, arg2, arg0);
> +        } else {
> +            tcg_out_fmt_opr(s, insn, arg1, arg2, arg0);
> +        }
> +        break;
> +
> +    case INDEX_op_not_i32:
> +    case INDEX_op_not_i64:
> +        if (const_args[1]) {
> +            tcg_out_fmt_opi(s, INSN_ORNOT, TCG_REG_ZERO, arg1, arg0);
> +        } else {
> +            tcg_out_fmt_opr(s, INSN_ORNOT, TCG_REG_ZERO, arg1, arg0);
> +        }
> +        break;

Do we really want to handle the constant case? This seems a bit overkill
as the constant propagation pass is able to handle it. Also it's better
coded as a movi.
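
I.e., if the constant case is kept at all, something like (sketch):

    if (const_args[1]) {
        /* NOT of a constant is just a move of the inverted constant.  */
        tcg_out_movi(s, opc == INDEX_op_not_i32 ? TCG_TYPE_I32 : TCG_TYPE_I64,
                     arg0, ~arg1);
        break;
    }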

> +    case INDEX_op_deposit_i32:
> +        tcg_out_deposit(s, arg0, arg1, arg2, args[3], args[4], 0);
> +        break;
> +    case INDEX_op_deposit_i64:
> +        tcg_out_deposit(s, arg0, arg1, arg2, args[3], args[4], 1);
> +        break;
> +
> +    case INDEX_op_brcond_i32:
> +    case INDEX_op_brcond_i64:
> +        tcg_out_brcond(s, arg2, arg0, arg1, const_args[1], args[3]);
> +        break;
> +
> +    case INDEX_op_setcond_i32:
> +    case INDEX_op_setcond_i64:
> +        tcg_out_setcond(s, args[3], arg0, arg1, arg2, const_args[2]);
> +        break;
> +
> +    case INDEX_op_movcond_i32:
> +    case INDEX_op_movcond_i64:
> +        tcg_out_movcond(s, args[5], arg0, arg1, arg2, const_args[2],
> +                        args[3], const_args[3]);
> +        break;
> +
> +    case INDEX_op_ext8s_i32:
> +    case INDEX_op_ext8s_i64:
> +        c = 0 | 4;
> +        goto do_sign_extend;
> +    case INDEX_op_ext16s_i32:
> +    case INDEX_op_ext16s_i64:
> +        c = 1 | 4;
> +        goto do_sign_extend;
> +    case INDEX_op_ext32s_i64:
> +        c = 2 | 4;
> +    do_sign_extend:
> +        tgen_extend(s, c, arg1, arg0);
> +        break;

Same comment as for do_load/do_store. I don't see what the goto brings
here. 

> +    case INDEX_op_div_i32:
> +        c = 2 | 4;
> +        goto do_div;
> +    case INDEX_op_rem_i32:
> +        c = 2 | 4 | 8;
> +        goto do_div;
> +    case INDEX_op_divu_i32:
> +        c = 2;
> +        goto do_div;
> +    case INDEX_op_remu_i32:
> +        c = 2 | 8;
> +        goto do_div;
> +    case INDEX_op_div_i64:
> +        c = 3 | 4;
> +        goto do_div;
> +    case INDEX_op_rem_i64:
> +        c = 3 | 4 | 8;
> +        goto do_div;
> +    case INDEX_op_divu_i64:
> +        c = 3;
> +        goto do_div;
> +    case INDEX_op_remu_i64:
> +        c = 3 | 8;
> +    do_div:
> +        tcg_out_div(s, c);
> +        break;

And same here.

> +    case INDEX_op_bswap16_i32:
> +    case INDEX_op_bswap16_i64:
> +        c = 1;
> +        goto do_bswap;
> +    case INDEX_op_bswap32_i32:
> +        c = 2 | 4;
> +        goto do_bswap;
> +    case INDEX_op_bswap32_i64:
> +        c = 2;
> +        goto do_bswap;
> +    case INDEX_op_bswap64_i64:
> +        c = 3;
> +    do_bswap:
> +        tgen_bswap(s, c, arg1, arg0);
> +        break;

And again here.

> +    case INDEX_op_qemu_ld8u:
> +        c = 0;
> +        goto do_qemu_load;
> +    case INDEX_op_qemu_ld8s:
> +        c = 0 | 4;
> +        goto do_qemu_load;
> +    case INDEX_op_qemu_ld16u:
> +        c = 1;
> +        goto do_qemu_load;
> +    case INDEX_op_qemu_ld16s:
> +        c = 1 | 4;
> +        goto do_qemu_load;
> +    case INDEX_op_qemu_ld32:
> +    case INDEX_op_qemu_ld32s:
> +        c = 2 | 4;
> +        goto do_qemu_load;
> +    case INDEX_op_qemu_ld32u:
> +        c = 2;
> +        goto do_qemu_load;
> +    case INDEX_op_qemu_ld64:
> +        c = 3;
> +    do_qemu_load:
> +        tcg_out_qemu_ld(s, args, c);
> +        break;

Ditto.

> +    case INDEX_op_qemu_st8:
> +        c = 0;
> +        goto do_qemu_store;
> +    case INDEX_op_qemu_st16:
> +        c = 1;
> +        goto do_qemu_store;
> +    case INDEX_op_qemu_st32:
> +        c = 2;
> +        goto do_qemu_store;
> +    case INDEX_op_qemu_st64:
> +        c = 3;
> +    do_qemu_store:
> +        tcg_out_qemu_st(s, args, c);
> +        break;

Ditto.

> +    case INDEX_op_mov_i32:
> +    case INDEX_op_mov_i64:
> +    case INDEX_op_movi_i32:
> +    case INDEX_op_movi_i64:
> +        /* These four are handled by tcg.c directly.  */
> +    default:
> +        tcg_abort();
> +    }
> +}
> +
> +static const TCGTargetOpDef alpha_op_defs[] = {
> +    { INDEX_op_exit_tb,         { } },
> +    { INDEX_op_goto_tb,         { } },
> +    { INDEX_op_call,            { "ci" } },
> +    { INDEX_op_jmp,             { "r" } },
> +    { INDEX_op_br,              { } },
> +
> +    { INDEX_op_mov_i32,         { "r", "r" } },
> +    { INDEX_op_movi_i32,        { "r" } },
> +
> +    { INDEX_op_ld8u_i32,        { "r", "r" } },
> +    { INDEX_op_ld8s_i32,        { "r", "r" } },
> +    { INDEX_op_ld16u_i32,       { "r", "r" } },
> +    { INDEX_op_ld16s_i32,       { "r", "r" } },
> +    { INDEX_op_ld_i32,          { "r", "r" } },
> +    { INDEX_op_st8_i32,         { "rJ", "r" } },
> +    { INDEX_op_st16_i32,        { "rJ", "r" } },
> +    { INDEX_op_st_i32,          { "rJ", "r" } },
> +
> +    { INDEX_op_add_i32,         { "r", "rJ", "rK" } },
> +    { INDEX_op_mul_i32,         { "r", "rJ", "rI" } },
> +    { INDEX_op_sub_i32,         { "r", "rJ", "rK" } },
> +    { INDEX_op_and_i32,         { "r", "rJ", "rM" } },
> +    { INDEX_op_or_i32,          { "r", "rJ", "rI" } },
> +    { INDEX_op_xor_i32,         { "r", "rJ", "rI" } },
> +    { INDEX_op_andc_i32,        { "r", "rJ", "rM" } },
> +    { INDEX_op_orc_i32,         { "r", "rJ", "rI" } },
> +    { INDEX_op_eqv_i32,         { "r", "rJ", "rI" } },
> +    { INDEX_op_not_i32,         { "r", "rI" } },
> +
> +    { INDEX_op_shl_i32,         { "r", "rJ", "rI" } },
> +    { INDEX_op_shr_i32,         { "r", "rJ", "rI" } },
> +    { INDEX_op_sar_i32,         { "r", "rJ", "rI" } },
> +
> +    { INDEX_op_deposit_i32,     { "r", "rJ", "rJ" } },
> +
> +    { INDEX_op_div_i32,         { "c", "a", "b" } },
> +    { INDEX_op_rem_i32,         { "c", "a", "b" } },
> +    { INDEX_op_divu_i32,        { "c", "a", "b" } },
> +    { INDEX_op_remu_i32,        { "c", "a", "b" } },
> +
> +    { INDEX_op_brcond_i32,      { "rJ", "rI" } },
> +    { INDEX_op_setcond_i32,     { "r", "rJ", "rI" } },
> +    { INDEX_op_movcond_i32,     { "r", "rJ", "rI", "rI", "0" } },
> +
> +    { INDEX_op_mov_i64,         { "r", "r" } },
> +    { INDEX_op_movi_i64,        { "r" } },
> +
> +    { INDEX_op_ld8u_i64,        { "r", "r" } },
> +    { INDEX_op_ld8s_i64,        { "r", "r" } },
> +    { INDEX_op_ld16u_i64,       { "r", "r" } },
> +    { INDEX_op_ld16s_i64,       { "r", "r" } },
> +    { INDEX_op_ld32u_i64,       { "r", "r" } },
> +    { INDEX_op_ld32s_i64,       { "r", "r" } },
> +    { INDEX_op_ld_i64,          { "r", "r" } },
> +    { INDEX_op_st8_i64,         { "rJ", "r" } },
> +    { INDEX_op_st16_i64,        { "rJ", "r" } },
> +    { INDEX_op_st32_i64,        { "rJ", "r" } },
> +    { INDEX_op_st_i64,          { "rJ", "r" } },
> +
> +    { INDEX_op_add_i64,         { "r", "rJ", "ri" } },
> +    { INDEX_op_mul_i64,         { "r", "rJ", "rI" } },
> +    { INDEX_op_sub_i64,         { "r", "rJ", "ri" } },
> +    { INDEX_op_and_i64,         { "r", "rJ", "rM" } },
> +    { INDEX_op_or_i64,          { "r", "rJ", "rI" } },
> +    { INDEX_op_xor_i64,         { "r", "rJ", "rI" } },
> +    { INDEX_op_andc_i64,        { "r", "rJ", "rM" } },
> +    { INDEX_op_orc_i64,         { "r", "rJ", "rI" } },
> +    { INDEX_op_eqv_i64,         { "r", "rJ", "rI" } },
> +    { INDEX_op_not_i64,         { "r", "rI" } },
> +
> +    { INDEX_op_shl_i64,         { "r", "rJ", "rI" } },
> +    { INDEX_op_shr_i64,         { "r", "rJ", "rI" } },
> +    { INDEX_op_sar_i64,         { "r", "rJ", "rI" } },
> +
> +    { INDEX_op_deposit_i64,     { "r", "rJ", "rJ" } },
> +
> +    { INDEX_op_div_i64,         { "c", "a", "b" } },
> +    { INDEX_op_rem_i64,         { "c", "a", "b" } },
> +    { INDEX_op_divu_i64,        { "c", "a", "b" } },
> +    { INDEX_op_remu_i64,        { "c", "a", "b" } },
> +
> +    { INDEX_op_brcond_i64,      { "rJ", "rI" } },
> +    { INDEX_op_setcond_i64,     { "r", "rJ", "rI" } },
> +    { INDEX_op_movcond_i64,     { "r", "rJ", "rI", "rI", "0" } },
> +
> +    { INDEX_op_ext8s_i32,       { "r", "rJ" } },
> +    { INDEX_op_ext16s_i32,      { "r", "rJ" } },
> +    { INDEX_op_ext8s_i64,       { "r", "rJ" } },
> +    { INDEX_op_ext16s_i64,      { "r", "rJ" } },
> +    { INDEX_op_ext32s_i64,      { "r", "rJ" } },
> +
> +    { INDEX_op_bswap16_i32,     { "r", "rJ" } },
> +    { INDEX_op_bswap32_i32,     { "r", "rJ" } },
> +    { INDEX_op_bswap16_i64,     { "r", "rJ" } },
> +    { INDEX_op_bswap32_i64,     { "r", "rJ" } },
> +    { INDEX_op_bswap64_i64,     { "r", "rJ" } },
> +
> +    { INDEX_op_qemu_ld8u,       { "r", "L" } },
> +    { INDEX_op_qemu_ld8s,       { "r", "L" } },
> +    { INDEX_op_qemu_ld16u,      { "r", "L" } },
> +    { INDEX_op_qemu_ld16s,      { "r", "L" } },
> +    { INDEX_op_qemu_ld32,       { "r", "L" } },
> +    { INDEX_op_qemu_ld32u,      { "r", "L" } },
> +    { INDEX_op_qemu_ld32s,      { "r", "L" } },
> +    { INDEX_op_qemu_ld64,       { "r", "L" } },
> +
> +    { INDEX_op_qemu_st8,        { "L", "L" } },
> +    { INDEX_op_qemu_st16,       { "L", "L" } },
> +    { INDEX_op_qemu_st32,       { "L", "L" } },
> +    { INDEX_op_qemu_st64,       { "L", "L" } },
> +    { -1 },
> +};
> +
> +
> +/*
> + * Generate global QEMU prologue and epilogue code
> + */
> +void tcg_target_qemu_prologue(TCGContext *s)
> +{
> +    static const TCGReg save_regs[] = {
> +        TCG_REG_RA,
> +        TCG_REG_S0,
> +        TCG_REG_S1,
> +        TCG_REG_S2,
> +        TCG_REG_S3,
> +        TCG_REG_S4,
> +        /* TCG_REG_S5 -- currently used for the global env.  */
> +        TCG_REG_S6,
> +    };
> +
> +    long i, frame_size, save_ofs;
> +    uint8_t *ret_loc, *ent_loc;
> +
> +    /* The shape of the stack frame is:
> +       input sp
> +         [ Register save area ]
> +         [ TB return address ]
> +         [ CPU_TEMP_BUF_NLONGS ]
> +         [ TCG_STATIC_CALL_ARGS_SIZE ]
> +       sp
> +    */
> +
> +    save_ofs = TB_RET_OFS + 8;
> +    frame_size = save_ofs + ARRAY_SIZE(save_regs) * 8;
> +    frame_size += TCG_TARGET_STACK_ALIGN - 1;
> +    frame_size &= -TCG_TARGET_STACK_ALIGN;
> +
> +    tcg_set_frame(s, TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE,
> +                  CPU_TEMP_BUF_NLONGS * sizeof(long));
> +
> +    /* TB Prologue.  */
> +    ent_loc = s->code_ptr;
> +
> +    /* Allocate the stack frame.  */
> +    tcg_out_fmt_mem(s, INSN_LDA, TCG_REG_SP, TCG_REG_SP, -frame_size);
> +
> +    /* Save all callee saved registers.  */
> +    for (i = 0; i < ARRAY_SIZE(save_regs); i++) {
> +        tcg_out_fmt_mem(s, INSN_STQ, save_regs[i], TCG_REG_SP, save_ofs + i*8);
> +    }
> +
> +    /* Store the return address of the TB.  */
> +    ret_loc = s->code_ptr;
> +    tcg_out_fmt_mem(s, INSN_LDA, TMP_REG1, TCG_REG_PV, 0);
> +    tcg_out_fmt_mem(s, INSN_STQ, TMP_REG1, TCG_REG_SP, TB_RET_OFS);
> +
> +    /* Copy the ENV pointer into place.  */
> +    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_A0);
> +
> +    /* Setup TCG_GUEST_BASE_REG if desired.  */
> +    if (USE_GUEST_BASE_REG) {
> +        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
> +        tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
> +    }
> +
> +    /* Invoke the TB.  */
> +    tcg_out_fmt_jmp(s, INSN_JSR, TCG_REG_ZERO, TCG_REG_A1, 0);
> +
> +    /* Fill in the offset for the TB return address, as described above.  */
> +    i = s->code_ptr - ent_loc;
> +    assert(i == (int16_t)i);
> +    *(int16_t *)ret_loc = i;
> +
> +    /* TB epilogue. */
> +
> +    /* Restore all callee saved registers.  */
> +    for (i = 0; i < ARRAY_SIZE(save_regs); i++) {
> +        tcg_out_fmt_mem(s, INSN_LDQ, save_regs[i], TCG_REG_SP, save_ofs + i*8);
> +    }
> +
> +    /* Deallocate the stack frame.  */
> +    tcg_out_fmt_mem(s, INSN_LDA, TCG_REG_SP, TCG_REG_SP, frame_size);
> +
> +    tcg_out_fmt_jmp(s, INSN_RET, TCG_REG_ZERO, TCG_REG_RA, 0);
> +}
> +
> +
> +void tcg_target_init(TCGContext *s)
> +{
> +#if !defined(CONFIG_USER_ONLY)
> +    /* fail safe */
> +    assert((1 << CPU_TLB_ENTRY_BITS) == sizeof(CPUTLBEntry));
> +#endif
> +
> +    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
> +    tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffffffff);
> +
> +    tcg_regset_set32(tcg_target_call_clobber_regs, 0, 0xffffffff);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S0);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S1);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S2);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S3);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S4);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S5);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S6);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_GP);
> +    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_SP);
> +
> +    tcg_regset_clear(s->reserved_regs);
> +    tcg_regset_set_reg(s->reserved_regs, TCG_REG_GP);
> +    tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
> +    tcg_regset_set_reg(s->reserved_regs, TCG_REG_ZERO);
> +    tcg_regset_set_reg(s->reserved_regs, TMP_REG1);
> +    tcg_regset_set_reg(s->reserved_regs, TMP_REG2);
> +
> +    tcg_add_target_add_op_defs(alpha_op_defs);
> +}
> +
> +void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
> +{
> +    long disp, hi, lo, insn1, insn2;
> +
> +    /* Try a direct branch first.  */
> +    disp = addr - (jmp_addr + 4);
> +    if (disp >= -0x400000 && disp < 0x400000) {
> +        insn1 = INSN_BR | INSN_RA(TCG_REG_ZERO) | INSN_DISP21(disp >> 2);
> +        /* The second insn is dead code, but don't leave the memory totally
> +           uninitialized.  If the garbage is an illegal insn the prefetch
> +           unit can flush the pipeline in order to prove the illegal insn
> +           isn't executed.  */
> +        insn2 = INSN_NOP;
> +    } else {
> +        /* Failing that, do an ldah+lda pair to make the distance.
> +           Given that the code buffer is limited to 2G, this should
> +           always reach.  */
> +        disp = addr - jmp_addr;
> +        lo = (int16_t)disp;
> +        hi = (int16_t)((disp - lo) >> 16);
> +        assert((hi << 16) + lo == disp);
> +        insn1 = INSN_LDAH | INSN_RA(TMP_REG1)
> +                | INSN_RB(TMP_REG1) | INSN_DISP16(hi);
> +        insn2 = INSN_LDA | INSN_RA(TMP_REG1)
> +                | INSN_RB(TMP_REG1) | INSN_DISP16(lo);
> +    }
> +    *(uint64_t *)jmp_addr = insn1 + (insn2 << 32);
> +
> +    flush_icache_range(jmp_addr, jmp_addr + 8);
> +}
> diff --git a/tcg/alpha/tcg-target.h b/tcg/alpha/tcg-target.h
> new file mode 100644
> index 0000000..3611687
> --- /dev/null
> +++ b/tcg/alpha/tcg-target.h
> @@ -0,0 +1,142 @@
> +/*
> + * Tiny Code Generator for QEMU
> + *
> + * Permission is hereby granted, free of charge, to any person
> + * obtaining a copy of this software and associated documentation
> + * files (the "Software"), to deal in the Software without
> + * restriction, including without limitation the rights to use, copy,
> + * modify, merge, publish, distribute, sublicense, and/or sell copies
> + * of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> + * included in all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +#define TCG_TARGET_ALPHA 1
> +
> +#define TCG_TARGET_NB_REGS 32
> +
> +/* Having the zero register ($31) == 0 within TCG simplifies a few things.
> +   Thus we have a mapping between TCG regno and hardware regno.  */
> +#define HW_TO_TCG_REGNO(x)      ((x) ^ 0x1f)
> +#define TCG_TO_HW_REGNO(x)      ((x) ^ 0x1f)
> +
> +typedef enum TCGReg {
> +    TCG_REG_V0 = HW_TO_TCG_REGNO(0),
> +
> +    TCG_REG_T0 = HW_TO_TCG_REGNO(1),
> +    TCG_REG_T1 = HW_TO_TCG_REGNO(2),
> +    TCG_REG_T2 = HW_TO_TCG_REGNO(3),
> +    TCG_REG_T3 = HW_TO_TCG_REGNO(4),
> +    TCG_REG_T4 = HW_TO_TCG_REGNO(5),
> +    TCG_REG_T5 = HW_TO_TCG_REGNO(6),
> +    TCG_REG_T6 = HW_TO_TCG_REGNO(7),
> +    TCG_REG_T7 = HW_TO_TCG_REGNO(8),
> +
> +    TCG_REG_S0 = HW_TO_TCG_REGNO(9),
> +    TCG_REG_S1 = HW_TO_TCG_REGNO(10),
> +    TCG_REG_S2 = HW_TO_TCG_REGNO(11),
> +    TCG_REG_S3 = HW_TO_TCG_REGNO(12),
> +    TCG_REG_S4 = HW_TO_TCG_REGNO(13),
> +    TCG_REG_S5 = HW_TO_TCG_REGNO(14),
> +    TCG_REG_S6 = HW_TO_TCG_REGNO(15),
> +
> +    TCG_REG_A0 = HW_TO_TCG_REGNO(16),
> +    TCG_REG_A1 = HW_TO_TCG_REGNO(17),
> +    TCG_REG_A2 = HW_TO_TCG_REGNO(18),
> +    TCG_REG_A3 = HW_TO_TCG_REGNO(19),
> +    TCG_REG_A4 = HW_TO_TCG_REGNO(20),
> +    TCG_REG_A5 = HW_TO_TCG_REGNO(21),
> +
> +    TCG_REG_T8 = HW_TO_TCG_REGNO(22),
> +    TCG_REG_T9 = HW_TO_TCG_REGNO(23),
> +    TCG_REG_T10 = HW_TO_TCG_REGNO(24),
> +    TCG_REG_T11 = HW_TO_TCG_REGNO(25),
> +
> +    TCG_REG_RA = HW_TO_TCG_REGNO(26),
> +    TCG_REG_PV = HW_TO_TCG_REGNO(27),
> +    TCG_REG_AT = HW_TO_TCG_REGNO(28),
> +    TCG_REG_GP = HW_TO_TCG_REGNO(29),
> +    TCG_REG_SP = HW_TO_TCG_REGNO(30),
> +
> +    TCG_REG_ZERO = HW_TO_TCG_REGNO(31)
> +} TCGReg;
> +
> +/* Used for function call generation.  */
> +#define TCG_REG_CALL_STACK TCG_REG_SP
> +#define TCG_TARGET_STACK_ALIGN 16
> +#define TCG_TARGET_CALL_STACK_OFFSET 0
> +
> +/* We have signed extension instructions.  */
> +#define TCG_TARGET_HAS_ext8s_i32        1
> +#define TCG_TARGET_HAS_ext16s_i32       1
> +#define TCG_TARGET_HAS_ext8s_i64        1
> +#define TCG_TARGET_HAS_ext16s_i64       1
> +#define TCG_TARGET_HAS_ext32s_i64       1
> +
> +/* We have single-output division routines.  */
> +#define TCG_TARGET_HAS_div_i32          1
> +#define TCG_TARGET_HAS_div_i64          1
> +
> +/* We have conditional move.  */
> +#define TCG_TARGET_HAS_movcond_i32      1
> +#define TCG_TARGET_HAS_movcond_i64      1
> +
> +/* We have optimized bswap routines.  */
> +#define TCG_TARGET_HAS_bswap16_i32      1
> +#define TCG_TARGET_HAS_bswap32_i32      1
> +#define TCG_TARGET_HAS_bswap16_i64      1
> +#define TCG_TARGET_HAS_bswap32_i64      1
> +#define TCG_TARGET_HAS_bswap64_i64      1
> +
> +/* We have NOT via ORNOT.  */
> +#define TCG_TARGET_HAS_not_i32          1
> +#define TCG_TARGET_HAS_not_i64          1
> +
> +/* We have some compound logical instructions.  */
> +#define TCG_TARGET_HAS_andc_i32         1
> +#define TCG_TARGET_HAS_andc_i64         1
> +#define TCG_TARGET_HAS_orc_i32          1
> +#define TCG_TARGET_HAS_orc_i64          1
> +#define TCG_TARGET_HAS_eqv_i32          1
> +#define TCG_TARGET_HAS_eqv_i64          1
> +#define TCG_TARGET_HAS_nand_i32         0
> +#define TCG_TARGET_HAS_nand_i64         0
> +#define TCG_TARGET_HAS_nor_i32          0
> +#define TCG_TARGET_HAS_nor_i64          0
> +
> +/* We can do better for specific cases of deposit.  */
> +#define TCG_TARGET_HAS_deposit_i32      1
> +#define TCG_TARGET_HAS_deposit_i64      1
> +
> +#define TCG_TARGET_deposit_i32_valid(ofs, len) \
> +  (((ofs) & 7) == 0 && ((len) == 8 || (len) == 16 || (len) == 32))
> +
> +/* The default implementations of these are fine.  */
> +#define TCG_TARGET_HAS_neg_i32          0
> +#define TCG_TARGET_HAS_neg_i64          0
> +#define TCG_TARGET_HAS_ext8u_i32        0
> +#define TCG_TARGET_HAS_ext16u_i32       0
> +#define TCG_TARGET_HAS_ext8u_i64        0
> +#define TCG_TARGET_HAS_ext16u_i64       0
> +#define TCG_TARGET_HAS_ext32u_i64       0
> +#define TCG_TARGET_HAS_rot_i32          0
> +#define TCG_TARGET_HAS_rot_i64          0
> +
> +#define TCG_TARGET_HAS_GUEST_BASE
> +
> +#define TCG_AREG0 TCG_REG_S6
> +
> +static inline void flush_icache_range(unsigned long start, unsigned long stop)
> +{
> +    __asm__ __volatile__ ("call_pal 0x86");
> +}

Is that really the only way to do it, i.e. flushing the whole icache rather
than just the affected range? If so, direct jumps might end up more
expensive than the default load-register + jump option, sketched below.
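
For comparison, the usual indirect scheme looks roughly like this (only a
sketch: the tcg_out_movi/tcg_out_fmt_* calls mimic the ones in the patch,
while TMP_REG1 as scratch and the s->tb_next / tb_next_offset fields from
the generic TCG code are my assumptions, not something this patch defines):

    case INDEX_op_goto_tb:
        /* Indirect jump: keep the chained TB address in a per-TB slot
           and jump through a register.  Re-targeting the TB then only
           rewrites a 64-bit pointer and needs no icache flush.  */
        tcg_out_movi(s, TCG_TYPE_PTR, TMP_REG1,
                     (tcg_target_long)(s->tb_next + args[0]));
        tcg_out_fmt_mem(s, INSN_LDQ, TMP_REG1, TMP_REG1, 0);
        tcg_out_fmt_jmp(s, INSN_JSR, TCG_REG_ZERO, TMP_REG1, 0);
        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
        break;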

> -- 
> 1.7.11.4
> 
> 
> 

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
address@hidden                 http://www.aurel32.net


