From: Richard Henderson
Subject: Re: [Qemu-devel] [PATCH 08/10] target-avr: adding instruction translation
Date: Sat, 4 Jun 2016 20:27:48 -0700
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.1.0

On 06/02/2016 01:07 PM, Michael Rolnik wrote:
Signed-off-by: Michael Rolnik <address@hidden>
---
 target-avr/translate-inst.c | 2443 +++++++++++++++++++++++++++++++++++++++++++

Is there a reason this code isn't going into translate.c?
You wouldn't need the declarations in translate-inst.h or translate.h.

+/*
+    NOTE:   all registers are assumed to hold 8 bit values.
+            so all operations done on registers should preseve this property
+*/
+
+/*
+    NOTE:   the flags C,H,V,N,V have either 0 or 1 values
+    NOTE:   the flag Z has inverse logic, when the value of Zf is 0 the flag 
is assumed to be set, non zero - not set
+*/

This documentation belongs with the declarations in cpu.h.

+void    gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr);
+void    gen_add_Vf(TCGv R, TCGv Rd, TCGv Rr);
+void    gen_sub_CHf(TCGv R, TCGv Rd, TCGv Rr);
+void    gen_sub_Vf(TCGv R, TCGv Rd, TCGv Rr);
+void    gen_ZNSf(TCGv R);
+void    gen_push_ret(CPUAVRState *env, int    ret);
+void    gen_pop_ret(CPUAVRState *env, TCGv    ret);
+void    gen_jmp_ez(void);
+void    gen_jmp_z(void);
+
+void    gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv l); /*  H:M:L   = addr  */
+void    gen_set_xaddr(TCGv addr);
+void    gen_set_yaddr(TCGv addr);
+void    gen_set_zaddr(TCGv addr);
+
+TCGv    gen_get_addr(TCGv H, TCGv M, TCGv L);            /*  addr = H:M:L    */
+TCGv    gen_get_xaddr(void);
+TCGv    gen_get_yaddr(void);
+TCGv    gen_get_zaddr(void);
+int     sex(int Imm, unsigned bits);

Order these functions properly and you don't need forward declarations.

+
+void    gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr)
+{
+    TCGv    t1      = tcg_temp_new_i32();
+    TCGv    t2      = tcg_temp_new_i32();
+    TCGv    t3      = tcg_temp_new_i32();
+
+    tcg_gen_and_tl(t1, Rd, Rr);    /*  t1  = Rd & Rr  */
+    tcg_gen_not_tl(t2, R);             /*  t2  = Rd & ~R  */
+    tcg_gen_and_tl(t2, Rd, t2);

tcg_gen_andc_tl.

+    tcg_gen_not_tl(t3, R);             /*  t3  = Rr  *~R  */
+    tcg_gen_and_tl(t3, Rr, t3);

Likewise.

+    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = t1 | t2 | t3  */
+    tcg_gen_or_tl(t1, t1, t3);

While this is exactly the formula in the manual, it's also equal to

    ((Rd ^ Rr) ^ R) & 16

where we examine the difference between the non-carry addition (Rd ^ Rr) and the carry addition (R) to find the carry out of bit 3. This reduces the operation count to 2 from 5.
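
For example (untested sketch, reusing the patch's t1 temporary):

    tcg_gen_xor_tl(t1, Rd, Rr);         /* the carry-less sum */
    tcg_gen_xor_tl(t1, t1, R);          /* xor with the real sum exposes the carry chain */

Bit 4 of t1 is then the carry out of bit 3 (Hf), and Cf is bit 8 of R as noted below.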

+    tcg_gen_shri_tl(cpu_Cf, t1, 7);     /*  Cf  = t1(7)  */

Of course, Cf is also bit 8 of R, at least before you masked that off.

+    tcg_gen_shri_tl(cpu_Hf, t1, 3);     /*  Hf  = t1(3)  */
+    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);

Consider leaving Hf in bit 3 (or 4, as above) instead of shifting and masking to bit 0. This makes it (slightly) more complicated to read the full SREG value, but it saves two instructions per arithmetic instruction. This will add up quickly.

One can make the same argument for most of the other flags.

Compare how this is handled in target-arm for cpu_[NZCV]F.

+void    gen_add_Vf(TCGv    R, TCGv    Rd, TCGv    Rr)
+{
+    TCGv    t1      = tcg_temp_new_i32();
+    TCGv    t2      = tcg_temp_new_i32();
+
+    tcg_gen_not_tl(t1, Rd);            /*  t1  = ~Rd & ~Rr & R  */
+    tcg_gen_not_tl(t2, Rr);
+    tcg_gen_and_tl(t1, t1, t2);
+    tcg_gen_and_tl(t1, t1, R);
+
+    tcg_gen_not_tl(t2, R);             /*  t2  = Rd & Rr & ~R  */
+    tcg_gen_and_tl(t2, t2, Rd);
+    tcg_gen_and_tl(t2, t2, Rr);
+
+    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = Rd & Rr & ~R | ~Rd & ~Rr & R  */

While this is indeed the formula in the manual, a better expression is

  (R ^ Rd) & ~(Rd ^ Rr)

which is 3 operations instead of 5. As alluded above, it might be best to keep Vf in bit 7 instead of bit 0.

Also note that if you use bit 4 to compute Hf, as above, then one can share a sub-expression with the computation of Vf.
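
In TCG that's something like (untested):

    tcg_gen_xor_tl(t1, R, Rd);          /* R ^ Rd */
    tcg_gen_xor_tl(t2, Rd, Rr);         /* Rd ^ Rr */
    tcg_gen_andc_tl(t1, t1, t2);        /* Vf, still in bit 7 */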

+void    gen_sub_CHf(TCGv    R, TCGv    Rd, TCGv    Rr)
+{
+    TCGv    t1      = tcg_temp_new_i32();
+    TCGv    t2      = tcg_temp_new_i32();
+    TCGv    t3      = tcg_temp_new_i32();
+
+    /*  Cf & Hf  */
+    tcg_gen_not_tl(t1, Rd);            /*  t1  = ~Rd  */
+    tcg_gen_and_tl(t2, t1, Rr);    /*  t2  = ~Rd & Rr  */
+    tcg_gen_or_tl(t3, t1, Rr);    /*  t3  = (~Rd | Rr) & R  */
+    tcg_gen_and_tl(t3, t3, R);
+    tcg_gen_or_tl(t2, t2, t3);    /*  t2  = ~Rd & Rr | ~Rd & R | R & Rr  */
+    tcg_gen_shri_tl(cpu_Cf, t2, 7);     /*  Cf  = t2(7)  */
+    tcg_gen_shri_tl(cpu_Hf, t2, 3);     /*  Hf  = t2(3)  */
+    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);

Note that carry and borrow are related, and thus this is *also* computable via ((Rd ^ Rr) ^ R) on bit 4.

+void    gen_sub_Vf(TCGv    R, TCGv    Rd, TCGv    Rr)
+{
+    TCGv    t1      = tcg_temp_new_i32();
+    TCGv    t2      = tcg_temp_new_i32();
+
+    /*  Vf  */
+    tcg_gen_and_tl(t1, Rr, R);     /*  t1  = Rd & ~Rr & ~R  */
+    tcg_gen_not_tl(t1, t1);
+    tcg_gen_and_tl(t1, t1, Rd);
+    tcg_gen_not_tl(t2, Rd);            /*  t2  = ~Rd & Rr & R  */
+    tcg_gen_and_tl(t2, t2, Rr);
+    tcg_gen_and_tl(t2, t2, R);
+    tcg_gen_or_tl(t1, t1, t2);    /*  t1  = Rd & ~Rr & ~R | ~Rd & Rr & R  */
+    tcg_gen_shri_tl(cpu_Vf, t1, 7);     /*  Vf  = t1(7)  */

This is equal to

  (R ^ Rd) & (Rd ^ Rr)

+void    gen_ZNSf(TCGv    R)
+{
+    tcg_gen_mov_tl(cpu_Zf, R);             /*  Zf  = R  */
+    tcg_gen_shri_tl(cpu_Nf, R, 7);     /*  Nf  = R(7)  */
+    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */

This should be xor.
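
i.e.

    tcg_gen_xor_tl(cpu_Sf, cpu_Nf, cpu_Vf);    /* Sf = Nf ^ Vf */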

+void    gen_push_ret(CPUAVRState *env, int    ret)
+{
+    tcg_gen_qemu_st8(tcg_const_local_i32((ret & 0x0000ff)), cpu_sp, 
DATA_INDEX);

You don't need a local constant, and you should be freeing these 3 temps.

I'll also note that the piece-wise store is big-endian, so you could perform this in 1 store for 2_BYTE and 2 stores for 3_BYTE.

+void    gen_jmp_ez()
+{
+    tcg_gen_mov_tl(cpu_pc, cpu_eind);
+    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[31]);
+    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
+    tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffffff);
+    tcg_gen_exit_tb(0);
+}
+
+void    gen_jmp_z()
+{
+    tcg_gen_mov_tl(cpu_pc, cpu_r[31]);
+    tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+    tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
+    tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffff);

Note this is

    tcg_gen_deposit_tl(cpu_pc, cpu_r[30], cpu_r[31], 8, 8);

Also consider storing EIND (perhaps already shifted) so that you can just OR it in.

+void    gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv L)
+{
+    tcg_gen_andi_tl(L, addr, 0xff);
+
+    tcg_gen_shri_tl(addr, addr, 8);
+    tcg_gen_andi_tl(M, addr, 0xff);
+
+    tcg_gen_shri_tl(addr, addr, 8);
+    tcg_gen_andi_tl(H, addr, 0xff);
+}
+
+void    gen_set_xaddr(TCGv addr)
+{
+    gen_set_addr(addr, cpu_rampX, cpu_r[27], cpu_r[26]);
+}
+
+void    gen_set_yaddr(TCGv addr)
+{
+    gen_set_addr(addr, cpu_rampY, cpu_r[29], cpu_r[28]);
+}
+
+void    gen_set_zaddr(TCGv addr)
+{
+    gen_set_addr(addr, cpu_rampZ, cpu_r[31], cpu_r[30]);
+}
+
+TCGv    gen_get_addr(TCGv H, TCGv M, TCGv L)
+{
+    TCGv    addr= tcg_temp_new_i32();
+
+    tcg_gen_mov_tl(addr, H);                     /*  addr    = H:M:L  */
+    tcg_gen_shli_tl(addr, addr, 8);
+    tcg_gen_or_tl(addr, addr, M);
+    tcg_gen_shli_tl(addr, addr, 8);
+    tcg_gen_or_tl(addr, addr, L);
+
+    return  addr;
+}
+
+TCGv    gen_get_xaddr()
+{
+    return  gen_get_addr(cpu_rampX, cpu_r[27], cpu_r[26]);
+}
+
+TCGv    gen_get_yaddr()
+{
+    return  gen_get_addr(cpu_rampY, cpu_r[29], cpu_r[28]);
+}
+
+TCGv    gen_get_zaddr()
+{
+    return  gen_get_addr(cpu_rampZ, cpu_r[31], cpu_r[30]);
+}

Wow. Um... Surely it would be better to store X and Y internally as whole 24-bit quantities, and Z as a 16-bit quantity (to be extended with rampz, rampd, or eind as needed).

This would allow normal loads, stores, and autoincrement to operate efficiently. When you see byte accesses to R[26-31], you extract/deposit the bytes as required. I assume that happens (significantly) less often than operating on X/Y/Z as a unit...
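
For X, roughly (untested; a single 24-bit cpu_X global is hypothetical here):

    /* reading r26 */
    tcg_gen_andi_tl(Rd, cpu_X, 0xff);
    /* writing r26 */
    tcg_gen_deposit_tl(cpu_X, cpu_X, Rd, 0, 8);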

One might make the same argument for R[24-25] as it pertains to adiw et al.

+int     sex(int Imm, unsigned bits)
+{
+    Imm <<= 32 - bits;
+    Imm >>= 32 - bits;
+
+    return  Imm;
+}

This is sextract32(imm, 0, bits).

+int    avr_translate_ADIW(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_ADIW_SBIW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    RdL = cpu_r[24 + 2 * ADIW_Rd(opcode)];
+    TCGv    RdH = cpu_r[25 + 2 * ADIW_Rd(opcode)];
+    int     Imm = (ADIW_hImm(opcode) << 4) | (ADIW_lImm(opcode));
+    TCGv    R   = tcg_temp_new_i32();
+    TCGv    Rd  = tcg_temp_new_i32();
+    TCGv    t0  = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_shli_tl(Rd, RdH, 8);     /*  R = ((H << 8) | L) + Imm  */
+    tcg_gen_or_tl(Rd, RdL, Rd);
+    tcg_gen_addi_tl(R, Rd, Imm);
+    tcg_gen_andi_tl(R, R, 0xffff);/*  make it 16 bits  */
+
+    /*  Cf  */
+    tcg_gen_not_tl(t0, R);             /*  t0  = Rd & ~R  */
+    tcg_gen_and_tl(t0, Rd, t0);
+    tcg_gen_shri_tl(cpu_Cf, t0, 15);    /*  Cf  = t0(15)  */

Or simply bit 16.

+    /*  Sf  */
+    tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/*  Sf  = Nf & Vf  */

xor.

+int    avr_translate_ASR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[ASR_Rd(opcode)];
+    TCGv    t1  = tcg_temp_new_i32();
+    TCGv    t2  = tcg_temp_new_i32();
+
+    /*  op  */
+    tcg_gen_andi_tl(t1, Rd, 0x80);      /*  t1  = (Rd & 0x80) | (Rd >> 1)  */
+    tcg_gen_shri_tl(t2, Rd, 1);
+    tcg_gen_or_tl(t1, t1, t2);
+
+    /*  Cf  */
+    tcg_gen_andi_tl(cpu_Cf, Rd, 1);         /*  Cf  = Rd(0)  */
+
+    /*  Vf  */
+    tcg_gen_and_tl(cpu_Vf, cpu_Nf, cpu_Cf);/*  Vf  = Nf & Cf  */

xor.

+/*
+    Copies the T Flag in the SREG (Status Register) to bit b in register Rd.
+*/
+int    avr_translate_BLD(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[BLD_Rd(opcode)];
+    TCGv    t1  = tcg_temp_new_i32();
+
+    tcg_gen_shli_tl(t1, cpu_Tf, BLD_Bit(opcode));
+    tcg_gen_or_tl(Rd, Rd, t1);
+    tcg_gen_and_tl(Rd, Rd, t1);

Err... extra and? I'm pretty sure this insn simply copies T into bit B, and doesn't clear all of the others.
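
Assuming Tf is stored as 0 or 1, a single deposit would do (untested):

    tcg_gen_deposit_tl(Rd, Rd, cpu_Tf, BLD_Bit(opcode), 1);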

+/*
+    Clears the specified bits in register Rd. Performs the logical AND between 
the contents of register Rd and the
+    complement of the constant mask K. The result will be placed in register 
Rd.
+*/
+int    avr_translate_COM(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[COM_Rd(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_movi_tl(R, 0xff);
+    tcg_gen_sub_tl(Rd, R, Rd);

Better as xor with 0xff.
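
i.e.

    tcg_gen_xori_tl(Rd, Rd, 0xff);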

+/*
+    This instruction performs a compare between two registers Rd and Rr, and 
skips the next instruction if Rd = Rr.
+*/
+int    avr_translate_CPSE(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv        Rd  = cpu_r[CPSE_Rd(opcode)];
+    TCGv        Rr  = cpu_r[(CPSE_hRr(opcode) << 4) | (CPSE_lRr(opcode))];
+    TCGLabel   *skip= gen_new_label();
+
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);      /*  PC if next inst is 
skipped  */
+    tcg_gen_brcond_i32(TCG_COND_EQ, Rd, Rr, skip);
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);      /*  PC if next inst is not 
skipped  */
+    gen_set_label(skip);

Any reason you're not using goto_tb?
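
Something along these lines (untested; ctx->tb naming the current TranslationBlock is an assumption about this front end, and the usual page-crossing check is omitted):

    TCGLabel *skip = gen_new_label();

    tcg_gen_brcond_i32(TCG_COND_EQ, Rd, Rr, skip);
    tcg_gen_goto_tb(0);
    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);  /* next insn not skipped */
    tcg_gen_exit_tb((uintptr_t)ctx->tb);
    gen_set_label(skip);
    tcg_gen_goto_tb(1);
    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);  /* next insn skipped */
    tcg_gen_exit_tb((uintptr_t)ctx->tb + 1);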

+/*
+    Subtracts one -1- from the contents of register Rd and places the result 
in the destination register Rd.
+    The C Flag in SREG is not affected by the operation, thus allowing the DEC 
instruction to be used on a loop
+    counter in multiple-precision computations.
+    When operating on unsigned values, only BREQ and BRNE branches can be 
expected to perform consistently.
+    When operating on two’s complement values, all signed branches are 
available.
+*/
+int    avr_translate_DEC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv        Rd  = cpu_r[DEC_Rd(opcode)];
+
+    tcg_gen_subi_tl(Rd, Rd, 1);     /*  Rd  = Rd - 1  */
+    tcg_gen_andi_tl(Rd, Rd, 0xff);  /*  make it 8 bits  */
+
+    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x7f);  /* cpu_Vf   = Rd == 
0x7f  */

This is INC overflow.

+int    avr_translate_INC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[INC_Rd(opcode)];
+
+    tcg_gen_addi_tl(Rd, Rd, 1);
+    tcg_gen_andi_tl(Rd, Rd, 0xff);
+
+    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x80);  /* cpu_Vf   = Rd == 
0x80  */

This is DEC overflow.

+/*
+    Load one byte indirect from data space to register and stores an clear the 
bits in data space specified by the
+    register. The instruction can only be used towards internal SRAM.
+    The data location is pointed to by the Z (16 bits) Pointer Register in the 
Register File. Memory access is limited
+    to the current data segment of 64KB. To access another data segment in 
devices with more than 64KB data
+    space, the RAMPZ in register in the I/O area has to be changed.
+    The Z-pointer Register is left unchanged by the operation. This 
instruction is especially suited for clearing status
+    bits stored in SRAM.
+*/
+int    avr_translate_LAC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rr  = cpu_r[LAC_Rr(opcode)];
+    TCGv    addr= gen_get_zaddr();
+    TCGv    t0  = tcg_temp_new_i32();
+    TCGv    t1  = tcg_temp_new_i32();
+
+    tcg_gen_qemu_ld8u(
+                    t0, addr, DATA_INDEX);    /*  t0      = mem[addr]  */
+    tcg_gen_movi_tl(t1, 0xff);                  /*  t1      = t0 & (0xff - Rr) 
 */
+    tcg_gen_sub_tl(t1, t1, Rr);
+    tcg_gen_and_tl(t1, t0, t1);

These last 3 are

  tcg_gen_andc_tl(t1, t0, Rr);

+/*
+    This instruction performs 8-bit x 8-bit -> 16-bit signed multiplication.
+*/
+int    avr_translate_MULS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_MUL) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    R0  = cpu_r[0];
+    TCGv    R1  = cpu_r[1];
+    TCGv    Rd  = cpu_r[16 + MULS_Rd(opcode)];
+    TCGv    Rr  = cpu_r[16 + MULS_Rr(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_ext8s_tl(Rd, Rd);            /*  make Rd full 32 bit signed  */
+    tcg_gen_ext8s_tl(Rr, Rr);            /*  make Rr full 32 bit signed  */

You need to either zero-extend these again afterward, or you need to perform the extensions into a temporary.
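
For example (untested; per the manual the 16-bit product lands in R1:R0):

    TCGv t0 = tcg_temp_new_i32();
    TCGv t1 = tcg_temp_new_i32();

    tcg_gen_ext8s_tl(t0, Rd);           /* sign-extend into temps, Rd/Rr stay 8-bit */
    tcg_gen_ext8s_tl(t1, Rr);
    tcg_gen_mul_tl(R, t0, t1);
    tcg_gen_andi_tl(R0, R, 0xff);
    tcg_gen_shri_tl(R1, R, 8);
    tcg_gen_andi_tl(R1, R1, 0xff);

    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);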

+/*
+    Replaces the contents of register Rd with its two’s complement; the value 
$80 is left unchanged.
+*/
+int    avr_translate_NEG(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    TCGv    Rd  = cpu_r[NEG_Rd(opcode)];
+    TCGv    R   = tcg_temp_new_i32();
+
+    tcg_gen_neg_tl(R, Rd);
+
+    tcg_gen_setcondi_tl(TCG_COND_NE, cpu_Cf, R, 0x00);  /*  Cf  = R != 0x00  */
+    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, R, 0x80);  /*  Vf  = R == 0x80  */

Correct, but surely it's better to re-use the normal subtraction infrastructure. In particular, setcond is at minimum two operations. The tcg optimizer should do approximately as well folding away the zero from the subtract.
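
A sketch reusing the patch's own helpers (untested):

    TCGv zero = tcg_const_i32(0);

    tcg_gen_sub_tl(R, zero, Rd);
    tcg_gen_andi_tl(R, R, 0xff);
    gen_sub_CHf(R, zero, Rd);
    gen_sub_Vf(R, zero, Rd);
    gen_ZNSf(R);
    tcg_gen_mov_tl(Rd, R);              /* and remember the write-back */
    tcg_temp_free_i32(zero);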

+    tcg_gen_not_tl(cpu_Hf, Rd);                                /*  Hf  = (~Rd 
| R)(3)  */
+    tcg_gen_or_tl(cpu_Hf, cpu_Hf, R);
+    tcg_gen_shri_tl(cpu_Hf, cpu_Hf, 3);
+    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
+    gen_ZNSf(R);
+
+    tcg_temp_free_i32(R);

You failed to write back the result.

+int    avr_translate_XCH(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+    if (avr_feature(env, AVR_FEATURE_RMW) == false) {
+        gen_helper_unsupported(cpu_env);
+
+        return BS_EXCP;
+    }
+
+    TCGv    Rd      = cpu_r[XCH_Rd(opcode)];
+    TCGv    t0      = tcg_temp_new_i32();
+    TCGv    addr    = gen_get_zaddr();
+
+    tcg_gen_qemu_ld8u(
+                    addr, t0, DATA_INDEX);
+    tcg_gen_qemu_st8(
+                    addr, Rd, DATA_INDEX);

The address and data temporaries are swapped.
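
i.e. presumably

    tcg_gen_qemu_ld8u(t0, addr, DATA_INDEX);
    tcg_gen_qemu_st8(Rd, addr, DATA_INDEX);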


r~


