[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH v4 12/17] target/riscv: support for 128-bit arithmetic instru
From: |
Richard Henderson |
Subject: |
Re: [PATCH v4 12/17] target/riscv: support for 128-bit arithmetic instructions |
Date: |
Tue, 2 Nov 2021 08:43:28 -0400 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Thunderbird/78.13.0 |
On 10/25/21 5:28 AM, Frédéric Pétrot wrote:
+ if (get_xl(ctx) < MXL_RV128 || get_ol(ctx) < MXL_RV128) {
Only the second test. Two times.
+static bool gen_setcond_i128(TCGv rl, TCGv rh,
+ TCGv al, TCGv ah,
+ TCGv bl, TCGv bh,
+ TCGCond cond)
+{
+ switch (cond) {
+ case TCG_COND_EQ:
+ tcg_gen_xor_tl(rl, al, bl);
+ tcg_gen_xor_tl(rh, ah, bh);
+ tcg_gen_or_tl(rh, rl, rh);
+ tcg_gen_setcondi_tl(TCG_COND_EQ, rl, rh, 0);
+ break;
Indentation.
+
+ case TCG_COND_NE:
+ tcg_gen_xor_tl(rl, al, bl);
+ tcg_gen_xor_tl(rh, ah, bh);
+ tcg_gen_or_tl(rh, rl, rh);
+ tcg_gen_setcondi_tl(TCG_COND_NE, rl, rh, 0);
+ break;
+
+ case TCG_COND_LT:
+ {
+ TCGv tmp1 = tcg_temp_new();
+ TCGv tmp2 = tcg_temp_new();
+
+ tcg_gen_sub2_tl(rl, rh, al, ah, bl, bh);
+ tcg_gen_xor_tl(tmp1, rh, ah);
+ tcg_gen_xor_tl(tmp2, ah, bh);
+ tcg_gen_and_tl(tmp1, tmp1, tmp2);
+ tcg_gen_xor_tl(tmp1, rh, tmp1);
+ tcg_gen_shri_tl(rl, tmp1, 63);
+
+ tcg_temp_free(tmp1);
+ tcg_temp_free(tmp2);
+ break;
+ }
+
+ case TCG_COND_GE:
+ /* Invert result of TCG_COND_LT */
+ gen_setcond_i128(rl, rh, al, ah, bl, bh, TCG_COND_LT);
+ tcg_gen_xori_tl(rl, rl, 1);
+ break;
+
+ case TCG_COND_LTU:
+ gen_setcond_i128(rl, rh, al, ah, bl, bh, TCG_COND_GEU);
+ tcg_gen_xori_tl(rl, rl, 1);
+ break;
+
+ case TCG_COND_GEU:
+ {
+ TCGv tmp1 = tcg_temp_new();
+ TCGv tmp2 = tcg_temp_new();
+ TCGv zero = tcg_constant_tl(0);
+ TCGv one = tcg_constant_tl(1);
+ /* borrow in to second word */
+ tcg_gen_setcond_tl(TCG_COND_LTU, tmp1, al, bl);
+ /* seed third word with 1, which will be result */
+ tcg_gen_sub2_tl(tmp1, tmp2, ah, one, tmp1, zero);
+ tcg_gen_sub2_tl(tmp1, rl, tmp1, tmp2, bh, zero);
+
+ tcg_temp_free(tmp1);
+ tcg_temp_free(tmp2);
+ break;
+ }
+
+ default:
+ return false;
This should be g_assert_not_reached(), not return false.
+ }
+ tcg_gen_movi_tl(rh, 0);
+ return true;
+}
Which begs the question of what the return value is for when you don't even use
it below.
I think we should treat this as more of a subroutine, and return the final TCGCond
required to get the correct result, *without* actually computing the final setcond.
static TCGCond compare_128i(TCGv rl, int rs1, int rs2, TCGCond cond)
{
TCGv al = get_gpr(ctx, rs1, EXT_SIGN);
TCGv bl = get_gpr(ctx, rs2, EXT_SIGN);
TCGv ah = get_gprh(ctx, a->rs1);
TCGv bh = get_gprh(ctx, a->rs2);
TCGv rh = tcg_temp_new();
bool invert = false;
switch (cond) {
case TCG_COND_EQ:
case TCG_COND_NE:
if (rs2 == 0) {
tcg_gen_or_tl(rl, al, ah);
} else {
tcg_gen_xor_tl(rl, al, bl);
tcg_gen_xor_tl(rh, ah, bh);
tcg_gen_or_tl(rl, rl, rh);
}
break;
case TCG_COND_GE:
invert = true;
cond = TCG_COND_LT;
/* fall through */
case TCG_COND_LT:
if (rs2 == 0) {
tcg_gen_mov_tl(rl, ah);
} else {
TCGv tmp1 = tcg_temp_new();
TCGv tmp2 = tcg_temp_new();
tcg_gen_sub2_tl(rl, rh, al, ah, bl, bh);
tcg_gen_xor_tl(tmp1, rh, ah);
tcg_gen_xor_tl(tmp2, ah, bh);
tcg_gen_and_tl(tmp1, tmp1, tmp2);
tcg_gen_xor_tl(rl, rh, tmp1);
tcg_temp_free(tmp1);
tcg_temp_free(tmp2);
}
break;
...
}
tcg_temp_free(rh);
if (invert) {
cond = tcg_invert_cond(cond);
}
return cond;
}
static void setcond_128i(...)
{
cond = compare_128i(rl, rh, al, ah, bl, bh, cond);
tcg_gen_setcondi_tl(cond, rl, rl, 0);
tcg_gen_movi_tl(rh, 0);
}
static void branch_128i(...)
{
TCGv tmpl = tcg_temp_new();
TCGv tmph = tcg_temp_new();
cond = compare_128i(tmpl, tmph, al, ah, bl, bh, cond);
tcg_gen_brcondi_tl(cond, tmpl, 0, label);
tcg_temp_free(tmpl);
tcg_temp_free(tmph);
}
+
static bool gen_branch(DisasContext *ctx, arg_b *a, TCGCond cond)
{
TCGLabel *l = gen_new_label();
TCGv src1 = get_gpr(ctx, a->rs1, EXT_SIGN);
TCGv src2 = get_gpr(ctx, a->rs2, EXT_SIGN);
- tcg_gen_brcond_tl(cond, src1, src2, l);
+ if (get_xl(ctx) == MXL_RV128) {
+ TCGv src1h = get_gprh(ctx, a->rs1);
+ TCGv src2h = get_gprh(ctx, a->rs2);
+ TCGv tmpl = tcg_temp_new();
+ TCGv tmph = tcg_temp_new();
+
+ /*
+ * Do not use gen_setcond_i128 for EQ and NE as these conditions are
+ * often met and can be more efficiently implemented.
+ * Some comparisons with zero are also simpler, let's do them.
+ */
+ if (cond == TCG_COND_EQ || cond == TCG_COND_NE) {
+ /*
+ * bnez and beqz being used quite often too, lets optimize them,
+ * although QEMU's tcg optimizer handles these cases nicely
+ */
+ if (a->rs2 == 0) {
+ tcg_gen_or_tl(tmpl, src1, src1h);
+ tcg_gen_brcondi_tl(cond, tmpl, 0, l);
+ } else {
+ tcg_gen_xor_tl(tmpl, src1, src2);
+ tcg_gen_xor_tl(tmph, src1h, src2h);
+ tcg_gen_or_tl(tmpl, tmpl, tmph);
+ tcg_gen_brcondi_tl(cond, tmpl, 0, l);
+ }
+ } else if (a->rs2 == 0
+ && (cond == TCG_COND_LT || cond == TCG_COND_GE)) {
+ tcg_gen_shri_tl(tmpl, src1h, 63);
+ /* hack: transform LT in EQ and GE in NE */
+ tcg_gen_brcondi_tl((cond & 13) | 8, tmpl, 1, l);
+ } else {
+ if (cond == TCG_COND_GE || cond == TCG_COND_LTU) {
+ gen_setcond_i128(tmpl, tmph, src1, src1h, src2, src2h,
+ tcg_invert_cond(cond));
+ tcg_gen_brcondi_tl(TCG_COND_EQ, tmpl, 0, l);
+ } else {
+ gen_setcond_i128(tmpl, tmph, src1, src1h, src2, src2h, cond);
+ tcg_gen_brcondi_tl(TCG_COND_NE, tmpl, 0, l);
+ }
+ }
Which then takes care of all of this.
BTW, the hack was quite a hack, and quite unnecessary. You could have dropped the shift
and performed the LT/GE brcond directly on src1h.
r~
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: [PATCH v4 12/17] target/riscv: support for 128-bit arithmetic instructions,
Richard Henderson <=