[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [RFC 27/30] target-arm: emulate aarch64's LL/SC using cmpxc
From: |
Emilio G. Cota |
Subject: |
[Qemu-devel] [RFC 27/30] target-arm: emulate aarch64's LL/SC using cmpxchg helpers |
Date: |
Mon, 27 Jun 2016 15:02:13 -0400 |
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.
The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The subsequent
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.
Hi-res plots: http://imgur.com/a/JVc8Y
atomic_add-bench: 1000000 ops/thread, [0,1] range
18 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
16 ++master +-H--+ ++
|| |
14 ++ ++
| | |
12 ++| ++
| | |
10 ++++ ++
8 ++E ++
|+++ |
6 ++ | ++
| | |
4 ++ | ++
| | |
2 +H++E+--- ++
+ | +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E|
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,2] range
18 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
16 ++master +-H--+ ++
| | |
14 ++E ++
| | |
12 ++| ++
|+++ |
10 ++ | ++
8 ++ | ++
| | |
6 ++ | ++
| | |
4 ++ | ++
| +E+--- |
2 +H+ +E+-----+++ +++ +++ ---+E+-----+E+------+++
+++ + +E+---+--+E+----++E+------+E+--- ++++ +++ + +E|
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,128] range
70 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
60 ++master +-H--+ +++ ---+E+-----+E+------+E+
| +E+------E-------+E+--- |
| --- +++ |
50 ++ +++--- ++
| -+E+ |
40 ++ +++---- ++
| E- |
| --| |
30 ++ -- +++ ++
| +E+ |
20 ++E+ ++
|E+ |
| |
10 ++ ++
+ + + + + + + |
0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,1024] range
160 ++---------+---------+----------+---------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
140 ++master +-H--+ +++ +++
| -+E+-----+E+-------E|
120 ++ +++ ---- +++
| +++ ----E-- |
100 ++ --E--- +++ ++
| +++ ---- +++ |
80 ++ --E-- ++
| ---- +++ |
| -+E+ |
60 ++ ---- +++ ++
| +E+- |
40 ++ -- ++
| +E+ |
20 +EE+ ++
+++ + + + + + + |
0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
Signed-off-by: Emilio G. Cota <address@hidden>
---
target-arm/translate-a64.c | 79 ++++++++++++++++++----------------------------
1 file changed, 30 insertions(+), 49 deletions(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index f5e29d2..1676519 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1754,17 +1754,6 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t
insn)
}
}
-/*
- * Load/Store exclusive instructions are implemented by remembering
- * the value/address loaded, and seeing if these are the same
- * when the store is performed. This is not actually the architecturally
- * mandated semantics, but it works for typical guest code sequences
- * and avoids having to monitor regular stores.
- *
- * In system emulation mode only one CPU will be running at once, so
- * this sequence is effectively atomic. In user emulation mode we
- * throw an exception and handle the atomic operation elsewhere.
- */
static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
TCGv_i64 addr, int size, bool is_pair)
{
@@ -1794,16 +1783,6 @@ static void gen_load_exclusive(DisasContext *s, int rt,
int rt2,
tcg_gen_mov_i64(cpu_exclusive_addr, addr);
}
-#ifdef CONFIG_USER_ONLY
-static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
- TCGv_i64 addr, int size, int is_pair)
-{
- tcg_gen_mov_i64(cpu_exclusive_test, addr);
- tcg_gen_movi_i32(cpu_exclusive_info,
- size | is_pair << 2 | (rd << 4) | (rt << 9) | (rt2 <<
14));
- gen_exception_internal_insn(s, 4, EXCP_STREX);
-}
-#else
static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
TCGv_i64 inaddr, int size, int is_pair)
{
@@ -1831,46 +1810,48 @@ static void gen_store_exclusive(DisasContext *s, int
rd, int rt, int rt2,
tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label);
tmp = tcg_temp_new_i64();
- tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), s->be_data + size);
- tcg_gen_brcond_i64(TCG_COND_NE, tmp, cpu_exclusive_val, fail_label);
- tcg_temp_free_i64(tmp);
-
if (is_pair) {
- TCGv_i64 addrhi = tcg_temp_new_i64();
- TCGv_i64 tmphi = tcg_temp_new_i64();
-
- tcg_gen_addi_i64(addrhi, addr, 1 << size);
- tcg_gen_qemu_ld_i64(tmphi, addrhi, get_mem_index(s),
- s->be_data + size);
- tcg_gen_brcond_i64(TCG_COND_NE, tmphi, cpu_exclusive_high, fail_label);
-
- tcg_temp_free_i64(tmphi);
- tcg_temp_free_i64(addrhi);
- }
-
- /* We seem to still have the exclusive monitor, so do the store */
- tcg_gen_qemu_st_i64(cpu_reg(s, rt), addr, get_mem_index(s),
- s->be_data + size);
- if (is_pair) {
- TCGv_i64 addrhi = tcg_temp_new_i64();
+ if (size == 2) {
+ gen_helper_paired_cmpxchg64l(tmp, cpu_env, addr, cpu_exclusive_val,
+ cpu_exclusive_high,
+ cpu_reg(s, rt), cpu_reg(s, rt2));
+ } else {
+ gen_helper_paired_cmpxchg64q(tmp, cpu_env, addr, cpu_exclusive_val,
+ cpu_exclusive_high, cpu_reg(s, rt),
+ cpu_reg(s, rt2));
+ }
+ } else {
+ TCGv_i64 val = cpu_reg(s, rt);
- tcg_gen_addi_i64(addrhi, addr, 1 << size);
- tcg_gen_qemu_st_i64(cpu_reg(s, rt2), addrhi,
- get_mem_index(s), s->be_data + size);
- tcg_temp_free_i64(addrhi);
+ switch (size) {
+ case 0:
+ gen_helper_cmpxchg64b(tmp, cpu_env, addr, cpu_exclusive_val, val);
+ break;
+ case 1:
+ gen_helper_cmpxchg64w(tmp, cpu_env, addr, cpu_exclusive_val, val);
+ break;
+ case 2:
+ gen_helper_cmpxchg64l(tmp, cpu_env, addr, cpu_exclusive_val, val);
+ break;
+ case 3:
+ gen_helper_cmpxchg64q(tmp, cpu_env, addr, cpu_exclusive_val, val);
+ break;
+ default:
+ abort();
+ }
}
tcg_temp_free_i64(addr);
- tcg_gen_movi_i64(cpu_reg(s, rd), 0);
+ tcg_gen_mov_i64(cpu_reg(s, rd), tmp);
+ tcg_temp_free_i64(tmp);
tcg_gen_br(done_label);
+
gen_set_label(fail_label);
tcg_gen_movi_i64(cpu_reg(s, rd), 1);
gen_set_label(done_label);
tcg_gen_movi_i64(cpu_exclusive_addr, -1);
-
}
-#endif
/* Update the Sixty-Four bit (SF) registersize. This logic is derived
* from the ARMv8 specs for LDR (Shared decode for all encodings).
--
2.5.0
- [Qemu-devel] [RFC 09/30] softmmu: add atomic helpers, (continued)
- [Qemu-devel] [RFC 09/30] softmmu: add atomic helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 07/30] atomics: add atomic_xor, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 08/30] atomics: add atomic_op_fetch variants, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 14/30] target-i386: emulate LOCK'ed NOT using atomic helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 10/30] cpu_ldst: add cpu_atomic helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 11/30] target-i386: add atomic helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 06/30] target-i386: emulate LOCK'ed cmpxchg8b/16b using cmpxchg helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 05/30] target-i386: emulate LOCK'ed cmpxchg using cmpxchg helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 13/30] target-i386: emulate LOCK'ed INC using atomic helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 15/30] target-i386: emulate LOCK'ed NEG using cmpxchg helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 27/30] target-arm: emulate aarch64's LL/SC using cmpxchg helpers,
Emilio G. Cota <=
- [Qemu-devel] [RFC 25/30] helper: add DEF_HELPER_6, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 16/30] target-i386: emulate LOCK'ed XADD using atomic helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 23/30] target-arm: add atomic_xchg helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 22/30] target-arm: emulate LL/SC using cmpxchg helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 29/30] linux-user: remove handling of aarch64's EXCP_STREX, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 20/30] target-i386: remove helper_lock(), Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 26/30] target-arm: add cmpxchg helpers for aarch64, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 17/30] target-i386: emulate LOCK'ed BTX ops using atomic helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 24/30] target-arm: emulate SWP with atomic_xchg helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 30/30] target-arm: remove EXCP_STREX + cpu_exclusive_{test, info}, Emilio G. Cota, 2016/06/27