[PATCH v2 16/30] accel/tcg: Add aarch64 specific support in ldst_atomicity

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v2 16/30] accel/tcg: Add aarch64 specific support in ldst_atomicity

From:	Richard Henderson
Subject:	[PATCH v2 16/30] accel/tcg: Add aarch64 specific support in ldst_atomicity
Date:	Wed, 15 Feb 2023 16:57:25 -1000

We have code in atomic128.h noting that through GCC 8, there
was no support for atomic operations on __uint128_t.  This has
been fixed in GCC 10.  But we can still improve over any
basic compare-and-swap loop by using the ldxp/stxp instructions
directly.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/ldst_atomicity.c.inc | 60 ++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index 07982e021d..9a95ac327d 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -247,7 +247,22 @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
      * In system mode all guest pages are writable, and for user-only
      * we have just checked writability.  Try cmpxchg.
      */
-#if defined(CONFIG_CMPXCHG128)
+#if defined(__aarch64__)
+    /* We can do better than cmpxchg for AArch64.  */
+    {
+        uint64_t l, h;
+        uint32_t fail;
+
+        /* The load must be paired with the store to guarantee not tearing. */
+        asm("0: ldxp %0, %1, %3\n\t"
+            "stxp %w2, %0, %1, %3\n\t"
+            "cbnz %w2, 0b"
+            : "=&r"(l), "=&r"(h), "=&r"(fail) : "Q"(*p));
+
+        qemu_build_assert(!HOST_BIG_ENDIAN);
+        return int128_make128(l, h);
+    }
+#elif defined(CONFIG_CMPXCHG128)
     /* Swap 0 with 0, with the side-effect of returning the old value. */
     {
         Int128Alias r;
@@ -740,7 +755,22 @@ store_atomic16(void *pv, Int128Alias val)
         return;
     }
 #endif
-#if defined(CONFIG_CMPXCHG128)
+#if defined(__aarch64__)
+    /* We can do better than cmpxchg for AArch64.  */
+    {
+        uint64_t l, h, t;
+
+        qemu_build_assert(!HOST_BIG_ENDIAN);
+        l = int128_getlo(val.s);
+        h = int128_gethi(val.s);
+
+        asm("0: ldxp %0, xzr, %1\n\t"
+            "stxp %w0, %2, %3, %1\n\t"
+            "cbnz %w0, 0b"
+            : "=&r"(t), "=Q"(*(__uint128_t *)pv) : "r"(l), "r"(h));
+        return;
+    }
+#elif defined(CONFIG_CMPXCHG128)
     {
         __uint128_t *pu = __builtin_assume_aligned(pv, 16);
         __uint128_t o;
@@ -838,7 +868,31 @@ static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
 static void ATTRIBUTE_ATOMIC128_OPT
 store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
 {
-#if defined(CONFIG_ATOMIC128)
+#if defined(__aarch64__)
+    /*
+     * GCC only implements __sync* primitives for int128 on aarch64.
+     * We can do better without the barriers, and integrating the
+     * arithmetic into the load-exclusive/store-conditional pair.
+     */
+    uint64_t tl, th, vl, vh, ml, mh;
+    uint32_t fail;
+
+    qemu_build_assert(!HOST_BIG_ENDIAN);
+    vl = int128_getlo(val.s);
+    vh = int128_gethi(val.s);
+    ml = int128_getlo(msk.s);
+    mh = int128_gethi(msk.s);
+
+    asm("0: ldxp %[l], %[h], %[mem]\n\t"
+        "bic %[l], %[l], %[ml]\n\t"
+        "bic %[h], %[h], %[mh]\n\t"
+        "orr %[l], %[l], %[vl]\n\t"
+        "orr %[h], %[h], %[vh]\n\t"
+        "stxp %w[f], %[l], %[h], %[mem]\n\t"
+        "cbnz %w[f], 0b\n"
+        : [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th)
+        : [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh));
+#elif defined(CONFIG_ATOMIC128)
     __uint128_t *pu, old, new;
 
     /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
-- 
2.34.1

[Prev in Thread]

Current Thread

[Next in Thread]

[PATCH v2 04/30] accel/tcg: Introduce tlb_read_idx, (continued)
- [PATCH v2 04/30] accel/tcg: Introduce tlb_read_idx, Richard Henderson, 2023/02/15
- [PATCH v2 05/30] accel/tcg: Reorg system mode load helpers, Richard Henderson, 2023/02/15
- [PATCH v2 06/30] accel/tcg: Reorg system mode store helpers, Richard Henderson, 2023/02/15
- [PATCH v2 07/30] accel/tcg: Honor atomicity of loads, Richard Henderson, 2023/02/15
- [PATCH v2 08/30] accel/tcg: Honor atomicity of stores, Richard Henderson, 2023/02/15
- [PATCH v2 09/30] tcg/tci: Use cpu_{ld,st}_mmu, Richard Henderson, 2023/02/15
- [PATCH v2 10/30] tcg: Unify helper_{be,le}_{ld,st}*, Richard Henderson, 2023/02/15
- [PATCH v2 13/30] meson: Detect atomic128 support with optimization, Richard Henderson, 2023/02/15
- [PATCH v2 11/30] accel/tcg: Implement helper_{ld, st}*_mmu for user-only, Richard Henderson, 2023/02/15
- [PATCH v2 12/30] tcg: Add 128-bit guest memory primitives, Richard Henderson, 2023/02/15
- [PATCH v2 16/30] accel/tcg: Add aarch64 specific support in ldst_atomicity, Richard Henderson <=
- [PATCH v2 14/30] tcg/i386: Add have_atomic16, Richard Henderson, 2023/02/15
- [PATCH v2 15/30] accel/tcg: Use have_atomic16 in ldst_atomicity.c.inc, Richard Henderson, 2023/02/15
- [PATCH v2 17/30] tcg/aarch64: Detect have_lse, have_lse2 for linux, Richard Henderson, 2023/02/15
- [PATCH v2 19/30] accel/tcg: Add have_lse2 support in ldst_atomicity, Richard Henderson, 2023/02/15
- [PATCH v2 18/30] tcg/aarch64: Detect have_lse, have_lse2 for darwin, Richard Henderson, 2023/02/15
- [PATCH v2 20/30] tcg: Introduce TCG_OPF_TYPE_MASK, Richard Henderson, 2023/02/15
- [PATCH v2 22/30] tcg/i386: Introduce tcg_out_mov2, Richard Henderson, 2023/02/15
- [PATCH v2 21/30] tcg: Add INDEX_op_qemu_{ld,st}_i128, Richard Henderson, 2023/02/15
- [PATCH v2 23/30] tcg/i386: Introduce tcg_out_testi, Richard Henderson, 2023/02/15
- [PATCH v2 24/30] tcg/i386: Use full load/store helpers in user-only mode, Richard Henderson, 2023/02/15

Prev by Date: [PATCH v2 12/30] tcg: Add 128-bit guest memory primitives
Next by Date: [PATCH v2 14/30] tcg/i386: Add have_atomic16
Previous by thread: [PATCH v2 12/30] tcg: Add 128-bit guest memory primitives
Next by thread: [PATCH v2 14/30] tcg/i386: Add have_atomic16
Index(es):
- Date
- Thread