guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] 03/07: First implementation of a template JIT


From: Andy Wingo
Subject: [Guile-commits] 03/07: First implementation of a template JIT
Date: Mon, 20 Aug 2018 06:08:27 -0400 (EDT)

wingo pushed a commit to branch lightning
in repository guile.

commit 698bff8748492c10d9935d7837776c323dc4f383
Author: Andy Wingo <address@hidden>
Date:   Sun Aug 19 17:39:16 2018 +0200

    First implementation of a template JIT
    
    * libguile/jit.c: Implement most of a JIT.  Untested and still needs to
      be wired up.
---
 libguile/jit.c | 2400 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 2316 insertions(+), 84 deletions(-)

diff --git a/libguile/jit.c b/libguile/jit.c
index afd3f05..7f0bac5 100644
--- a/libguile/jit.c
+++ b/libguile/jit.c
@@ -26,18 +26,739 @@
 #include <lightning.h>
 #endif
 
+#include "frames.h"
+#include "gsubr.h"
+#include "instructions.h"
+#include "intrinsics.h"
 #include "threads.h"
+#include "vm-builtins.h"
 #include "vm-operations.h"
 
 #include "jit.h"
 
+
+
+
 typedef struct {
+  jit_state_t *jit;
   scm_thread *thread;
   const uint32_t *start;
   uint32_t *ip;
   const uint32_t *end;
+  int32_t frame_size;
+  uint8_t hooks_enabled;
 } scm_jit_state;
 
+/* Lightning routines take an implicit parameter, _jit.  All functions
+   that call lightning API should have a parameter "scm_jit_state *j";
+   this definition makes lightning load its state from that
+   parameter.  */
+#define _jit (j->jit)
+
+static const uint32_t program_word_offset_free_variable = 2;
+
+static const uint32_t frame_offset_mra = 0 * sizeof(union 
scm_vm_stack_element);
+static const uint32_t frame_offset_vra = 1 * sizeof(union 
scm_vm_stack_element);
+static const uint32_t frame_offset_prev = 2 * sizeof(union 
scm_vm_stack_element);
+static const uint32_t frame_overhead_slots = 3;
+
+#define DEFINE_THREAD_OFFSET(f)                                         \
+  static const uint32_t thread_offset_##f =                             \
+    offsetof (struct scm_thread, f)
+
+DEFINE_THREAD_OFFSET (handle);
+DEFINE_THREAD_OFFSET (pending_asyncs);
+DEFINE_THREAD_OFFSET (block_asyncs);
+
+#define DEFINE_THREAD_VP_OFFSET(f)                                      \
+  static const uint32_t thread_offset_##f =                             \
+    offsetof (struct scm_thread, vm) + offsetof (struct scm_vm, f)
+
+DEFINE_THREAD_VP_OFFSET (fp);
+DEFINE_THREAD_VP_OFFSET (sp);
+DEFINE_THREAD_VP_OFFSET (ip);
+DEFINE_THREAD_VP_OFFSET (compare_result);
+DEFINE_THREAD_VP_OFFSET (sp_min_since_gc);
+DEFINE_THREAD_VP_OFFSET (stack_limit);
+DEFINE_THREAD_VP_OFFSET (trace_level);
+
+static const jit_gpr_t THREAD = JIT_V0;
+static const jit_gpr_t SP = JIT_V1;
+
+static const jit_gpr_t T0 = JIT_R0;
+static const jit_gpr_t T1 = JIT_R1;
+static const jit_gpr_t T2 = JIT_R2;
+static const jit_gpr_t T3 = JIT_V2;
+
+/* Sometimes you want to call out the fact that T3 is preserved across
+   calls.  In that case, use T3_PRESERVED.  */
+static const jit_gpr_t T3_PRESERVED = JIT_V2;
+
+#ifdef WORDS_BIGENDIAN
+#define BIGENDIAN 1
+#else
+#define BIGENDIAN 0
+#endif
+
+#if BIGENDIAN
+static const uint32_t uint32_offset_low_byte = 3;
+#else
+static const uint32_t uint32_offset_low_byte = 0;
+#endif
+
+#if SCM_SIZEOF_UINTPTR_T == 4
+static const uint32_t log2_sizeof_uintptr_t = 2;
+#elif SCM_SIZEOF_UINTPTR_T == 8
+static const uint32_t log2_sizeof_uintptr_t = 3;
+#else
+#error unhandled uintptr_t size
+#endif
+
+static void
+emit_reload_sp (scm_jit_state *j)
+{
+  jit_ldxi (SP, THREAD, thread_offset_sp);
+}
+
+static void
+emit_store_sp (scm_jit_state *j)
+{
+  jit_stxi (thread_offset_sp, THREAD, SP);
+}
+
+static void
+emit_load_fp (scm_jit_state *j, jit_gpr_t dst)
+{
+  jit_ldxi (dst, THREAD, thread_offset_fp);
+}
+
+static void
+emit_store_fp (scm_jit_state *j, jit_gpr_t fp)
+{
+  jit_stxi (thread_offset_fp, THREAD, fp);
+}
+
+static void
+emit_subtract_stack_slots (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t src,
+                           uint32_t n)
+{
+  jit_subi (dst, src, n * sizeof (union scm_vm_stack_element));
+}
+
+static void
+emit_load_mra (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t fp)
+{
+  if (frame_offset_mra != 0)
+    abort ();
+  jit_ldr (dst, fp);
+}
+
+static jit_node_t *
+emit_store_mra (scm_jit_state *j, jit_gpr_t fp, jit_gpr_t t)
+{
+  jit_node_t *addr = jit_movi (t, 0); /* patched later */
+  if (frame_offset_mra != 0)
+    abort ();
+  jit_str (fp, t);
+  return addr;
+}
+
+static void
+emit_load_vra (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t fp)
+{
+  jit_ldxi (dst, fp, frame_offset_vra);
+}
+
+static void
+emit_store_vra (scm_jit_state *j, jit_gpr_t fp, jit_gpr_t t, const uint32_t 
*vra)
+{
+  jit_movi (t, (intptr_t) vra);
+  jit_stxi (frame_offset_vra, fp, t);
+}
+
+static void
+emit_load_prev_frame_size (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t fp)
+{
+  jit_ldxi (dst, fp, frame_offset_prev);
+}
+
+static void
+emit_store_prev_frame_size (scm_jit_state *j, jit_gpr_t fp, jit_gpr_t t,
+                            uint32_t n)
+{
+  jit_movi (t, n);
+  jit_stxi (frame_offset_prev, fp, t);
+}
+
+static void
+emit_store_ip (scm_jit_state *j, jit_gpr_t ip)
+{
+  jit_stxi (thread_offset_ip, THREAD, ip);
+}
+
+static void
+emit_store_current_ip (scm_jit_state *j, jit_gpr_t t)
+{
+  jit_movi (t, (intptr_t) j->ip);
+  emit_store_ip (j, t);
+}
+
+static void
+emit_load_compare_result (scm_jit_state *j, jit_gpr_t dst)
+{
+  jit_ldxi_uc (dst, THREAD, thread_offset_compare_result);
+}
+
+static void
+emit_store_compare_result (scm_jit_state *j, jit_gpr_t src)
+{
+  jit_stxi_c (thread_offset_compare_result, THREAD, src);
+}
+
+static void
+emit_reset_frame (scm_jit_state *j, jit_gpr_t fp, uint32_t nlocals)
+{
+  emit_subtract_stack_slots (j, SP, fp, nlocals);
+  emit_store_sp (j);
+}
+
+static void
+emit_call (scm_jit_state *j, void *f)
+{
+  jit_prepare ();
+  jit_finishi (f);
+}
+
+static void
+emit_call_r (scm_jit_state *j, void *f, jit_gpr_t a)
+{
+  jit_prepare ();
+  jit_pushargr (a);
+  jit_finishi (f);
+}
+
+static void
+emit_call_i (scm_jit_state *j, void *f, intptr_t a)
+{
+  jit_prepare ();
+  jit_pushargi (a);
+  jit_finishi (f);
+}
+
+static void
+emit_call_r_r (scm_jit_state *j, void *f, jit_gpr_t a, jit_gpr_t b)
+{
+  jit_prepare ();
+  jit_pushargr (a);
+  jit_pushargr (b);
+  jit_finishi (f);
+}
+
+static void
+emit_call_r_i (scm_jit_state *j, void *f, jit_gpr_t a, intptr_t b)
+{
+  jit_prepare ();
+  jit_pushargr (a);
+  jit_pushargi ((intptr_t) b);
+  jit_finishi (f);
+}
+
+static void
+emit_call_r_r_r (scm_jit_state *j, void *f, jit_gpr_t a, jit_gpr_t b,
+                 jit_gpr_t c)
+{
+  jit_prepare ();
+  jit_pushargr (a);
+  jit_pushargr (b);
+  jit_pushargr (c);
+  jit_finishi (f);
+}
+
+/* Publish the tentative stack pointer held in SP, growing the stack if
+   needed.  T is a scratch register; FP is currently unused.
+
+   - SP >= the thread's sp_min_since_gc: just store SP to the thread.
+   - else SP >= the thread's stack_limit: record SP as the new
+     sp_min_since_gc, then store SP.
+   - else: store the current IP, call the expand_stack intrinsic with
+     (THREAD, SP), and reload SP from the thread.  */
+static void
+emit_alloc_frame_for_sp (scm_jit_state *j, jit_gpr_t fp, jit_gpr_t t)
+{
+  jit_node_t *k, *fast, *watermark;
+
+  /* The thread_offset_* values are constants, so the immediate-offset
+     forms (ldxi/stxi) are required; ldxr/stxr take the offset in a
+     register.  */
+  jit_ldxi (t, THREAD, thread_offset_sp_min_since_gc);
+  fast = jit_bger (SP, t);
+  jit_ldxi (t, THREAD, thread_offset_stack_limit);
+  watermark = jit_bger (SP, t);
+
+  /* Slow case: call out to expand stack.  */
+  emit_store_current_ip (j, t);
+  emit_call_r_r (j, scm_vm_intrinsics.expand_stack, THREAD, SP);
+  emit_reload_sp (j);
+  k = jit_jmpi ();
+
+  /* Past sp_min_since_gc, but within stack_limit: update watermark and
+     fall through.  */
+  jit_patch (watermark);
+  jit_stxi (thread_offset_sp_min_since_gc, THREAD, SP);
+  jit_patch (fast);
+  /* Fast case: Just update sp.  */
+  emit_store_sp (j);
+  jit_patch (k);
+}
+
+static void
+emit_alloc_frame (scm_jit_state *j, jit_gpr_t fp, jit_gpr_t t, uint32_t 
nlocals)
+{
+  emit_subtract_stack_slots (j, SP, fp, nlocals);
+  emit_alloc_frame_for_sp (j, fp, t);
+}
+
+static void
+emit_get_callee_vcode (scm_jit_state *j, jit_gpr_t dst)
+{
+  emit_call_r (j, scm_vm_intrinsics.get_callee_vcode, THREAD);
+  jit_retval (dst);
+}
+
+static void
+emit_get_vcode_low_byte (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t addr)
+{
+  if (uint32_offset_low_byte == 0)
+    jit_ldr_uc (dst, addr);
+  else
+    jit_ldxi_uc (dst, addr, uint32_offset_low_byte);
+}
+
+/* Compute into DST the address IP + IP[OFFSET] * sizeof (uint32_t),
+   where IP[OFFSET] is read as a signed 32-bit word.  This is the
+   run-time counterpart of the "vcode + (int32_t) vcode[1]" computation
+   in emit_direct_tail_call.  */
+static void
+emit_get_ip_relative_addr (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t ip,
+                           uint32_t offset)
+{
+  uint32_t byte_offset = offset * sizeof (uint32_t);
+  /* Load exactly 32 bits, sign-extended: a word-sized load would pull
+     in the following vcode word on 64-bit targets, and the relative
+     offset may be negative.  */
+  jit_ldxi_i (dst, ip, byte_offset);
+  jit_lshi (dst, dst, 2); /* Multiply by sizeof (uint32_t) */
+  jit_addr (dst, dst, ip);
+}
+
+static jit_node_t*
+emit_push_frame (scm_jit_state *j, uint32_t proc_slot, uint32_t nlocals,
+                 const uint32_t *vra)
+{
+  jit_gpr_t fp = T0, old_fp = T1;
+  jit_node_t *continuation;
+  /* Set up a new frame PROC_SLOT slots below the current fp: store a
+     to-be-patched machine return address, VRA and the previous frame
+     size in it, install it as fp, and reset SP for NLOCALS locals.
+     Returns the node that the caller must patch with the return point.  */
+  emit_load_fp (j, old_fp);
+  emit_subtract_stack_slots (j, fp, old_fp, proc_slot);
+  continuation = emit_store_mra (j, fp, T1); /* reuses T1: old_fp is dead */
+  emit_store_vra (j, fp, T1, vra);
+  emit_store_prev_frame_size (j, fp, T1, proc_slot - frame_overhead_slots);
+  emit_store_fp (j, fp);
+  emit_reset_frame (j, fp, nlocals);
+
+  return continuation;
+}
+
+static void
+emit_indirect_tail_call (scm_jit_state *j)
+{
+  jit_node_t *not_instrumented, *no_mcode;
+  /* Ask the get_callee_vcode intrinsic for the callee's vcode, then
+     jump to its machine code if any; otherwise store the vcode address
+     as the VM ip and return.  */
+  emit_get_callee_vcode (j, T0);
+  /* Fall back unless the first opcode is instrument-entry.  */
+  emit_get_vcode_low_byte (j, T1, T0);
+  not_instrumented = jit_bnei (T1, scm_op_instrument_entry);
+  emit_get_ip_relative_addr (j, T1, T0, 1);
+  jit_ldr (T1, T1); /* presumably data->mcode; cf. emit_direct_tail_call */
+  no_mcode = jit_beqi (T1, 0);
+  jit_jmpr (T1);
+  /* Fallback path: no machine code to jump to.  */
+  jit_patch (not_instrumented);
+  jit_patch (no_mcode);
+
+  emit_store_ip (j, T0);
+  jit_ret ();
+}
+
+static void
+emit_direct_tail_call (scm_jit_state *j, const uint32_t *vcode)
+{
+  if ((vcode[0] & 0xff) != scm_op_instrument_entry)
+    {
+      jit_movi (T0, (intptr_t) vcode);
+      emit_store_ip (j, T0);
+      jit_ret ();
+    }
+  else
+    {
+      struct scm_jit_function_data *data;
+      data = (struct scm_jit_function_data *) (vcode + (int32_t)(vcode[1]));
+
+      if (data->mcode)
+        {
+          jit_patch_abs (jit_jmpi (), data->mcode);
+        }
+      else
+        {
+          jit_node_t *no_mcode;
+
+          jit_ldi (T0, &data->mcode);
+          no_mcode = jit_beqi (T0, 0);
+          jit_jmpr (T0);
+          jit_patch (no_mcode);
+          jit_movi (T0, (intptr_t) vcode);
+          emit_store_ip (j, T0);
+          jit_ret ();
+        }
+    }
+}
+
+/* Load into DST the word stored in frame slot SLOT, addressed relative
+   to FP.  Slot N lives at FP - 8 * (N + 1).  */
+static void
+emit_fp_ref_scm (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t fp, uint32_t slot)
+{
+  /* Widen SLOT to a signed pointer-sized integer before negating:
+     "-8 * (slot + 1)" would be evaluated in unsigned int and turn into
+     a huge positive offset when widened on 64-bit targets.  */
+  jit_ldxi (dst, fp, -8 * ((intptr_t) slot + 1));
+}
+
+/* Store VAL into frame slot SLOT, addressed relative to FP.  Slot N
+   lives at FP - 8 * (N + 1).  */
+static void
+emit_fp_set_scm (scm_jit_state *j, jit_gpr_t fp, uint32_t slot, jit_gpr_t val)
+{
+  /* Widen SLOT to a signed pointer-sized integer before negating:
+     "-8 * (slot + 1)" would be evaluated in unsigned int and turn into
+     a huge positive offset when widened on 64-bit targets.  */
+  jit_stxi (-8 * ((intptr_t) slot + 1), fp, val);
+}
+
+static void
+emit_sp_ref_scm (scm_jit_state *j, jit_gpr_t dst, uint32_t slot)
+{
+  if (slot == 0)
+    jit_ldr (dst, SP);
+  else
+    jit_ldxi (dst, SP, 8 * slot);
+}
+
+static void
+emit_sp_set_scm (scm_jit_state *j, uint32_t slot, jit_gpr_t val)
+{
+  if (slot == 0)
+    jit_str (SP, val);
+  else
+    jit_stxi (8 * slot, SP, val);
+}
+
+static void
+emit_mov (scm_jit_state *j, uint32_t dst, uint32_t src, jit_gpr_t t)
+{
+  /* FIXME: The compiler currently emits "push" for SCM, F64, U64,
+     and S64 variables.  However SCM values are the usual case, and
+     on a 32-bit machine it might be cheaper to move a SCM than to
+     move a 64-bit number.  */
+  if (sizeof (void*) < sizeof (union scm_vm_stack_element))
+    {
+      uintptr_t src_offset = src * sizeof (union scm_vm_stack_element);
+      uintptr_t dst_offset = dst * sizeof (union scm_vm_stack_element);
+
+      jit_ldxi (t, SP, src_offset + sizeof (void*));
+      jit_stxi (dst_offset + sizeof (void*), SP, t);
+      if (src_offset == 0)
+        jit_ldr (t, SP);
+      else
+        jit_ldxi (t, SP, src_offset);
+      if (dst_offset == 0)
+        jit_str (SP, t);
+      else
+        jit_stxi (dst_offset, SP, t);
+    }
+  else
+    {
+      emit_sp_ref_scm (j, t, src);
+      emit_sp_set_scm (j, dst, t);
+    }
+}
+
+/* If the thread's trace_level is nonzero, store the current IP, call
+   the hook intrinsic F with THREAD, and reload SP.  T is a scratch
+   register and is clobbered.  */
+static void
+emit_run_hook (scm_jit_state *j, jit_gpr_t t, scm_t_thread_intrinsic f)
+{
+  jit_node_t *k;
+  /* Use the caller-supplied scratch register rather than hard-coding
+     T0.  The 32-bit load is spelled jit_ldxi_i because lightning only
+     provides jit_ldxi_ui on 64-bit targets; the result is only
+     compared against zero, so signedness does not matter.  */
+  jit_ldxi_i (t, THREAD, thread_offset_trace_level);
+  k = jit_beqi (t, 0);
+  emit_store_current_ip (j, t);
+  emit_call_r (j, f, THREAD);
+  emit_reload_sp (j);
+  jit_patch (k);
+}
+
+static jit_node_t*
+emit_branch_if_frame_locals_count_less_than (scm_jit_state *j, jit_gpr_t fp,
+                                             jit_gpr_t t, uint32_t nlocals)
+{
+  jit_subr (t, fp, SP);
+  return jit_blti (t, nlocals * sizeof (union scm_vm_stack_element));
+}
+
+static jit_node_t*
+emit_branch_if_frame_locals_count_eq (scm_jit_state *j, jit_gpr_t fp,
+                                      jit_gpr_t t, uint32_t nlocals)
+{
+  jit_subr (t, fp, SP);
+  return jit_beqi (t, nlocals * sizeof (union scm_vm_stack_element));
+}
+
+static jit_node_t*
+emit_branch_if_frame_locals_count_greater_than (scm_jit_state *j, jit_gpr_t fp,
+                                                jit_gpr_t t, uint32_t nlocals)
+{
+  jit_subr (t, fp, SP);
+  return jit_bgti (t, nlocals * sizeof (union scm_vm_stack_element));
+}
+
+static void
+emit_load_fp_slot (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t fp, uint32_t 
slot)
+{
+  jit_subi (dst, fp, (slot + 1) * sizeof (union scm_vm_stack_element));
+}
+
+static jit_node_t *
+emit_branch_if_immediate (scm_jit_state *j, jit_gpr_t r)
+{
+  return jit_bmsi (r, 6);
+}
+
+static void
+emit_load_heap_object_word (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t r,
+                            uint32_t word)
+{
+  if (word == 0)
+    jit_ldr (dst, r);
+  else
+    jit_ldxi (dst, r, word * sizeof(SCM));
+}
+
+static void
+emit_load_heap_object_tc (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t r,
+                          scm_t_bits mask)
+{
+  emit_load_heap_object_word (j, dst, r, 0);
+  jit_andi (dst, dst, mask);
+}
+
+static jit_node_t *
+emit_branch_if_heap_object_has_tc (scm_jit_state *j, jit_gpr_t r, jit_gpr_t t,
+                                   scm_t_bits mask, scm_t_bits tc)
+{
+  emit_load_heap_object_tc (j, t, r, mask);
+  return jit_beqi (t, tc);
+}
+
+static jit_node_t *
+emit_branch_if_heap_object_not_tc (scm_jit_state *j, jit_gpr_t r, jit_gpr_t t,
+                                   scm_t_bits mask, scm_t_bits tc)
+{
+  emit_load_heap_object_tc (j, t, r, mask);
+  return jit_bnei (t, tc);
+}
+
+static jit_node_t *
+emit_branch_if_heap_object_not_tc7 (scm_jit_state *j, jit_gpr_t r, jit_gpr_t t,
+                                    scm_t_bits tc7)
+{
+  return emit_branch_if_heap_object_not_tc (j, r, t, 0x7f, tc7);
+}
+
+static void
+emit_entry_trampoline (scm_jit_state *j)
+{
+  jit_node_t *thread, *ip;
+  jit_prolog ();
+  thread = jit_arg ();
+  ip = jit_arg ();
+  /* Ensure that callee-saved registers are used and thus saved by
+     lightning in the prolog.  */
+  jit_xorr (JIT_V0, JIT_V0, JIT_V0);
+  jit_xorr (JIT_V1, JIT_V1, JIT_V1);
+  jit_xorr (JIT_V2, JIT_V2, JIT_V2);
+  /* Load our reserved registers: THREAD and SP.  */
+  jit_getarg (THREAD, thread);
+  emit_reload_sp (j);
+  /* Call the mcode!  */
+  jit_getarg (JIT_R0, ip);
+  jit_callr (JIT_R0);
+  /* When mcode returns, interpreter should continue with vp->ip.  */
+  jit_ret ();
+}
+
+static void
+emit_free_variable_ref (scm_jit_state *j, jit_gpr_t dst, jit_gpr_t prog,
+                        size_t n)
+{
+  emit_load_heap_object_word (j, dst, prog,
+                              n + program_word_offset_free_variable);
+}
+
+/* Use when you know that the u64 value will be within the size_t range,
+   for example when it's ensured by the compiler.  */
+static void
+emit_sp_ref_sz (scm_jit_state *j, jit_gpr_t dst, uint32_t src)
+{
+  if (BIGENDIAN && sizeof (size_t) == 4)
+    jit_ldxi (dst, SP, src * 8 + 4);
+  else
+    jit_ldxi (dst, SP, src * 8);
+}
+
+static void
+emit_sp_set_sz (scm_jit_state *j, uint32_t dst, jit_gpr_t src)
+{
+  size_t offset = dst * 8;
+
+  if (sizeof (size_t) == 4)
+    {
+      size_t lo, hi;
+      if (BIGENDIAN)
+        lo = offset + 4, hi = offset;
+      else
+        lo = offset, hi = offset + 4;
+      
+      jit_stxi (lo, SP, src);
+      /* Set high word to 0.  Clobber src.  */
+      jit_xorr (src, src, src);
+      jit_stxi (hi, SP, src);
+    }
+  else
+    jit_stxi (offset, SP, src);
+}
+
+#if SIZEOF_UINTPTR_T >= 8
+static void
+emit_sp_ref_u64 (scm_jit_state *j, jit_gpr_t dst, uint32_t src)
+{
+  size_t offset = src * 8;
+
+  if (offset == 0)
+    jit_ldr (dst, SP);
+  else
+    jit_ldxi (dst, SP, offset);
+}
+
+static void
+emit_sp_set_u64 (scm_jit_state *j, uint32_t dst, jit_gpr_t src)
+{
+  size_t offset = dst * 8;
+
+  if (dst == 0)
+    jit_str (SP, src);
+  else
+    jit_stxi (offset, SP, src);
+}
+
+static void
+emit_sp_ref_s64 (scm_jit_state *j, jit_gpr_t dst, uint32_t src)
+{
+  emit_sp_ref_u64 (j, dst, src);
+}
+
+static void
+emit_sp_set_s64 (scm_jit_state *j, uint32_t dst, jit_gpr_t src)
+{
+  emit_sp_set_u64 (j, dst, src);
+}
+
+static void
+emit_sp_ref_ptr (scm_jit_state *j, jit_gpr_t dst, uint32_t src)
+{
+  emit_sp_ref_u64 (j, dst, src);
+}
+#else
+static void
+emit_sp_ref_u64 (scm_jit_state *j, jit_gpr_t dst_lo, jit_gpr_t dst_hi,
+                 uint32_t src)
+{
+  size_t offset = src * 8;
+  jit_gpr_t first, second;
+
+#if BIGENDIAN
+  first = dst_hi, second = dst_lo;
+#else
+  first = dst_lo, second = dst_hi;
+#endif
+
+  if (offset == 0)
+    jit_ldr (first, SP);
+  else
+    jit_ldxi (first, SP, offset);
+  jit_ldxi (second, SP, offset + 4);
+}
+
+static void
+emit_sp_set_u64 (scm_jit_state *j, uint32_t dst, jit_gpr_t lo, jit_gpr_t hi)
+{
+  size_t offset = dst * 8;
+  jit_gpr_t first, second;
+
+#if BIGENDIAN
+  first = hi, second = lo;
+#else
+  first = lo, second = hi;
+#endif
+
+  if (offset == 0)
+    jit_str (SP, first);
+  else
+    jit_stxi (offset, SP, first);
+  jit_stxi (offset + 4, SP, second);
+}
+
+static void
+emit_sp_ref_s64 (scm_jit_state *j, jit_gpr_t dst_lo, jit_gpr_t dst_hi,
+                 uint32_t src)
+{
+  emit_sp_ref_u64 (j, dst_lo, dst_hi, src);
+}
+
+static void
+emit_sp_set_s64 (scm_jit_state *j, uint32_t dst, jit_gpr_t lo, jit_gpr_t hi)
+{
+  emit_sp_set_u64 (j, dst, lo, hi);
+}
+
+/* Load one machine word from the start of SP slot SRC into DST.
+   NOTE(review): the word at offset 0 is the low half of a 64-bit value
+   only on little-endian targets (emit_sp_ref_u64 swaps the halves
+   under BIGENDIAN) -- confirm what big-endian callers expect.  */
+static void
+emit_sp_ref_u64_lower_half (scm_jit_state *j, jit_gpr_t dst, uint32_t src)
+{
+  size_t offset = src * 8;
+
+  /* Use lightning's jit_ldr/jit_ldxi, as the other slot accessors
+     do.  */
+  if (offset == 0)
+    jit_ldr (dst, SP);
+  else
+    jit_ldxi (dst, SP, offset);
+}
+
+static void
+emit_sp_ref_ptr (scm_jit_state *j, jit_gpr_t dst, uint32_t src)
+{
+  emit_sp_ref_u64_lower_half (j, dst, src);
+}
+#endif
+
+static void
+emit_sp_ref_f64 (scm_jit_state *j, jit_gpr_t dst, uint32_t src)
+{
+  size_t offset = src * 8;
+
+  if (offset == 0)
+    jit_ldr_d (dst, SP);
+  else
+    jit_ldxi_d (dst, SP, offset);
+}
+
+static void
+emit_sp_set_f64 (scm_jit_state *j, uint32_t dst, jit_gpr_t src)
+{
+  size_t offset = dst * 8;
+
+  if (offset == 0)
+    jit_str_d (SP, src);
+  else
+    jit_stxi_d (offset, SP, src);
+}
+
+static void
+add_inter_instruction_patch (scm_jit_state *j, jit_node_t *label,
+                             const uint32_t *target)
+{
+  abort ();
+}
+
+
+
 static void
 bad_instruction (scm_jit_state *j)
 {
@@ -51,773 +772,2284 @@ compile_halt (scm_jit_state *j)
 }
 
 static void
-compile_call (scm_jit_state *j, uint32_t a, uint32_t b)
+compile_call (scm_jit_state *j, uint32_t proc, uint32_t nlocals)
 {
+  /* 2 = size of call inst */
+  jit_node_t *mcont = emit_push_frame (j, proc, nlocals, j->ip + 2);
+
+  emit_indirect_tail_call (j);
+
+  jit_patch (mcont);
+
+  j->frame_size = -1;
 }
 
 static void
-compile_call_label (scm_jit_state *j, uint32_t a, uint32_t b, const uint32_t 
*vcode)
+compile_call_label (scm_jit_state *j, uint32_t proc, uint32_t nlocals, const 
uint32_t *vcode)
 {
+  /* 3 = size of call-label inst */
+  jit_node_t *mcont = emit_push_frame (j, proc, nlocals, j->ip + 3);
+
+  emit_direct_tail_call (j, vcode);
+  /* Resolve the machine return address stored by emit_push_frame.  */
+  jit_patch (mcont);
+
+  j->frame_size = -1;
 }
 
 static void
 compile_tail_call (scm_jit_state *j)
 {
+  emit_indirect_tail_call (j);
+
+  j->frame_size = -1;
 }
 
 static void
 compile_tail_call_label (scm_jit_state *j, const uint32_t *vcode)
 {
+  emit_direct_tail_call (j, vcode);
+
+  j->frame_size = -1;
 }
 
 static void
 compile_instrument_entry (scm_jit_state *j, void *data)
 {
+  if (j->hooks_enabled)
+    emit_run_hook (j, T0, scm_vm_intrinsics.invoke_apply_hook);
 }
 
 static void
 compile_instrument_loop (scm_jit_state *j, void *data)
 {
+  /* Nothing to do.  */
 }
 
 static void
-compile_receive (scm_jit_state *j, uint16_t dst, uint16_t a, uint32_t b)
+compile_receive (scm_jit_state *j, uint16_t dst, uint16_t proc, uint32_t 
nlocals)
 {
+  jit_gpr_t fp = T0, t = T1;
+  jit_node_t *k;
+
+  emit_load_fp (j, fp);
+  k = emit_branch_if_frame_locals_count_greater_than (j, fp, t, proc);
+  emit_store_current_ip (j, T0);
+  emit_call (j, scm_vm_intrinsics.error_no_values);
+  jit_patch (k);
+  emit_fp_ref_scm (j, t, fp, proc);
+  emit_fp_set_scm (j, fp, dst, t);
+  emit_reset_frame (j, fp, nlocals);
+
+  j->frame_size = nlocals;
 }
 
 static void
-compile_receive_values (scm_jit_state *j, uint32_t a, uint8_t b, uint32_t c)
+compile_receive_values (scm_jit_state *j, uint32_t proc, uint8_t allow_extra,
+                        uint32_t nvalues)
 {
+  jit_gpr_t fp = T0, t = T1;
+
+  emit_load_fp (j, fp);
+  if (allow_extra)
+    {
+      jit_node_t *k;
+      k = emit_branch_if_frame_locals_count_greater_than (j, fp, t,
+                                                          proc + nvalues - 1);
+      emit_store_current_ip (j, T0);
+      emit_call (j, scm_vm_intrinsics.error_not_enough_values);
+      jit_patch (k);
+    }
+  else
+    {
+      jit_node_t *k;
+      k = emit_branch_if_frame_locals_count_eq (j, fp, t, proc + nvalues);
+      emit_store_current_ip (j, T0);
+      emit_call_i (j, scm_vm_intrinsics.error_wrong_number_of_values, nvalues);
+      jit_patch (k);
+
+      j->frame_size = proc + nvalues;
+    }
 }
 
 static void
 compile_shuffle_down (scm_jit_state *j, uint16_t from, uint16_t to)
 {
+  jit_gpr_t fp = T0, walk = T0, t = T1;
+  size_t offset = (from - to) * sizeof (union scm_vm_stack_element);
+  jit_node_t *done, *head, *back;
+
+  emit_load_fp (j, fp);
+  emit_load_fp_slot (j, walk, fp, from);
+  done = jit_bltr (walk, SP);
+  head = jit_label ();
+  jit_ldr (t, walk);
+  jit_stxi (offset, walk, t);
+  jit_subi (walk, walk, sizeof (union scm_vm_stack_element));
+  back = jit_bltr (walk, SP);
+  jit_patch_at (back, head);
+  jit_patch (done);
+  jit_addi (SP, SP, offset);
+  emit_store_sp (j);
+
+  if (j->frame_size >= 0)
+    j->frame_size -= (from - to);
 }
 
 static void
 compile_return_values (scm_jit_state *j)
 {
+  jit_gpr_t old_fp = T0, offset = T1, new_fp = T1, ra = T1;
+  jit_node_t *interp;
+  if (j->hooks_enabled)
+    emit_run_hook (j, T0, scm_vm_intrinsics.invoke_return_hook);
+  /* Pop the frame: fp += (saved previous frame size + frame overhead)
+     slots.  OFFSET, NEW_FP and RA all share T1.  */
+  emit_load_fp (j, old_fp);
+  emit_load_prev_frame_size (j, offset, old_fp);
+  jit_addi (offset, offset, frame_overhead_slots);
+  jit_lshi (offset, offset, 3); /* Multiply by sizeof (scm_vm_stack_element) */
+  jit_addr (new_fp, old_fp, offset);
+  emit_store_fp (j, new_fp);
+  /* Jump to the frame's machine return address if it is nonzero.  */
+  emit_load_mra (j, ra, old_fp);
+  interp = jit_beqi (ra, 0);
+  jit_jmpr (ra);
+  /* Otherwise store the virtual return address as the ip and return.  */
+  jit_patch (interp);
+  emit_load_vra (j, ra, old_fp);
+  emit_store_ip (j, ra);
+  jit_ret ();
 }
 
 static void
 compile_subr_call (scm_jit_state *j, uint32_t idx)
 {
+  jit_gpr_t fp = T0, t = T1, ret = T2;
+  void *subr;
+  uint32_t i;
+  jit_node_t *immediate, *not_values, *k;
+
+  if (j->frame_size < 0)
+    abort ();
+
+  subr = scm_subr_function_by_index (idx);
+  emit_store_current_ip (j, t);
+  emit_load_fp (j, fp);
+  jit_prepare ();
+  for (i = 0; i < j->frame_size; i++)
+    {
+      emit_fp_ref_scm (j, t, fp, i);
+      jit_pushargr (t);
+    }
+  jit_finishi (subr);
+  jit_retval (ret);
+
+  emit_load_fp (j, fp);
+
+  immediate = emit_branch_if_immediate (j, ret);
+  not_values = emit_branch_if_heap_object_not_tc7 (j, ret, t, scm_tc7_values);
+  emit_call_r_r (j, scm_vm_intrinsics.unpack_values_object, THREAD, ret);
+  emit_reload_sp (j);
+  k = jit_jmpi ();
+
+  jit_patch (immediate);
+  jit_patch (not_values);
+  emit_subtract_stack_slots (j, SP, fp, 1);
+  emit_store_sp (j);
+  jit_str (SP, ret);
+  jit_patch (k);
 }
 
 static void
-compile_foreign_call (scm_jit_state *j, uint16_t a, uint16_t b)
+compile_foreign_call (scm_jit_state *j, uint16_t cif_idx, uint16_t ptr_idx)
 {
+  emit_store_current_ip (j, T0);
+  emit_load_fp (j, T0);
+  emit_fp_ref_scm (j, T0, T0, 0);
+  emit_free_variable_ref (j, T1, T0, cif_idx);
+  emit_free_variable_ref (j, T2, T0, ptr_idx);
+
+  /* FIXME: Inline the foreign call.  */
+  emit_call_r_r_r (j, scm_vm_intrinsics.foreign_call, THREAD, T1, T2);
+  emit_reload_sp (j);
+
+  j->frame_size = 2; /* Return value and errno.  */
 }
 
 static void
-compile_continuation_call (scm_jit_state *j, uint32_t a)
+compile_continuation_call (scm_jit_state *j, uint32_t contregs_idx)
 {
+  emit_store_current_ip (j, T0);
+  emit_load_fp (j, T0);
+  emit_fp_ref_scm (j, T0, T0, 0);
+  emit_free_variable_ref (j, T0, T0, contregs_idx);
+  emit_call_r_r (j, scm_vm_intrinsics.reinstate_continuation_x, THREAD, T0);
+  /* Does not fall through.  */
+
+  j->frame_size = -1;
 }
 
 static void
-compile_compose_continuation (scm_jit_state *j, uint32_t a)
+compile_compose_continuation (scm_jit_state *j, uint32_t cont_idx)
 {
+  jit_node_t *interp;
+
+  emit_store_current_ip (j, T0);
+  emit_load_fp (j, T0);
+  emit_fp_ref_scm (j, T0, T0, 0);
+  emit_free_variable_ref (j, T0, T0, cont_idx);
+  emit_call_r_r (j, scm_vm_intrinsics.compose_continuation, THREAD, T0);
+  jit_retval (T0);
+  emit_reload_sp (j);
+  interp = jit_bnei (T0, 0);
+  jit_jmpr (T0);
+
+  jit_patch (interp);
+  jit_ret ();
+
+  j->frame_size = -1;
 }
 
 static void
 compile_capture_continuation (scm_jit_state *j, uint32_t dst)
 {
+  emit_store_current_ip (j, T0);
+  emit_call_r (j, scm_vm_intrinsics.capture_continuation, THREAD);
+  jit_retval (T0);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
 compile_abort (scm_jit_state *j)
 {
+  jit_node_t *k, *interp;
+
+  jit_movi (T0, (intptr_t) (j->ip + 1));
+  emit_store_ip (j, T0);
+  k = jit_movi (T0, 0);
+  emit_call_r_r (j, scm_vm_intrinsics.abort_to_prompt, THREAD, T0);
+  jit_retval (T3_PRESERVED);
+  
+  if (j->hooks_enabled)
+    emit_run_hook (j, T0, scm_vm_intrinsics.invoke_abort_hook);
+
+  interp = jit_bnei (T3_PRESERVED, 0);
+  emit_reload_sp (j);
+  jit_jmpr (T3_PRESERVED);
+
+  jit_patch (interp);
+  jit_ret ();
+
+  jit_patch (k);
+
+  j->frame_size = -1;
 }
 
 static void
-compile_builtin_ref (scm_jit_state *j, uint16_t dst, uint16_t a)
+compile_builtin_ref (scm_jit_state *j, uint16_t dst, uint16_t idx)
 {
+  SCM builtin = scm_vm_builtin_ref (idx);
+
+  jit_movi (T0, SCM_UNPACK (builtin));
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_throw (scm_jit_state *j, uint16_t a, uint16_t b)
+compile_throw (scm_jit_state *j, uint16_t key, uint16_t args)
 {
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, key);
+  emit_sp_ref_scm (j, T1, args);
+  emit_call_r_r (j, scm_vm_intrinsics.throw_, T0, T1);
+  /* throw_ does not return.  */
 }
 
 static void
-compile_throw_value (scm_jit_state *j, uint32_t a, const void *data)
+compile_throw_value (scm_jit_state *j, uint32_t val,
+                     const void *key_subr_and_message)
 {
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, val);
+  emit_call_r_i (j, scm_vm_intrinsics.throw_with_value, T0,
+                 (intptr_t) key_subr_and_message);
+  /* throw_with_value does not return.  */
 }
 
 static void
-compile_throw_value_and_data (scm_jit_state *j, uint32_t a, const void *data)
+compile_throw_value_and_data (scm_jit_state *j, uint32_t val,
+                              const void *key_subr_and_message)
 {
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, val);
+  emit_call_r_i (j, scm_vm_intrinsics.throw_with_value_and_data, T0,
+                 (intptr_t) key_subr_and_message);
+  /* throw_with_value_and_data does not return.  */
 }
 
 static void
-compile_assert_nargs_ee (scm_jit_state *j, uint32_t a)
+compile_assert_nargs_ee (scm_jit_state *j, uint32_t nlocals)
 {
+  jit_node_t *k;
+  jit_gpr_t fp = T0, t = T1;
+
+  emit_load_fp (j, fp);
+  k = emit_branch_if_frame_locals_count_eq (j, fp, t, nlocals);
+  emit_store_current_ip (j, t);
+  emit_call_r (j, scm_vm_intrinsics.error_wrong_num_args, THREAD);
+  jit_patch (k);
+
+  j->frame_size = nlocals;
 }
 
 static void
-compile_assert_nargs_ge (scm_jit_state *j, uint32_t a)
+compile_assert_nargs_ge (scm_jit_state *j, uint32_t nlocals)
 {
+  if (nlocals > 0)
+    {
+      jit_gpr_t fp = T0, t = T1;
+      jit_node_t *k;
+      emit_load_fp (j, fp);
+      k = emit_branch_if_frame_locals_count_greater_than (j, fp, t, nlocals-1);
+      emit_store_current_ip (j, t);
+      emit_call_r (j, scm_vm_intrinsics.error_wrong_num_args, THREAD);
+      jit_patch (k);
+    }
 }
 
 static void
-compile_assert_nargs_le (scm_jit_state *j, uint32_t a)
+compile_assert_nargs_le (scm_jit_state *j, uint32_t nlocals)
 {
+  jit_node_t *k;
+  jit_gpr_t fp = T0, t = T1;
+
+  emit_load_fp (j, fp);
+  k = emit_branch_if_frame_locals_count_less_than (j, fp, t, nlocals + 1);
+  emit_store_current_ip (j, t);
+  emit_call_r (j, scm_vm_intrinsics.error_wrong_num_args, THREAD);
+  jit_patch (k);
 }
 
 static void
-compile_alloc_frame (scm_jit_state *j, uint32_t a)
+compile_alloc_frame (scm_jit_state *j, uint32_t nlocals)
 {
+  jit_gpr_t fp = T0, t = T1;
+  emit_load_fp (j, fp);
+  if (j->frame_size < 0)
+    jit_movr (T3_PRESERVED, SP);
+  emit_alloc_frame (j, fp, t, nlocals);
+
+  if (j->frame_size >= 0)
+    {
+      int32_t slots = nlocals - j->frame_size;
+
+      if (slots > 0)
+        {
+          jit_movi (T0, SCM_UNPACK (SCM_UNDEFINED));
+          while (slots-- > 0)
+            emit_sp_set_scm (j, slots, T0);
+        }
+    }
+  else
+    {
+      jit_node_t *head, *k, *back;
+      jit_movi (T0, SCM_UNPACK (SCM_UNDEFINED));
+      k = jit_bler (T3_PRESERVED, SP);
+      head = jit_label ();
+      jit_str (T3_PRESERVED, T0);
+      jit_subi (T3_PRESERVED, T3_PRESERVED, sizeof (union 
scm_vm_stack_element));
+      back = jit_bner (T3_PRESERVED, SP);
+      jit_patch_at (back, head);
+      jit_patch (k);
+    }
+
+  j->frame_size = nlocals;
 }
 
 static void
-compile_reset_frame (scm_jit_state *j, uint32_t a)
+compile_reset_frame (scm_jit_state *j, uint32_t nlocals)
 {
+  jit_gpr_t fp = T0;
+  emit_load_fp (j, fp);
+  emit_reset_frame (j, fp, nlocals);
+
+  j->frame_size = nlocals;
 }
 
 static void
-compile_push (scm_jit_state *j, uint32_t a)
+compile_push (scm_jit_state *j, uint32_t src)
 {
+  jit_gpr_t fp = T0, t = T1;
+  emit_load_fp (j, fp);
+  jit_subi (SP, SP, sizeof (union scm_vm_stack_element));
+  emit_alloc_frame_for_sp (j, fp, t);
+  emit_mov (j, 0, src + 1, t);
+
+  if (j->frame_size >= 0)
+    j->frame_size++;
 }
 
 static void
 compile_pop (scm_jit_state *j, uint32_t dst)
 {
+  emit_mov (j, dst + 1, 0, T0);
+  jit_addi (SP, SP, sizeof (union scm_vm_stack_element));
+  emit_store_sp (j);
+
+  if (j->frame_size >= 0)
+    j->frame_size--;
 }
 
 static void
-compile_drop (scm_jit_state *j, uint32_t a)
+compile_drop (scm_jit_state *j, uint32_t nvalues)
 {
+  jit_addi (SP, SP, nvalues * sizeof (union scm_vm_stack_element));
+  emit_store_sp (j);
+
+  if (j->frame_size >= 0)
+    j->frame_size -= nvalues;
 }
 
 static void
-compile_assert_nargs_ee_locals (scm_jit_state *j, uint16_t a, uint16_t b)
+compile_assert_nargs_ee_locals (scm_jit_state *j, uint16_t expected,
+                                uint16_t nlocals)
 {
+  compile_assert_nargs_ee (j, expected);
+  compile_alloc_frame (j, expected + nlocals);
 }
 
 static void
 compile_expand_apply_argument (scm_jit_state *j)
 {
+  emit_store_current_ip (j, T0);
+  emit_call_r (j, scm_vm_intrinsics.expand_apply_argument, THREAD);
+  emit_reload_sp (j);
+
+  j->frame_size = -1;
 }
 
 static void
-compile_bind_kwargs (scm_jit_state *j, uint32_t a, uint8_t b, uint32_t c, 
uint32_t d, const void *data)
+compile_bind_kwargs (scm_jit_state *j, uint32_t nreq, uint8_t flags,
+                     uint32_t nreq_and_opt, uint32_t ntotal, const void *kw)
 {
+  /* Emit keyword-argument binding.  Bit 0 of FLAGS is allow-other-keys
+     and bit 1 is has-rest.  The code calls compute_kwargs_npositional,
+     then bind_kwargs; with a rest argument it also calls cons_rest and
+     stores the result in FP-relative slot NREQ_AND_OPT.  Finally the
+     frame is reset to NTOTAL.  */
+  uint8_t allow_other_keys = flags & 0x1, has_rest = flags & 0x2;
+  /* npositional and fp both alias T1; their uses do not overlap.  */
+  jit_gpr_t t = T0, npositional = T1, fp = T1;
+
+  emit_store_current_ip (j, t);
+
+  /* Second and third arguments: NREQ and the number of optionals.  */
+  jit_prepare ();
+  jit_pushargr (THREAD);
+  jit_pushargi (nreq);
+  jit_pushargi (nreq_and_opt - nreq);
+  jit_finishi (scm_vm_intrinsics.compute_kwargs_npositional);
+  jit_retval_i (npositional);
+
+  /* Note the fifth argument is the negation of has_rest.  */
+  jit_prepare ();
+  jit_pushargr (THREAD);
+  jit_pushargr (npositional);
+  jit_pushargi (ntotal);
+  jit_pushargi ((intptr_t) kw);
+  jit_pushargi (!has_rest);
+  jit_pushargi (allow_other_keys);
+  jit_finishi (scm_vm_intrinsics.bind_kwargs);
+  
+  emit_reload_sp (j);
+
+  if (has_rest)
+    {
+      emit_call_r_i (j, scm_vm_intrinsics.cons_rest, THREAD, ntotal);
+      jit_retval (t);
+      emit_load_fp (j, fp);
+      emit_fp_set_scm (j, fp, nreq_and_opt, t);
+    }
+  else
+    emit_load_fp (j, fp);
+
+  /* T1 is fp here.  */
+  emit_reset_frame (j, T1, ntotal);
+  j->frame_size = ntotal;
 }
 
 static void
 compile_bind_rest (scm_jit_state *j, uint32_t dst)
 {
+  /* Bind a rest argument in local DST.  If the frame has no more than
+     DST locals, the frame is grown to DST + 1 and the new slot 0 is set
+     to the empty list.  Otherwise the cons_rest intrinsic builds the
+     list, the frame is reset to DST + 1 locals, and the list is stored
+     in slot 0.  */
+  jit_node_t *k, *cons;
+  jit_gpr_t fp = T0, t = T1;
+
+  emit_load_fp (j, fp);
+  cons = emit_branch_if_frame_locals_count_greater_than (j, fp, t, dst);
+
+  compile_alloc_frame (j, dst + 1);
+  jit_movi (t, SCM_UNPACK (SCM_EOL));
+  emit_sp_set_scm (j, 0, t);
+  k = jit_jmpi ();
+
+  jit_patch (cons);
+  emit_store_current_ip (j, t);
+  emit_call_r_i (j, scm_vm_intrinsics.cons_rest, THREAD, dst);
+  jit_retval (t);
+  /* Reset the frame before storing: SP-relative slot 0 only names local
+     DST once SP has been moved.  Storing first would write the list
+     into the last surplus argument slot, which the reset then discards.
+     NOTE(review): relies on compile_reset_frame leaving T1 intact; it
+     only names T0 itself, but confirm emit_reset_frame uses no other
+     scratch register.  */
+  compile_reset_frame (j, dst + 1);
+  emit_sp_set_scm (j, 0, t);
+
+  jit_patch (k);
 }
 
 static void
-compile_allocate_words (scm_jit_state *j, uint16_t dst, uint16_t a)
+compile_allocate_words (scm_jit_state *j, uint16_t dst, uint16_t nwords)
 {
+  /* Call the allocate_words intrinsic with THREAD and the size_t read
+     from slot NWORDS; store the returned object in slot DST.  */
+  jit_gpr_t t = T0;
+
+  emit_store_current_ip (j, t);
+  emit_sp_ref_sz (j, t, nwords);
+  emit_call_r_r (j, scm_vm_intrinsics.allocate_words, THREAD, t);
+  jit_retval (t);
+  emit_sp_set_scm (j, dst, t);
 }
 
 static void
-compile_allocate_words_immediate (scm_jit_state *j, uint16_t dst, uint16_t a)
+compile_allocate_words_immediate (scm_jit_state *j, uint16_t dst, uint16_t 
nwords)
 {
+  /* Like compile_allocate_words, but NWORDS is the literal word count
+     rather than a slot index; it is materialized in T0 for the call.  */
+  jit_gpr_t t = T0;
+
+  emit_store_current_ip (j, t);
+  jit_movi (t, nwords);
+  emit_call_r_r (j, scm_vm_intrinsics.allocate_words, THREAD, t);
+  jit_retval (t);
+  emit_sp_set_scm (j, dst, t);
 }
 
 static void
-compile_scm_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_scm_ref (scm_jit_state *j, uint8_t dst, uint8_t obj, uint8_t idx)
 {
+  /* DST = word IDX of OBJ, stored as an SCM.  IDX names a slot holding
+     a size_t word index, which is scaled to a byte offset by shifting
+     left by log2_sizeof_uintptr_t.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_sz (j, T1, idx);
+  jit_lshi (T1, T1, log2_sizeof_uintptr_t);
+  jit_ldxr (T0, T0, T1);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_scm_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_scm_set (scm_jit_state *j, uint8_t obj, uint8_t idx, uint8_t val)
 {
+  /* Store the SCM in slot VAL into word IDX of OBJ.  IDX names a slot
+     holding a size_t word index, scaled to bytes before the store.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_sz (j, T1, idx);
+  emit_sp_ref_scm (j, T2, val);
+  jit_lshi (T1, T1, log2_sizeof_uintptr_t);
+  jit_stxr (T0, T1, T2);
 }
 
 static void
-compile_scm_ref_tag (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_scm_ref_tag (scm_jit_state *j, uint8_t dst, uint8_t obj, uint8_t tag)
 {
+  /* Load the first word of OBJ, subtract the literal TAG from it, and
+     store the result in slot DST as an SCM.  */
+  emit_sp_ref_scm (j, T0, obj);
+  jit_ldr (T0, T0);
+  jit_subi (T0, T0, tag);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_scm_set_tag (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_scm_set_tag (scm_jit_state *j, uint8_t obj, uint8_t tag, uint8_t val)
 {
+  /* Add the literal TAG to the SCM in slot VAL and store the sum as the
+     first word of OBJ.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_scm (j, T1, val);
+  jit_addi (T1, T1, tag);
+  jit_str (T0, T1);
 }
 
 static void
-compile_scm_ref_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_scm_ref_immediate (scm_jit_state *j, uint8_t dst, uint8_t obj, uint8_t 
idx)
 {
+  /* DST = word IDX of OBJ as an SCM, with IDX a literal word index
+     turned into a constant byte offset.  */
+  emit_sp_ref_scm (j, T0, obj);
+  jit_ldxi (T0, T0, idx * sizeof (SCM));
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_scm_set_immediate (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_scm_set_immediate (scm_jit_state *j, uint8_t obj, uint8_t idx, uint8_t 
val)
 {
+  /* Store the SCM in slot VAL into word IDX of OBJ, with IDX a literal
+     word index turned into a constant byte offset.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_scm (j, T1, val);
+  jit_stxi (idx * sizeof (SCM), T0, T1);
 }
 
 static void
-compile_word_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_word_ref (scm_jit_state *j, uint8_t dst, uint8_t obj, uint8_t idx)
 {
+  /* Same load as compile_scm_ref, but the result is stored in slot DST
+     as a raw size_t rather than as an SCM.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_sz (j, T1, idx);
+  jit_lshi (T1, T1, log2_sizeof_uintptr_t);
+  jit_ldxr (T0, T0, T1);
+  emit_sp_set_sz (j, dst, T0);
 }
 
 static void
-compile_word_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_word_set (scm_jit_state *j, uint8_t obj, uint8_t idx, uint8_t val)
 {
+  /* Same store as compile_scm_set, but the value in slot VAL is read as
+     a raw size_t rather than as an SCM.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_sz (j, T1, idx);
+  emit_sp_ref_sz (j, T2, val);
+  jit_lshi (T1, T1, log2_sizeof_uintptr_t);
+  jit_stxr (T0, T1, T2);
 }
 
 static void
-compile_word_ref_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t 
b)
+compile_word_ref_immediate (scm_jit_state *j, uint8_t dst, uint8_t obj, 
uint8_t idx)
 {
+  /* DST = word IDX of OBJ as a raw size_t, with IDX a literal word
+     index turned into a constant byte offset.  */
+  emit_sp_ref_scm (j, T0, obj);
+  jit_ldxi (T0, T0, idx * sizeof (SCM));
+  emit_sp_set_sz (j, dst, T0);
 }
 
 static void
-compile_word_set_immediate (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_word_set_immediate (scm_jit_state *j, uint8_t obj, uint8_t idx, 
uint8_t val)
 {
+  /* Store the raw size_t in slot VAL into word IDX of OBJ, with IDX a
+     literal word index turned into a constant byte offset.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_sz (j, T1, val);
+  jit_stxi (idx * sizeof (SCM), T0, T1);
 }
 
 static void
-compile_pointer_ref_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, 
uint8_t b)
+compile_pointer_ref_immediate (scm_jit_state *j, uint8_t dst, uint8_t obj, 
uint8_t idx)
 {
+  /* Load word IDX (a literal index) of OBJ and store it in slot DST.
+     The emitted sequence is the same as compile_scm_ref_immediate.  */
+  emit_sp_ref_scm (j, T0, obj);
+  jit_ldxi (T0, T0, idx * sizeof (SCM));
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_pointer_set_immediate (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t 
c)
+compile_pointer_set_immediate (scm_jit_state *j, uint8_t obj, uint8_t idx, 
uint8_t val)
 {
+  /* Store the word in slot VAL into word IDX (a literal index) of OBJ.
+     The emitted sequence is the same as compile_scm_set_immediate.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_scm (j, T1, val);
+  jit_stxi (idx * sizeof (SCM), T0, T1);
 }
 
 static void
-compile_tail_pointer_ref_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, 
uint8_t b)
+compile_tail_pointer_ref_immediate (scm_jit_state *j, uint8_t dst, uint8_t 
obj, uint8_t idx)
 {
+  /* DST = the address of word IDX inside OBJ.  Nothing is loaded from
+     the object; only the pointer arithmetic is emitted.  */
+  emit_sp_ref_scm (j, T0, obj);
+  jit_addi (T0, T0, idx * sizeof (SCM));
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_mov (scm_jit_state *j, uint16_t dst, uint16_t a)
+compile_mov (scm_jit_state *j, uint16_t dst, uint16_t src)
 {
+  /* Copy slot SRC to slot DST via emit_mov, passing T0 as its register
+     argument.  */
+  emit_mov (j, dst, src, T0);
 }
 
 static void
-compile_long_mov (scm_jit_state *j, uint32_t dst, uint32_t a)
+compile_long_mov (scm_jit_state *j, uint32_t dst, uint32_t src)
 {
+  /* Same as compile_mov, for the instruction form with 32-bit slot
+     operands.  */
+  emit_mov (j, dst, src, T0);
 }
 
 static void
-compile_long_fmov (scm_jit_state *j, uint32_t dst, uint32_t a)
+compile_long_fmov (scm_jit_state *j, uint32_t dst, uint32_t src)
 {
+  /* Copy between slots addressed relative to the frame pointer: load FP
+     into T0, read FP slot SRC into T1, and write it to FP slot DST.  */
+  jit_gpr_t fp = T0, t = T1;
+  emit_load_fp (j, fp);
+  emit_fp_ref_scm (j, t, fp, src);
+  emit_fp_set_scm (j, fp, dst, t);
 }
 
 static void
-compile_call_scm_from_scm_scm (scm_jit_state *j, uint8_t dst, uint8_t a, 
uint8_t b, uint32_t c)
+compile_call_scm_from_scm_scm (scm_jit_state *j, uint8_t dst, uint8_t a, 
uint8_t b, uint32_t idx)
 {
+  /* Call intrinsic number IDX with the SCM values in slots A and B and
+     store its SCM result in slot DST.  The intrinsic is found by
+     indexing scm_vm_intrinsics as an array of pointers.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  emit_sp_ref_scm (j, T1, b);
+  emit_call_r_r (j, intrinsic, T0, T1);
+  jit_retval (T0);
+  /* SP is reloaded after the call and before the SP-relative store.  */
+  emit_reload_sp (j);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_call_scm_from_scm_uimm (scm_jit_state *j, uint8_t dst, uint8_t a, 
uint8_t b, uint32_t c)
+compile_call_scm_from_scm_uimm (scm_jit_state *j, uint8_t dst, uint8_t a, 
uint8_t b, uint32_t idx)
 {
+  /* Call intrinsic number IDX with the SCM in slot A and the literal
+     value B (an immediate, not a slot index); store the SCM result in
+     slot DST.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  jit_prepare ();
+  jit_pushargr (T0);
+  jit_pushargi (b);
+  jit_finishi (intrinsic);
+  jit_retval (T0);
+  emit_reload_sp (j);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_call_scm_sz_u32 (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c, 
uint32_t d)
+compile_call_scm_sz_u32 (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c, 
uint32_t idx)
 {
+  /* Call intrinsic number IDX with three arguments: the SCM in slot A
+     and the values in slots B and C, both read with emit_sp_ref_sz.
+     The return value is not used.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  emit_sp_ref_sz (j, T1, b);
+  emit_sp_ref_sz (j, T2, c);
+  jit_prepare ();
+  jit_pushargr (T0);
+  jit_pushargr (T1);
+  jit_pushargr (T2);
+  jit_finishi (intrinsic);
+  emit_reload_sp (j);
 }
 
 static void
-compile_call_scm_from_scm (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t b)
+compile_call_scm_from_scm (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t idx)
 {
+  /* Call intrinsic number IDX with the SCM in slot A and store its SCM
+     result in slot DST.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  jit_prepare ();
+  jit_pushargr (T0);
+  jit_finishi (intrinsic);
+  jit_retval (T0);
+  emit_reload_sp (j);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_call_f64_from_scm (scm_jit_state *j, uint16_t dst, uint16_t a, uint32_t b)
+compile_call_f64_from_scm (scm_jit_state *j, uint16_t dst, uint16_t a, uint32_t idx)
 {
+  /* Call intrinsic number IDX with the SCM in slot A and store its
+     double result in slot DST.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  jit_prepare ();
+  jit_pushargr (T0);
+  jit_finishi (intrinsic);
+  /* The result is a double in a floating-point register, so it must be
+     fetched with jit_retval_d; plain jit_retval reads the integer
+     return register.  */
+  jit_retval_d (JIT_F0);
+  emit_reload_sp (j);
+  emit_sp_set_f64 (j, dst, JIT_F0);
 }
 
 static void
-compile_call_u64_from_scm (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t b)
+compile_call_u64_from_scm (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t idx)
 {
+  /* Call intrinsic number IDX with the SCM in slot A, producing a
+     64-bit integer in slot DST.  When INDIRECT_INT64_INTRINSICS is set,
+     the address of slot DST is passed as the first argument and the
+     return value is not read; otherwise the return value is stored.
+     NOTE(review): unlike most other intrinsic calls here, SP is not
+     reloaded after the call -- confirm these intrinsics cannot move
+     the stack.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+#if INDIRECT_INT64_INTRINSICS
+  jit_prepare ();
+  jit_addi (T1, SP, dst * sizeof (union scm_vm_stack_element));
+  jit_pushargr (T1);
+  jit_pushargr (T0);
+  jit_finishi (intrinsic);
+#else
+  jit_prepare ();
+  jit_pushargr (T0);
+  jit_finishi (intrinsic);
+  jit_retval (T0);
+  emit_sp_set_u64 (j, dst, T0);
+#endif
 }
 
 static void
 compile_make_short_immediate (scm_jit_state *j, uint8_t dst, SCM a)
 {
+  /* Store the constant A in slot DST; its raw bits (SCM_UNPACK) are
+     embedded in the generated code.  */
+  jit_movi (T0, SCM_UNPACK (a));
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
 compile_make_long_immediate (scm_jit_state *j, uint32_t dst, SCM a)
 {
+  /* Same as compile_make_short_immediate, for the instruction form
+     with a 32-bit DST operand.  */
+  jit_movi (T0, SCM_UNPACK (a));
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
 compile_make_long_long_immediate (scm_jit_state *j, uint32_t dst, SCM a)
 {
+  /* Store the constant A in slot DST; the emitted code is the same as
+     for compile_make_long_immediate.  */
+  jit_movi (T0, SCM_UNPACK (a));
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
 compile_make_non_immediate (scm_jit_state *j, uint32_t dst, const void *data)
 {
+  /* Store the address DATA itself, not what it points to, in slot DST
+     as an SCM.  */
+  jit_movi (T0, (uintptr_t)data);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
 compile_static_ref (scm_jit_state *j, uint32_t dst, void *loc)
 {
+  /* Load the word stored at the fixed address LOC at run time and store
+     it in slot DST.  */
+  jit_ldi (T0, loc);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_static_set (scm_jit_state *j, uint32_t a, void *loc)
+compile_static_set (scm_jit_state *j, uint32_t obj, void *loc)
 {
+  /* Store the SCM in slot OBJ to the fixed address LOC.  */
+  emit_sp_ref_scm (j, T0, obj);
+  jit_sti (loc, T0);
 }
 
 static void
 compile_static_patch (scm_jit_state *j, void *dst, const void *src)
 {
+  /* Emit code that writes the address SRC into the word at address
+     DST when it runs.  */
+  jit_movi (T0, (uintptr_t) src);
+  jit_sti (dst, T0);
 }
 
 static void
-compile_prompt (scm_jit_state *j, uint32_t a, uint8_t b, uint32_t c, const uint32_t *vcode)
+compile_prompt (scm_jit_state *j, uint32_t tag, uint8_t escape_only_p,
+                uint32_t proc_slot, const uint32_t *vcode)
 {
+  /* Call the push_prompt intrinsic with: THREAD, the ESCAPE_ONLY_P flag,
+     the tag read from slot TAG, the address FP minus PROC_SLOT stack
+     elements, the handler's bytecode address VCODE, and the handler's
+     machine-code address.  The machine address is not known yet: it is
+     emitted as a zero placeholder and registered with
+     add_inter_instruction_patch so it can be filled in for VCODE.  */
+  jit_node_t *mra;
+  emit_store_current_ip (j, T0);
+  jit_prepare ();
+  jit_pushargr (THREAD);
+  jit_pushargi (escape_only_p);
+  emit_sp_ref_scm (j, T0, tag);
+  jit_pushargr (T0);
+  emit_load_fp (j, T1);
+  jit_subi (T1, T1, proc_slot * sizeof (union scm_vm_stack_element));
+  jit_pushargr (T1);
+  jit_pushargi ((uintptr_t) vcode);
+  mra = jit_movi (T2, 0);
+  /* Pass the machine return address: without this push the value
+     loaded into T2 is never handed to the intrinsic.  */
+  jit_pushargr (T2);
+  jit_finishi (scm_vm_intrinsics.push_prompt);
+  add_inter_instruction_patch (j, mra, vcode);
 }
 
 static void
 compile_load_label (scm_jit_state *j, uint32_t dst, const uint32_t *vcode)
 {
+  /* Store the address VCODE in slot DST as a 64-bit unsigned value.
+     When pointers are narrower than 8 bytes the high half is zero.  */
+  jit_movi (T0, (uintptr_t) vcode);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  jit_movi (T1, 0);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
-compile_call_s64_from_scm (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t b)
+compile_call_s64_from_scm (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t idx)
 {
+  /* The signed variant emits exactly the same code as the unsigned
+     one.  */
+  compile_call_u64_from_scm (j, dst, a, idx);
 }
 
 static void
-compile_call_scm_from_u64 (scm_jit_state *j, uint16_t dst, uint16_t a, uint32_t b)
+compile_call_scm_from_u64 (scm_jit_state *j, uint16_t dst, uint16_t src, uint32_t idx)
 {
+  /* Call intrinsic number IDX with the 64-bit value in slot SRC and
+     store its SCM result in slot DST.  When INDIRECT_INT64_INTRINSICS
+     is set the intrinsic receives the address of slot SRC; otherwise it
+     receives the value itself.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  jit_prepare ();
+#if INDIRECT_INT64_INTRINSICS
+  jit_addi (T0, SP, src * sizeof (union scm_vm_stack_element));
+#else
+  emit_sp_ref_u64 (j, T0, src);
+#endif
+  /* Push the argument in both configurations.  Previously the push
+     lived only in the #else branch, so the indirect variant called the
+     intrinsic without any argument.  */
+  jit_pushargr (T0);
+  jit_finishi (intrinsic);
+  jit_retval (T0);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
 compile_call_scm_from_s64 (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t b)
 {
+  /* The signed variant emits exactly the same code as the unsigned
+     one; A is the source slot and B the intrinsic index.  */
+  compile_call_scm_from_u64 (j, dst, a, b);
 }
 
 static void
-compile_tag_char (scm_jit_state *j, uint16_t dst, uint16_t a)
+compile_tag_char (scm_jit_state *j, uint16_t dst, uint16_t src)
 {
+  /* Build a tagged character from the unsigned integer in slot SRC:
+     shift it left by 8 bits and add scm_tc8_char.  Only the lower half
+     of the 64-bit slot is read when pointers are narrower than 8
+     bytes.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, src);
+#else
+  emit_sp_ref_u64_lower_half (j, T0, src);
+#endif
+  jit_lshi (T0, T0, 8);
+  jit_addi (T0, T0, scm_tc8_char);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_untag_char (scm_jit_state *j, uint16_t dst, uint16_t a)
+compile_untag_char (scm_jit_state *j, uint16_t dst, uint16_t src)
 {
+  /* Inverse of compile_tag_char: shift the SCM in slot SRC right by 8
+     bits and store the result in slot DST as a 64-bit value, with a
+     zero high half when pointers are narrower than 8 bytes.  */
+  emit_sp_ref_scm (j, T0, src);
+  jit_rshi (T0, T0, 8);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  jit_movi (T1, 0);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
-compile_atomic_ref_scm_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, 
uint8_t b)
+compile_atomic_ref_scm_immediate (scm_jit_state *j, uint8_t dst, uint8_t obj, 
uint8_t offset)
 {
+  /* Compute the address of word OFFSET inside OBJ, pass it to the
+     atomic_ref_scm intrinsic, and store the result in slot DST.  */
+  emit_sp_ref_scm (j, T0, obj);
+  jit_addi (T0, T0, offset * sizeof (SCM));
+  emit_call_r (j, scm_vm_intrinsics.atomic_ref_scm, T0);
+  jit_retval (T0);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_atomic_set_scm_immediate (scm_jit_state *j, uint8_t a, uint8_t b, 
uint8_t c)
+compile_atomic_set_scm_immediate (scm_jit_state *j, uint8_t obj, uint8_t 
offset, uint8_t val)
 {
+  /* Compute the address of word OFFSET inside OBJ and pass it, with
+     the SCM in slot VAL, to the atomic_set_scm intrinsic.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_scm (j, T1, val);
+  jit_addi (T0, T0, offset * sizeof (SCM));
+  emit_call_r_r (j, scm_vm_intrinsics.atomic_set_scm, T0, T1);
 }
 
 static void
-compile_atomic_scm_swap_immediate (scm_jit_state *j, uint32_t dst, uint32_t a, 
uint8_t b, uint32_t c)
+compile_atomic_scm_swap_immediate (scm_jit_state *j, uint32_t dst, uint32_t 
obj, uint8_t offset, uint32_t val)
 {
+  /* Compute the address of word OFFSET inside OBJ, call the
+     atomic_swap_scm intrinsic with it and the SCM in slot VAL, and
+     store the intrinsic's result in slot DST.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_scm (j, T1, val);
+  jit_addi (T0, T0, offset * sizeof (SCM));
+  emit_call_r_r (j, scm_vm_intrinsics.atomic_swap_scm, T0, T1);
+  jit_retval (T0);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_atomic_scm_compare_and_swap_immediate (scm_jit_state *j, uint32_t dst, uint32_t a, uint8_t b, uint32_t c, uint32_t d)
+compile_atomic_scm_compare_and_swap_immediate (scm_jit_state *j, uint32_t dst,
+                                               uint32_t obj, uint8_t offset,
+                                               uint32_t expected, uint32_t desired)
 {
+  /* Compute the address of word OFFSET inside OBJ and call the
+     compare-and-swap intrinsic with it and the SCM values in slots
+     EXPECTED and DESIRED; store the intrinsic's result in slot DST.  */
+  emit_sp_ref_scm (j, T0, obj);
+  emit_sp_ref_scm (j, T1, expected);
+  emit_sp_ref_scm (j, T2, desired);
+  jit_addi (T0, T0, offset * sizeof (SCM));
+  /* DESIRED was loaded into T2, so T2 is the third argument, and the
+     three-argument intrinsic is the compare-and-swap one rather than
+     the two-argument atomic_swap_scm.  */
+  emit_call_r_r_r (j, scm_vm_intrinsics.atomic_compare_and_swap_scm,
+                   T0, T1, T2);
+  jit_retval (T0);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_call_thread_scm_scm (scm_jit_state *j, uint16_t a, uint16_t b, 
uint32_t c)
+compile_call_thread_scm_scm (scm_jit_state *j, uint16_t a, uint16_t b, 
uint32_t idx)
 {
+  /* Call intrinsic number IDX with THREAD and the SCM values in slots
+     A and B.  The return value is not used.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  emit_sp_ref_scm (j, T1, b);
+  emit_call_r_r_r (j, intrinsic, THREAD, T0, T1);
+  emit_reload_sp (j);
 }
 
 static void
-compile_call_thread (scm_jit_state *j, uint32_t a)
+compile_call_thread (scm_jit_state *j, uint32_t idx)
 {
+  /* Call intrinsic number IDX with THREAD as its only argument.  The
+     return value is not used.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_call_r (j, intrinsic, THREAD);
+  emit_reload_sp (j);
 }
 
 static void
-compile_call_scm_from_thread_scm (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t b)
+compile_call_scm_from_thread_scm (scm_jit_state *j, uint16_t dst, uint16_t a, 
uint32_t idx)
 {
+  /* Call intrinsic number IDX with THREAD and the SCM in slot A; store
+     its SCM result in slot DST.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  emit_call_r_r (j, intrinsic, THREAD, T0);
+  jit_retval (T0);
+  emit_reload_sp (j);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_call_thread_scm (scm_jit_state *j, uint32_t a, uint32_t b)
+compile_call_thread_scm (scm_jit_state *j, uint32_t a, uint32_t idx)
 {
+  /* Call intrinsic number IDX with THREAD and the SCM in slot A.  The
+     return value is not used.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  emit_call_r_r (j, intrinsic, THREAD, T0);
+  emit_reload_sp (j);
 }
 
 static void
-compile_call_scm_from_scm_u64 (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b, uint32_t c)
+compile_call_scm_from_scm_u64 (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b, uint32_t idx)
 {
+  /* Call intrinsic number IDX with the SCM in slot A and the 64-bit
+     value in slot B; store its SCM result in slot DST.  When
+     INDIRECT_INT64_INTRINSICS is set the second argument is the address
+     of slot B; otherwise it is the value itself.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  jit_prepare ();
+  jit_pushargr (T0);
+#if INDIRECT_INT64_INTRINSICS
+  jit_addi (T1, SP, b * sizeof (union scm_vm_stack_element));
+#else
+  emit_sp_ref_u64 (j, T1, b);
+#endif
+  /* Push the second argument in both configurations.  Previously the
+     push lived only in the #else branch, so the indirect variant
+     dropped the argument.  */
+  jit_pushargr (T1);
+  jit_finishi (intrinsic);
+  jit_retval (T0);
+  emit_reload_sp (j);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
-compile_call_scm_from_thread (scm_jit_state *j, uint32_t dst, uint32_t a)
+compile_call_scm_from_thread (scm_jit_state *j, uint32_t dst, uint32_t idx)
 {
+  /* Call intrinsic number IDX with THREAD as its only argument and
+     store its SCM result in slot DST.  */
+  void *intrinsic = ((void **) &scm_vm_intrinsics)[idx];
+
+  emit_store_current_ip (j, T0);
+  emit_call_r (j, intrinsic, THREAD);
+  jit_retval (T0);
+  emit_reload_sp (j);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
 compile_fadd (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A + B on the doubles held in slots A and B.  */
+  emit_sp_ref_f64 (j, JIT_F0, a);
+  emit_sp_ref_f64 (j, JIT_F1, b);
+  jit_addr_d (JIT_F0, JIT_F0, JIT_F1);
+  emit_sp_set_f64 (j, dst, JIT_F0);
 }
 
 static void
 compile_fsub (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A - B on the doubles held in slots A and B.  */
+  emit_sp_ref_f64 (j, JIT_F0, a);
+  emit_sp_ref_f64 (j, JIT_F1, b);
+  jit_subr_d (JIT_F0, JIT_F0, JIT_F1);
+  emit_sp_set_f64 (j, dst, JIT_F0);
 }
 
 static void
 compile_fmul (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A * B on the doubles held in slots A and B.  */
+  emit_sp_ref_f64 (j, JIT_F0, a);
+  emit_sp_ref_f64 (j, JIT_F1, b);
+  jit_mulr_d (JIT_F0, JIT_F0, JIT_F1);
+  emit_sp_set_f64 (j, dst, JIT_F0);
 }
 
 static void
 compile_fdiv (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A / B on the doubles held in slots A and B.  */
+  emit_sp_ref_f64 (j, JIT_F0, a);
+  emit_sp_ref_f64 (j, JIT_F1, b);
+  jit_divr_d (JIT_F0, JIT_F0, JIT_F1);
+  emit_sp_set_f64 (j, dst, JIT_F0);
 }
 
 static void
 compile_uadd (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A + B on the unsigned 64-bit values in slots A and B.  When
+     pointers are narrower than 8 bytes each operand occupies a low/high
+     register pair and the sum is built with add-with-carry.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_addr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, VT3, b);
+  jit_addcr (T0, T0, T2);
+  /* The high word of B was loaded into VT3 just above, so that is the
+     register the carry step must add.  */
+  jit_addxr (T1, T1, VT3);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_usub (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A - B on the unsigned 64-bit values in slots A and B.  When
+     pointers are narrower than 8 bytes each operand occupies a low/high
+     register pair and the difference is built with
+     subtract-with-borrow.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_subr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, VT3, b);
+  jit_subcr (T0, T0, T2);
+  /* The high word of B was loaded into VT3 just above, so that is the
+     register the borrow step must subtract.  */
+  jit_subxr (T1, T1, VT3);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_umul (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A * B on the unsigned 64-bit values in slots A and B, keeping
+     the low 64 bits.  The narrow-pointer path combines the cross
+     products of the low/high halves with the full product of the two
+     low halves.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_mulr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  /* FIXME: This is untested!  */
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, VT3, b);
+  jit_mulr (T1, T1, T2);      /* High A times low B */
+  jit_mulr (VT3, VT3, T0);    /* High B times low A */
+  jit_addr (T1, T1, VT3);       /* Add high results, throw away overflow */
+  jit_qmulr_u (T0, T2, T0, T2); /* Low A times low B */
+  jit_addr (T1, T1, T2);        /* Add high result of low product */
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_uadd_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A + B, where A names a slot holding an unsigned 64-bit value
+     and B is the literal addend.  The value added must be B: A is only
+     a slot index.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  jit_addi (T0, T0, b);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  emit_sp_ref_u64 (j, T0, T1, a);
+  jit_addci (T0, T0, b);
+  /* Propagate the carry into the high word.  */
+  jit_addxi (T1, T1, 0);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_usub_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A - B, where A names a slot holding an unsigned 64-bit value
+     and B is the literal subtrahend.  The value subtracted must be B: A
+     is only a slot index.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  jit_subi (T0, T0, b);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  emit_sp_ref_u64 (j, T0, T1, a);
+  jit_subci (T0, T0, b);
+  /* Propagate the borrow into the high word.  */
+  jit_subxi (T1, T1, 0);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_umul_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A * B, where A names a slot holding an unsigned 64-bit value
+     and B is the literal multiplier.  B fits in 8 bits, so on the
+     narrow-pointer path its high word is zero and one cross product
+     drops out.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  jit_muli (T0, T0, b);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  /* FIXME: This is untested!  */
+  emit_sp_ref_u64 (j, T0, T1, a);
+  jit_muli (T1, T1, b);         /* High A times low B */
+  /* High B times low A is 0.  */
+  jit_movi (T2, b);
+  jit_qmulr_u (T0, T2, T0, T2); /* Low A times low B */
+  jit_addr (T1, T1, T2);        /* Add high result of low product */
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_load_f64 (scm_jit_state *j, uint32_t dst, double a)
 {
+  /* Store the double constant A in slot DST.  */
+  jit_movi_d (JIT_F0, a);
+  emit_sp_set_f64 (j, dst, JIT_F0);
 }
 
 static void
 compile_load_u64 (scm_jit_state *j, uint32_t dst, uint64_t a)
 {
+  /* Store the 64-bit constant A in slot DST.  When pointers are
+     narrower than 8 bytes the constant is split into its low and high
+     32-bit halves.  */
+#if SIZEOF_UINTPTR_T >= 8
+  jit_movi (T0, a);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  jit_movi (T0, a & 0xffffffff);
+  jit_movi (T1, a >> 32);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_load_s64 (scm_jit_state *j, uint32_t dst, int64_t a)
 {
+  /* The signed constant is converted to uint64_t at the call and stored
+     by the unsigned loader.  */
+  compile_load_u64 (j, dst, a);
 }
 
 static void
 compile_current_thread (scm_jit_state *j, uint32_t dst)
 {
+  /* Load the `handle' field of the current thread and store it in slot
+     DST.  thread_offset_handle is a constant byte offset, so the load
+     must use the immediate-offset form (jit_ldxi); jit_ldxr would treat
+     the offset as a register number.  */
+  jit_ldxi (T0, THREAD, thread_offset_handle);
+  emit_sp_set_scm (j, dst, T0);
 }
 
 static void
 compile_ulogand (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A & B on the unsigned 64-bit values in slots A and B; the
+     narrow-pointer path ANDs the low and high words separately.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_andr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  jit_andr (T0, T0, T2);
+  jit_andr (T1, T1, T3);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_ulogior (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A | B on the unsigned 64-bit values in slots A and B; the
+     narrow-pointer path ORs the low and high words separately.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_orr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  jit_orr (T0, T0, T2);
+  jit_orr (T1, T1, T3);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_ulogsub (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A & ~B on the unsigned 64-bit values in slots A and B: B is
+     complemented and then ANDed with A.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_comr (T1, T1);
+  jit_andr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  jit_comr (T2, T2);
+  jit_comr (T3, T3);
+  jit_andr (T0, T0, T2);
+  jit_andr (T1, T1, T3);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_ursh (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A >> B (logical) on the unsigned 64-bit value in slot A, with
+     the shift count taken from slot B and masked to 0..63.  The
+     narrow-pointer path handles three cases: a zero count, a count of
+     32 or more, and a count below 32.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_andi (T1, T1, 63);
+  jit_rshr_u (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  /* FIXME: Not tested.  */
+  jit_node_t *zero, *both, *done;
+
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  jit_andi (T2, T2, 63);
+  zero = jit_beqi (T2, 0);
+  both = jit_blti (T2, 32);
+
+  /* 32 <= s < 64: hi = 0, lo = hi >> (s-32) */
+  /* jit_subi takes a destination as well as a source; the original
+     two-operand call would not compile.  */
+  jit_subi (T2, T2, 32);
+  jit_rshr_u (T0, T1, T2);
+  jit_movi (T1, 0);
+  done = jit_jmpi ();
+
+  jit_patch (both);
+  /* 0 < s < 32: hi = hi >> s, lo = lo >> s + hi << (32-s) */
+  jit_negr (T3, T2);
+  jit_addi (T3, T3, 32);
+  jit_lshr (T3, T1, T3);
+  jit_rshr_u (T1, T1, T2);
+  jit_rshr_u (T0, T0, T2);
+  jit_addr (T0, T0, T3);
+
+  jit_patch (done);
+  jit_patch (zero);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_ulsh (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A << B on the unsigned 64-bit value in slot A, with the shift
+     count taken from slot B and masked to 0..63.  The narrow-pointer
+     path handles three cases: a zero count, a count of 32 or more, and
+     a count below 32.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_andi (T1, T1, 63);
+  jit_lshr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  /* FIXME: Not tested.  */
+  jit_node_t *zero, *both, *done;
+
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  jit_andi (T2, T2, 63);
+  zero = jit_beqi (T2, 0);
+  both = jit_blti (T2, 32);
+
+  /* 32 <= s < 64: hi = lo << (s-32), lo = 0 */
+  /* jit_subi takes a destination as well as a source; the original
+     two-operand call would not compile.  */
+  jit_subi (T2, T2, 32);
+  jit_lshr (T1, T0, T2);
+  jit_movi (T0, 0);
+  done = jit_jmpi ();
+
+  jit_patch (both);
+  /* 0 < s < 32: hi = hi << s + lo >> (32-s), lo = lo << s */
+  jit_negr (T3, T2);
+  jit_addi (T3, T3, 32);
+  jit_rshr_u (T3, T0, T3);
+  jit_lshr (T1, T1, T2);
+  jit_lshr (T0, T0, T2);
+  jit_addr (T1, T1, T3);
+
+  jit_patch (done);
+  jit_patch (zero);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_ursh_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A >> B (logical), where A names a slot holding an unsigned
+     64-bit value and B is the literal shift count.  B is masked to
+     0..63 at compile time, so the narrow-pointer path can pick its case
+     here rather than emitting branches.  */
+  b &= 63;
+
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  jit_rshi_u (T0, T0, b);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  /* FIXME: Not tested.  */
+  emit_sp_ref_u64 (j, T0, T1, a);
+  if (b == 0)
+    {
+      /* Nothing to do.  */
+    }
+  else if (b >= 32)
+    {
+      /*  hi = 0, lo = hi >> (s-32) */
+      jit_rshi_u (T0, T1, b - 32);
+      jit_movi (T1, 0);
+    }
+  else
+    {
+      /* 0 < s < 32: hi = hi >> s, lo = lo >> s + hi << (32-s) */
+      jit_lshi (T2, T1, 32 - b);
+      jit_rshi_u (T1, T1, b);
+      jit_rshi_u (T0, T0, b);
+      jit_addr (T0, T0, T2);
+    }
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_ulsh_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A << B, where A names a slot holding an unsigned 64-bit value
+     and B is the literal shift count.  B is masked to 0..63 at compile
+     time, so the narrow-pointer path can pick its case here rather than
+     emitting branches.  */
+  b &= 63;
+
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  jit_lshi (T0, T0, b);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  /* FIXME: Not tested.  */
+  emit_sp_ref_u64 (j, T0, T1, a);
+  if (b == 0)
+    {
+      /* Nothing to do.  */
+    }
+  else if (b >= 32)
+    {
+      /* hi = lo << (s-32), lo = 0 */
+      /* The count is a compile-time constant, so this needs the
+         immediate form; jit_lshr would read it as a register.  */
+      jit_lshi (T1, T0, b - 32);
+      jit_movi (T0, 0);
+    }
+  else
+    {
+      /* hi = hi << s + lo >> (32-s), lo = lo << s */
+      jit_rshi_u (T2, T0, 32 - b);
+      jit_lshi (T1, T1, b);
+      jit_lshi (T0, T0, b);
+      jit_addr (T1, T1, T2);
+    }
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_ulogxor (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* DST = A ^ B on the unsigned 64-bit values in slots A and B; the
+     narrow-pointer path XORs the low and high words separately.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  jit_xorr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  jit_xorr (T0, T0, T2);
+  jit_xorr (T1, T1, T3);
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_handle_interrupts (scm_jit_state *j)
 {
+  /* Check for pending asyncs.  If the thread's pending_asyncs list is
+     empty, or asyncs are currently blocked, fall through to the next
+     instruction.  Otherwise push an interrupt frame whose machine
+     return address is the start of this sequence (so the check runs
+     again on return) and tail-call the interrupt handler code.  */
+  jit_node_t *again, *mra, *none_pending, *blocked;
+
+  again = jit_label ();
+  jit_addi (T0, THREAD, thread_offset_pending_asyncs);
+  emit_call_r (j, scm_vm_intrinsics.atomic_ref_scm, T0);
+  jit_retval (T0);
+  none_pending = jit_beqi (T0, SCM_UNPACK (SCM_EOL));
+  jit_ldxi_i (T0, THREAD, thread_offset_block_asyncs);
+  /* Asyncs are blocked when the counter is nonzero, so that is the case
+     that must skip the handler.  Branching on zero, as before, ran the
+     handler only while asyncs were blocked.  */
+  blocked = jit_bnei (T0, 0);
+
+  emit_store_current_ip (j, T0);
+  mra = jit_movi (T0, 0);
+  jit_patch_at (mra, again);
+  emit_call_r_r (j, scm_vm_intrinsics.push_interrupt_frame, THREAD, T0);
+  emit_reload_sp (j);
+  emit_direct_tail_call (j, scm_vm_intrinsics.handle_interrupt_code);
+
+  jit_patch (none_pending);
+  jit_patch (blocked);
 }
 
 static void
 compile_return_from_interrupt (scm_jit_state *j)
 {
+  /* Pop the current frame and resume at its return address.  The
+     machine return address is used when it is nonzero; otherwise the
+     virtual return address is stored as the IP and the generated code
+     returns with jit_ret (the branch label is named `interp').  */
+  /* offset, new_fp and ra all alias T1 and are used one after the
+     other.  */
+  jit_gpr_t old_fp = T0, offset = T1, new_fp = T1, ra = T1;
+  jit_node_t *interp;
+
+  if (j->hooks_enabled)
+    emit_run_hook (j, T0, scm_vm_intrinsics.invoke_return_hook);
+
+  /* new_fp = old_fp + (previous frame size + frame overhead) slots.  */
+  emit_load_fp (j, old_fp);
+  emit_load_prev_frame_size (j, offset, old_fp);
+  jit_addi (offset, offset, frame_overhead_slots);
+  jit_lshi (offset, offset, 3); /* Multiply by sizeof (scm_vm_stack_element) */
+  jit_addr (new_fp, old_fp, offset);
+  emit_store_fp (j, new_fp);
+
+  emit_load_mra (j, ra, old_fp);
+  interp = jit_beqi (ra, 0);
+  /* Both paths set SP to just past the old frame's overhead slots.  */
+  jit_addi (SP, old_fp, frame_overhead_slots * sizeof (union 
scm_vm_stack_element));
+  emit_store_sp (j);
+  jit_jmpr (ra);
+
+  jit_patch (interp);
+  emit_load_vra (j, ra, old_fp);
+  emit_store_ip (j, ra);
+  jit_addi (SP, old_fp, frame_overhead_slots * sizeof (union 
scm_vm_stack_element));
+  emit_store_sp (j);
+  jit_ret ();
 }
 
 static void
 compile_u64_numerically_equal (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Compare the 64-bit values in slots A and B and store
+     SCM_F_COMPARE_EQUAL or SCM_F_COMPARE_NONE as the compare result.
+     The narrow-pointer path compares the low and high words
+     separately.  */
+#if SIZEOF_UINTPTR_T >= 8
+  jit_node_t *k;
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  k = jit_bner (T0, T1);
+  jit_movi (T2, SCM_F_COMPARE_EQUAL);
+  jit_patch (k);
+#else
+  jit_node_t *k1, *k2, *k3;
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  k1 = jit_bner (T0, T2);
+  k2 = jit_bner (T1, T3);
+  jit_movi (T2, SCM_F_COMPARE_EQUAL);
+  k3 = jit_jmpi ();
+  jit_patch (k1);
+  jit_patch (k2);
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  /* The jump over the not-equal case must land here; it was previously
+     left unpatched.  */
+  jit_patch (k3);
+#endif
+  emit_store_compare_result (j, T2);
 }
 
 static void
 compile_u64_less (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Unsigned 64-bit A < B: store SCM_F_COMPARE_LESS_THAN or
+     SCM_F_COMPARE_NONE as the compare result.  The narrow-pointer path
+     decides on the high words first and only compares the low words
+     when the high words are equal.  */
+#if SIZEOF_UINTPTR_T >= 8
+  jit_node_t *k;
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  k = jit_bger_u (T0, T1);
+  jit_movi (T2, SCM_F_COMPARE_LESS_THAN);
+  jit_patch (k);
+#else
+  jit_node_t *k1, *k2, *less, *k3;
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  less = jit_bltr_u (T1, T3);
+  k1 = jit_bner (T1, T3);
+  k2 = jit_bger_u (T0, T2);
+  jit_patch (less);
+  jit_movi (T2, SCM_F_COMPARE_LESS_THAN);
+  k3 = jit_jmpi ();
+  jit_patch (k1);
+  jit_patch (k2);
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  jit_patch (k3);
+#endif
+  emit_store_compare_result (j, T2);
 }
 
 static void
 compile_s64_numerically_equal (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Equality is a bitwise comparison, so the unsigned emitter is
+     reused for signed values.  */
+  compile_u64_numerically_equal (j, a, b);
 }
 
 static void
 compile_s64_less (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Signed 64-bit A < B: store SCM_F_COMPARE_LESS_THAN or
+     SCM_F_COMPARE_NONE as the compare result.  The narrow-pointer path
+     decides on the high words first and only compares the low words
+     when the high words are equal.  */
+#if SIZEOF_UINTPTR_T >= 8
+  jit_node_t *k;
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  emit_sp_ref_u64 (j, T0, a);
+  emit_sp_ref_u64 (j, T1, b);
+  k = jit_bger (T0, T1);
+  jit_movi (T2, SCM_F_COMPARE_LESS_THAN);
+  jit_patch (k);
+#else
+  jit_node_t *k1, *k2, *less, *k3;
+  emit_sp_ref_u64 (j, T0, T1, a);
+  emit_sp_ref_u64 (j, T2, T3, b);
+  /* The high words carry the sign and are compared signed.  */
+  less = jit_bltr (T1, T3);
+  k1 = jit_bner (T1, T3);
+  /* With equal high words the low words decide, and they are plain
+     magnitudes: they must be compared unsigned.  A signed compare gives
+     the wrong answer whenever exactly one low word has bit 31 set.  */
+  k2 = jit_bger_u (T0, T2);
+  jit_patch (less);
+  jit_movi (T2, SCM_F_COMPARE_LESS_THAN);
+  k3 = jit_jmpi ();
+  jit_patch (k1);
+  jit_patch (k2);
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  jit_patch (k3);
+#endif
+  emit_store_compare_result (j, T2);
 }
 
 static void
 compile_f64_numerically_equal (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Double-precision "a == b" on stack slots A and B.  The result
+     register is preset to SCM_F_COMPARE_EQUAL and overwritten with
+     SCM_F_COMPARE_NONE unless the equality branch is taken.  */
+  jit_gpr_t res = T2;
+
+  jit_movi (res, SCM_F_COMPARE_EQUAL);
+  emit_sp_ref_f64 (j, JIT_F0, a);
+  emit_sp_ref_f64 (j, JIT_F1, b);
+  jit_node_t *equal = jit_beqr_d (JIT_F0, JIT_F1);
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  jit_patch (equal);
+  emit_store_compare_result (j, res);
 }
 
 static void
 compile_f64_less (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Double-precision "a < b" on stack slots A and B, with three
+     outcomes: SCM_F_COMPARE_LESS_THAN when the "less" branch fires,
+     SCM_F_COMPARE_NONE when the "greater or equal" branch fires, and
+     SCM_F_COMPARE_INVALID when neither does.  */
+  jit_gpr_t res = T2;
+
+  emit_sp_ref_f64 (j, JIT_F0, a);
+  emit_sp_ref_f64 (j, JIT_F1, b);
+  jit_node_t *is_less = jit_bltr_d (JIT_F0, JIT_F1);
+  jit_node_t *is_ge = jit_bger_d (JIT_F0, JIT_F1);
+  /* Fell through both ordered comparisons.  */
+  jit_movi (res, SCM_F_COMPARE_INVALID);
+  jit_node_t *done_invalid = jit_jmpi ();
+  jit_patch (is_ge);
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  jit_node_t *done_ge = jit_jmpi ();
+  jit_patch (is_less);
+  jit_movi (res, SCM_F_COMPARE_LESS_THAN);
+  jit_patch (done_invalid);
+  jit_patch (done_ge);
+  emit_store_compare_result (j, res);
 }
 
 static void
 compile_numerically_equal (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Generic "=" on the SCM values in stack slots A and B, computed by
+     calling the numerically_equal_p intrinsic.  A nonzero return value
+     yields SCM_F_COMPARE_EQUAL; zero yields SCM_F_COMPARE_NONE.  */
+  jit_node_t *k;
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  emit_sp_ref_scm (j, T1, b);
+  emit_call_r_r (j, scm_vm_intrinsics.numerically_equal_p, T0, T1);
+  jit_retval (T0);
+  emit_reload_sp (j);
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  /* Skip the EQUAL assignment only when the predicate returned false;
+     branching on "!= 0" here would invert the result.  */
+  k = jit_beqi (T0, 0);
+  jit_movi (T2, SCM_F_COMPARE_EQUAL);
+  jit_patch (k);
+  emit_store_compare_result (j, T2);
 }
 
 static void
 compile_less (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Generic "<" on the SCM values in stack slots A and B.  The return
+     value of the less_p intrinsic is stored unmodified as the compare
+     result.  */
+  jit_gpr_t x = T0, y = T1, res = T2;
+
+  /* X is passed to emit_store_current_ip before it is loaded with A.  */
+  emit_store_current_ip (j, x);
+  emit_sp_ref_scm (j, x, a);
+  emit_sp_ref_scm (j, y, b);
+  emit_call_r_r (j, scm_vm_intrinsics.less_p, x, y);
+  jit_retval (res);
+  emit_reload_sp (j);
+  emit_store_compare_result (j, res);
 }
 
 static void
-compile_check_arguments (scm_jit_state *j, uint32_t a)
+compile_check_arguments (scm_jit_state *j, uint32_t expected)
 {
+  /* Compare the frame's locals count against EXPECTED and store the
+     outcome as the compare result: SCM_F_COMPARE_EQUAL when they
+     match, SCM_F_COMPARE_LESS_THAN when the count is smaller, and
+     SCM_F_COMPARE_NONE when it is larger.  */
+  jit_node_t *eq;
+  jit_gpr_t fp = T0, t = T1, res = T2;
+
+  emit_load_fp (j, fp);
+  jit_movi (res, SCM_F_COMPARE_EQUAL);
+  eq = emit_branch_if_frame_locals_count_eq (j, fp, t, expected);
+  if (expected > 0)
+    {
+      jit_node_t *done, *ge;
+      /* Equality was ruled out above, so "count > expected - 1" here
+         means the count is strictly greater than EXPECTED.  */
+      ge = emit_branch_if_frame_locals_count_greater_than (j, fp, t,
+                                                           expected - 1);
+      jit_movi (res, SCM_F_COMPARE_LESS_THAN);
+      done = jit_jmpi ();
+      jit_patch (ge);
+      jit_movi (res, SCM_F_COMPARE_NONE);
+      jit_patch (done);
+    }
+  else
+    /* A count that is not zero cannot be below zero.  */
+    jit_movi (res, SCM_F_COMPARE_NONE);
+  /* Only EQ needs patching here; the previous code also patched a
+     node pointer that was never assigned.  */
+  jit_patch (eq);
+  emit_store_compare_result (j, res);
 }
 
 static void
-compile_check_positional_arguments (scm_jit_state *j, uint32_t a, uint32_t b)
+compile_check_positional_arguments (scm_jit_state *j, uint32_t nreq, uint32_t 
expected)
 {
+  jit_node_t *k, *head, *lt, *eq, *done1, *done2;
+  jit_gpr_t walk = T0, npos = T1, obj = T2, t = T3, res = T0;  /* res shares T0 with walk; it is only written after the walk loop */
+
+  emit_load_fp (j, walk);
+  if (nreq == 0) abort ();  /* checked while compiling, not in the emitted code */
+  emit_subtract_stack_slots (j, walk, walk, nreq-1);
+  jit_movi (npos, nreq - 1);
+  
+  head = jit_label ();  /* loop head: each pass increments npos and moves walk by one slot */
+  jit_addi (npos, npos, 1);
+  emit_subtract_stack_slots (j, walk, walk, 1);
+  k = jit_beqr (walk, SP);  /* leave the loop once walk equals SP */
+  jit_ldr (obj, walk);
+  jit_patch_at (emit_branch_if_immediate (j, obj), head);  /* the helper's branch loops back to head */
+  jit_patch_at (emit_branch_if_heap_object_not_tc7 (j, obj, t, 
scm_tc7_keyword),
+                head);  /* likewise; falling through both branches ends the loop (presumably obj is a keyword -- confirm against the helpers) */
+  jit_patch (k);
+
+  lt = jit_blti (npos, expected);  /* three-way comparison of npos with expected starts here */
+  eq = jit_beqi (npos, expected);
+  jit_movi (res, SCM_F_COMPARE_NONE);  /* neither less than nor equal */
+  done1 = jit_jmpi ();
+  jit_patch (lt);
+  jit_movi (res, SCM_F_COMPARE_LESS_THAN);
+  done2 = jit_jmpi ();
+  jit_patch (eq);
+  jit_movi (res, SCM_F_COMPARE_EQUAL);
+  jit_patch (done1);
+  jit_patch (done2);
+  jit_stxi (thread_offset_compare_result, THREAD, res);  /* store res at thread_offset_compare_result relative to THREAD */
 }
 
 static void
-compile_immediate_tag_equals (scm_jit_state *j, uint32_t a, uint16_t b, 
uint16_t c)
+compile_immediate_tag_equals (scm_jit_state *j, uint32_t a, uint16_t mask,
+                              uint16_t expected)
 {
+  /* Test whether (value of slot A) & MASK equals EXPECTED.  The compare
+     result is SCM_F_COMPARE_EQUAL on a match and SCM_F_COMPARE_NONE
+     otherwise.  */
+  jit_gpr_t val = T0, res = T2;
+
+  emit_sp_ref_scm (j, val, a);
+  jit_andi (val, val, mask);
+  jit_movi (res, SCM_F_COMPARE_EQUAL);
+  jit_node_t *match = jit_beqi (val, expected);
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  jit_patch (match);
+  emit_store_compare_result (j, res);
 }
 
 static void
-compile_heap_tag_equals (scm_jit_state *j, uint32_t a, uint16_t b, uint16_t c)
+compile_heap_tag_equals (scm_jit_state *j, uint32_t obj,
+                         uint16_t mask, uint16_t expected)
 {
+  /* Load slot OBJ and let emit_branch_if_heap_object_has_tc test it
+     against MASK and EXPECTED.  The compare result is
+     SCM_F_COMPARE_EQUAL when that branch is taken and
+     SCM_F_COMPARE_NONE otherwise.  */
+  jit_gpr_t val = T0, res = T2;
+
+  emit_sp_ref_scm (j, val, obj);
+  jit_movi (res, SCM_F_COMPARE_EQUAL);
+  /* VAL is passed twice: as the object and as the helper's second
+     register argument.  */
+  jit_node_t *match = emit_branch_if_heap_object_has_tc (j, val, val, mask,
+                                                         expected);
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  jit_patch (match);
+  emit_store_compare_result (j, res);
 }
 
 static void
 compile_eq (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Word-for-word identity test of the SCM values in stack slots A and
+     B.  The compare result is SCM_F_COMPARE_EQUAL when the two words
+     are the same and SCM_F_COMPARE_NONE otherwise.  */
+  jit_node_t *k;
+  emit_sp_ref_scm (j, T0, a);
+  emit_sp_ref_scm (j, T1, b);
+  jit_movi (T2, SCM_F_COMPARE_EQUAL);
+  /* Both operands live in registers, so the register-register form is
+     required; the immediate form would compare T0 against T1's
+     register number.  */
+  k = jit_beqr (T0, T1);
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  jit_patch (k);
+  emit_store_compare_result (j, T2);
 }
 
 static void
 compile_j (scm_jit_state *j, const uint32_t *vcode)
 {
+  /* Unconditional jump.  The jump node is registered together with
+     VCODE through add_inter_instruction_patch.  */
+  jit_node_t *branch = jit_jmpi ();
+  add_inter_instruction_patch (j, branch, vcode);
 }
 
 static void
 compile_jl (scm_jit_state *j, const uint32_t *vcode)
 {
+  /* Branch when the loaded compare result equals
+     SCM_F_COMPARE_LESS_THAN; the branch node is registered together
+     with VCODE through add_inter_instruction_patch.  */
+  jit_gpr_t flags = T0;
+
+  emit_load_compare_result (j, flags);
+  jit_node_t *branch = jit_beqi (flags, SCM_F_COMPARE_LESS_THAN);
+  add_inter_instruction_patch (j, branch, vcode);
 }
 
 static void
 compile_je (scm_jit_state *j, const uint32_t *vcode)
 {
+  /* Branch when the loaded compare result equals SCM_F_COMPARE_EQUAL;
+     the branch node is registered together with VCODE through
+     add_inter_instruction_patch.  */
+  jit_gpr_t flags = T0;
+
+  emit_load_compare_result (j, flags);
+  jit_node_t *branch = jit_beqi (flags, SCM_F_COMPARE_EQUAL);
+  add_inter_instruction_patch (j, branch, vcode);
 }
 
 static void
 compile_jnl (scm_jit_state *j, const uint32_t *vcode)
 {
+  /* Branch when the loaded compare result differs from
+     SCM_F_COMPARE_LESS_THAN; the branch node is registered together
+     with VCODE through add_inter_instruction_patch.  */
+  jit_gpr_t flags = T0;
+
+  emit_load_compare_result (j, flags);
+  jit_node_t *branch = jit_bnei (flags, SCM_F_COMPARE_LESS_THAN);
+  add_inter_instruction_patch (j, branch, vcode);
 }
 
 static void
 compile_jne (scm_jit_state *j, const uint32_t *vcode)
 {
+  /* Branch when the loaded compare result differs from
+     SCM_F_COMPARE_EQUAL; the branch node is registered together with
+     VCODE through add_inter_instruction_patch.  */
+  jit_gpr_t flags = T0;
+
+  emit_load_compare_result (j, flags);
+  jit_node_t *branch = jit_bnei (flags, SCM_F_COMPARE_EQUAL);
+  add_inter_instruction_patch (j, branch, vcode);
 }
 
 static void
 compile_jge (scm_jit_state *j, const uint32_t *vcode)
 {
+  /* Branch when the loaded compare result equals SCM_F_COMPARE_NONE;
+     the branch node is registered together with VCODE through
+     add_inter_instruction_patch.  */
+  jit_gpr_t flags = T0;
+
+  emit_load_compare_result (j, flags);
+  jit_node_t *branch = jit_beqi (flags, SCM_F_COMPARE_NONE);
+  add_inter_instruction_patch (j, branch, vcode);
 }
 
 static void
 compile_jnge (scm_jit_state *j, const uint32_t *vcode)
 {
+  /* Branch when the loaded compare result differs from
+     SCM_F_COMPARE_NONE; the branch node is registered together with
+     VCODE through add_inter_instruction_patch.  */
+  jit_gpr_t flags = T0;
+
+  emit_load_compare_result (j, flags);
+  jit_node_t *branch = jit_bnei (flags, SCM_F_COMPARE_NONE);
+  add_inter_instruction_patch (j, branch, vcode);
 }
 
 static void
 compile_heap_numbers_equal (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Equality of the SCM values in stack slots A and B, computed by
+     calling the heap_numbers_equal_p intrinsic.  A nonzero return
+     value yields SCM_F_COMPARE_EQUAL; zero yields
+     SCM_F_COMPARE_NONE.  */
+  jit_node_t *k;
+  emit_store_current_ip (j, T0);
+  emit_sp_ref_scm (j, T0, a);
+  emit_sp_ref_scm (j, T1, b);
+  emit_call_r_r (j, scm_vm_intrinsics.heap_numbers_equal_p, T0, T1);
+  jit_retval (T0);
+  emit_reload_sp (j);
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  /* Skip the EQUAL assignment only when the predicate returned false;
+     branching on "!= 0" here would invert the result.  */
+  k = jit_beqi (T0, 0);
+  jit_movi (T2, SCM_F_COMPARE_EQUAL);
+  jit_patch (k);
+  emit_store_compare_result (j, T2);
 }
 
 static void
 compile_untag_fixnum (scm_jit_state *j, uint16_t dst, uint16_t a)
 {
+  /* Load the SCM value in slot A, arithmetic-shift it right by two
+     bits and store the result in slot DST as a signed 64-bit
+     integer.  */
+  emit_sp_ref_scm (j, T0, a);
+  jit_rshi (T0, T0, 2);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_set_s64 (j, dst, T0);
+#else
+  /* FIXME: Untested!  */
+  /* The high word is the sign of the low word replicated, as in
+     compile_s32_ref.  This must read register T0; a "movi" of T0 would
+     load the register's number instead of its contents.  */
+  jit_rshi (T1, T0, 31);
+  emit_sp_set_s64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_tag_fixnum (scm_jit_state *j, uint16_t dst, uint16_t a)
 {
+  /* Load the integer in slot A, shift it left by two bits, add
+     scm_tc2_int and store the result in slot DST as an SCM value.  */
+  jit_gpr_t val = T0;
+
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_s64 (j, val, a);
+#else
+  emit_sp_ref_s32 (j, val, a);
+#endif
+  jit_lshi (val, val, 2);
+  jit_addi (val, val, scm_tc2_int);
+  emit_sp_set_scm (j, dst, val);
 }
 
 static void
 compile_srsh (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* Signed (arithmetic) right shift of the 64-bit value in slot A by
+     the count in slot B, masked to the range 0..63.  The result is
+     stored in slot DST.  */
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_s64 (j, T0, a);
+  emit_sp_ref_s64 (j, T1, b);
+  jit_andi (T1, T1, 63);
+  jit_rshr (T0, T0, T1);
+  emit_sp_set_s64 (j, dst, T0);
+#else
+  /* FIXME: Not tested.  */
+  /* The value is the pair T0 (low) / T1 (high); the count's low word
+     is in T2.  T3 is reused as scratch in the short-shift case.  */
+  jit_node_t *zero, *both, *done;
+
+  emit_sp_ref_s64 (j, T0, T1, a);
+  emit_sp_ref_s64 (j, T2, T3, b);
+  jit_andi (T2, T2, 63);
+  zero = jit_beqi (T2, 0);
+  both = jit_blti (T2, 32);
+
+  /* 32 <= s < 64: hi = hi >> 31, lo = hi >> (s-32) */
+  /* jit_subi takes destination, source and immediate.  */
+  jit_subi (T2, T2, 32);
+  jit_rshr (T0, T1, T2);
+  jit_rshi (T1, T1, 31);
+  done = jit_jmpi ();
+
+  jit_patch (both);
+  /* 0 < s < 32: hi = hi >> s, lo = lo >> s + hi << (32-s) */
+  jit_negr (T3, T2);
+  jit_addi (T3, T3, 32);
+  jit_lshr (T3, T1, T3);
+  jit_rshr (T1, T1, T2);
+  jit_rshr_u (T0, T0, T2);
+  jit_addr (T0, T0, T3);
+
+  jit_patch (done);
+  jit_patch (zero);
+  emit_sp_set_s64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_srsh_immediate (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
 {
+  /* Signed (arithmetic) right shift of the 64-bit value in slot A by
+     the constant B, masked to the range 0..63.  The result is stored
+     in slot DST.  */
+  const uint8_t shift = b & 63;
+
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_s64 (j, T0, a);
+  jit_rshi (T0, T0, shift);
+  emit_sp_set_s64 (j, dst, T0);
+#else
+  /* FIXME: Not tested.  */
+  /* The value is the pair T0 (low) / T1 (high).  A shift of zero
+     emits no shift instructions at all.  */
+  emit_sp_ref_s64 (j, T0, T1, a);
+  if (shift >= 32)
+    {
+      /* lo = hi >> (s-32), hi = sign extension.  */
+      jit_rshi (T0, T1, shift - 32);
+      jit_rshi (T1, T1, 31);
+    }
+  else if (shift > 0)
+    {
+      /* hi = hi >> s, lo = (lo >> s) + (hi << (32-s)).  */
+      jit_lshi (T2, T1, 32 - shift);
+      jit_rshi (T1, T1, shift);
+      jit_rshi_u (T0, T0, shift);
+      jit_addr (T0, T0, T2);
+    }
+  emit_sp_set_s64 (j, dst, T0, T1);
+#endif
 }
 
 static void
 compile_s64_imm_numerically_equal (scm_jit_state *j, uint16_t a, int16_t b)
 {
+  /* Compare the 64-bit value in slot A with the immediate B.  The
+     compare result is SCM_F_COMPARE_EQUAL when they are equal and
+     SCM_F_COMPARE_NONE otherwise.  */
+  jit_gpr_t res = T2;
+
+#if SIZEOF_UINTPTR_T >= 8
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  emit_sp_ref_u64 (j, T0, a);
+  jit_node_t *differs = jit_bnei (T0, b);
+  jit_movi (res, SCM_F_COMPARE_EQUAL);
+  jit_patch (differs);
+#else
+  /* T0 is checked against B itself and T1 against B's sign extension
+     (-1 or 0).  */
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  emit_sp_ref_u64 (j, T0, T1, a);
+  jit_node_t *lo_differs = jit_bnei (T0, b);
+  jit_node_t *hi_differs = jit_bnei (T1, b < 0 ? -1 : 0);
+  jit_movi (res, SCM_F_COMPARE_EQUAL);
+  jit_patch (lo_differs);
+  jit_patch (hi_differs);
+#endif
+  emit_store_compare_result (j, res);
 }
 
 static void
 compile_u64_imm_less (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Unsigned test "slot A < immediate B".  The compare result is
+     SCM_F_COMPARE_LESS_THAN when it holds and SCM_F_COMPARE_NONE
+     otherwise.  */
+  jit_gpr_t res = T2;
+
+#if SIZEOF_UINTPTR_T >= 8
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  emit_sp_ref_u64 (j, T0, a);
+  jit_node_t *not_less = jit_bgei_u (T0, b);
+  jit_movi (res, SCM_F_COMPARE_LESS_THAN);
+  jit_patch (not_less);
+#else
+  /* Less only if T0 is below B and T1 is zero.  */
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  emit_sp_ref_u64 (j, T0, T1, a);
+  jit_node_t *lo_not_less = jit_bgei_u (T0, b);
+  jit_node_t *hi_nonzero = jit_bnei (T1, 0);
+  jit_movi (res, SCM_F_COMPARE_LESS_THAN);
+  jit_patch (lo_not_less);
+  jit_patch (hi_nonzero);
+#endif
+  emit_store_compare_result (j, res);
 }
 
 static void
 compile_imm_u64_less (scm_jit_state *j, uint16_t a, uint16_t b)
 {
+  /* Unsigned test "immediate B < slot A", i.e. the slot value is
+     strictly greater than B.  The compare result is
+     SCM_F_COMPARE_LESS_THAN when it holds and SCM_F_COMPARE_NONE
+     otherwise.  */
+  jit_gpr_t res = T2;
+
+#if SIZEOF_UINTPTR_T >= 8
+  jit_movi (res, SCM_F_COMPARE_LESS_THAN);
+  emit_sp_ref_u64 (j, T0, a);
+  jit_node_t *greater = jit_bgti_u (T0, b);
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  jit_patch (greater);
+#else
+  /* Greater if T1 is nonzero, or else if T0 exceeds B.  */
+  jit_movi (res, SCM_F_COMPARE_LESS_THAN);
+  emit_sp_ref_u64 (j, T0, T1, a);
+  jit_node_t *hi_nonzero = jit_bnei (T1, 0);
+  jit_node_t *lo_greater = jit_bgti_u (T0, b);
+  jit_movi (res, SCM_F_COMPARE_NONE);
+  jit_patch (hi_nonzero);
+  jit_patch (lo_greater);
+#endif
+  emit_store_compare_result (j, res);
 }
 
 static void
 compile_s64_imm_less (scm_jit_state *j, uint16_t a, int16_t b)
 {
+  /* Signed test "slot A < immediate B".  The compare result is
+     SCM_F_COMPARE_LESS_THAN when it holds and SCM_F_COMPARE_NONE
+     otherwise.  */
+#if SIZEOF_UINTPTR_T >= 8
+  jit_node_t *k;
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  emit_sp_ref_s64 (j, T0, a);
+  k = jit_bgei (T0, b);
+  jit_movi (T2, SCM_F_COMPARE_LESS_THAN);
+  jit_patch (k);
+#else
+  /* The value is the pair T0 (low) / T1 (high); SIGN is the high word
+     of B sign-extended to 64 bits.  The high words are compared
+     signed.  Once they are known to be equal, the low words are plain
+     magnitudes and have to be compared unsigned: the immediate is
+     sign-extended to a word, which matches the low word of the 64-bit
+     constant.  */
+  jit_node_t *k1, *k2, *k3;
+  int32_t sign = b < 0 ? -1 : 0;
+  jit_movi (T2, SCM_F_COMPARE_LESS_THAN);
+  emit_sp_ref_s64 (j, T0, T1, a);
+  k1 = jit_blti (T1, sign);
+  k2 = jit_bnei (T1, sign);
+  k3 = jit_blti_u (T0, b);
+  jit_patch (k2);
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  jit_patch (k1);
+  jit_patch (k3);
+#endif
+  emit_store_compare_result (j, T2);
 }
 
 static void
 compile_imm_s64_less (scm_jit_state *j, uint16_t a, int16_t b)
 {
+  /* Signed test "immediate B < slot A", i.e. the slot value is
+     strictly greater than B.  The compare result is
+     SCM_F_COMPARE_LESS_THAN when it holds and SCM_F_COMPARE_NONE
+     otherwise.  */
+#if SIZEOF_UINTPTR_T >= 8
+  jit_node_t *k;
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  emit_sp_ref_s64 (j, T0, a);
+  k = jit_blei (T0, b);
+  jit_movi (T2, SCM_F_COMPARE_LESS_THAN);
+  jit_patch (k);
+#else
+  /* The value is the pair T0 (low) / T1 (high); SIGN is the high word
+     of B sign-extended to 64 bits.  The high words are compared
+     signed.  Once they are known to be equal, the low words are plain
+     magnitudes and have to be compared unsigned: the immediate is
+     sign-extended to a word, which matches the low word of the 64-bit
+     constant.  */
+  jit_node_t *k1, *k2, *k3;
+  int32_t sign = b < 0 ? -1 : 0;
+  jit_movi (T2, SCM_F_COMPARE_NONE);
+  emit_sp_ref_s64 (j, T0, T1, a);
+  k1 = jit_blti (T1, sign);
+  k2 = jit_bnei (T1, sign);
+  k3 = jit_blei_u (T0, b);
+  jit_patch (k2);
+  jit_movi (T2, SCM_F_COMPARE_LESS_THAN);
+  jit_patch (k1);
+  jit_patch (k3);
+#endif
+  emit_store_compare_result (j, T2);
 }
 
 static void
-compile_u8_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_u8_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load an unsigned 8-bit value from the pointer in slot PTR plus the
+     offset in slot IDX and store it, zero-extended, in slot DST as a
+     u64.  VAL reuses the base register.  */
+  jit_gpr_t base = T0, off = T1, val = T0;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+  jit_ldxr_uc (val, base, off);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_set_u64 (j, dst, val);
+#else
+  /* The upper word of the result is zero.  */
+  jit_gpr_t hi = T1;
+  jit_movi (hi, 0);
+  emit_sp_set_u64 (j, dst, val, hi);
+#endif
 }
 
 static void
-compile_u16_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_u16_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load an unsigned 16-bit value from the pointer in slot PTR plus
+     the offset in slot IDX and store it, zero-extended, in slot DST as
+     a u64.  VAL reuses the base register.  */
+  jit_gpr_t base = T0, off = T1, val = T0;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+  jit_ldxr_us (val, base, off);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_set_u64 (j, dst, val);
+#else
+  /* The upper word of the result is zero.  */
+  jit_gpr_t hi = T1;
+  jit_movi (hi, 0);
+  emit_sp_set_u64 (j, dst, val, hi);
+#endif
 }
 
 static void
-compile_u32_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_u32_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load an unsigned 32-bit value from the pointer in slot PTR plus
+     the offset in slot IDX and store it, zero-extended, in slot DST as
+     a u64.  VAL reuses the base register.  */
+  jit_gpr_t base = T0, off = T1, val = T0;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+#if SIZEOF_UINTPTR_T >= 8
+  jit_ldxr_ui (val, base, off);
+  emit_sp_set_u64 (j, dst, val);
+#else
+  /* A word-sized load, with the upper word of the result set to
+     zero.  */
+  jit_gpr_t hi = T1;
+  jit_ldxr (val, base, off);
+  jit_movi (hi, 0);
+  emit_sp_set_u64 (j, dst, val, hi);
+#endif
 }
 
 static void
-compile_u64_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_u64_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load a 64-bit value from the pointer in slot PTR plus the offset
+     in slot IDX and store it in slot DST as a u64.  */
+  emit_sp_ref_ptr (j, T0, ptr);
+  emit_sp_ref_sz (j, T1, idx);
+#if SIZEOF_UINTPTR_T >= 8
+  jit_ldxr (T0, T0, T1);
+  emit_sp_set_u64 (j, dst, T0);
+#else
+  /* Two word loads.  emit_sp_set_u64 takes the low word first (T0) and
+     the high word second (T1), so the word at offset 4 is the low half
+     on big-endian targets and the high half otherwise.  This mirrors
+     the layout written by compile_u64_set.  T0 holds the address and
+     is therefore loaded last in both cases.  */
+  jit_addr (T0, T0, T1);
+  if (BIGENDIAN)
+    {
+      jit_ldr (T1, T0);
+      jit_ldxi (T0, T0, 4);
+    }
+  else
+    {
+      jit_ldxi (T1, T0, 4);
+      jit_ldr (T0, T0);
+    }
+  emit_sp_set_u64 (j, dst, T0, T1);
+#endif
 }
 
 static void
-compile_u8_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_u8_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  /* Store the low 8 bits of the u64 in slot V at the pointer in slot
+     PTR plus the offset in slot IDX.  */
+  jit_gpr_t base = T0, off = T1, val = T2;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, val, v);
+#else
+  emit_sp_ref_u64_lower_half (j, val, v);
+#endif
+  jit_stxr_c (base, off, val);
 }
 
 static void
-compile_u16_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_u16_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  /* Store the low 16 bits of the u64 in slot V at the pointer in slot
+     PTR plus the offset in slot IDX.  */
+  jit_gpr_t base = T0, off = T1, val = T2;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, val, v);
+#else
+  emit_sp_ref_u64_lower_half (j, val, v);
+#endif
+  jit_stxr_s (base, off, val);
 }
 
 static void
-compile_u32_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_u32_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  /* Store the low 32 bits of the u64 in slot V at the pointer in slot
+     PTR plus the offset in slot IDX.  */
+  jit_gpr_t base = T0, off = T1, val = T2;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_ref_u64 (j, val, v);
+  jit_stxr_i (base, off, val);
+#else
+  /* A word-sized store of the lower half.  */
+  emit_sp_ref_u64_lower_half (j, val, v);
+  jit_stxr (base, off, val);
+#endif
 }
 
 static void
-compile_u64_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_u64_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  /* Store the u64 in slot V at the pointer in slot PTR plus the offset
+     in slot IDX.  */
+  jit_gpr_t base = T0, off = T1;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+#if SIZEOF_UINTPTR_T >= 8
+  jit_gpr_t val = T2;
+  emit_sp_ref_u64 (j, val, v);
+  jit_stxr (base, off, val);
+#else
+  /* Fold the offset into the base first so that the offset register
+     can be reused for the low word.  The two words are then written
+     in memory order: high word first on big-endian targets, low word
+     first otherwise.  */
+  jit_gpr_t lo = T1, hi = T2;
+  jit_addr (base, base, off);
+  emit_sp_ref_u64 (j, lo, hi, v);
+  if (BIGENDIAN)
+    {
+      jit_str (base, hi);
+      jit_stxi (4, base, lo);
+    }
+  else
+    {
+      jit_str (base, lo);
+      jit_stxi (4, base, hi);
+    }
+#endif
 }
 
 static void
-compile_s8_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_s8_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load a signed 8-bit value from the pointer in slot PTR plus the
+     offset in slot IDX and store it, sign-extended, in slot DST.  VAL
+     reuses the base register.  */
+  jit_gpr_t base = T0, off = T1, val = T0;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+  jit_ldxr_c (val, base, off);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_set_s64 (j, dst, val);
+#else
+  /* The upper word is the loaded value shifted right arithmetically
+     by 7, i.e. its sign replicated.  */
+  jit_gpr_t hi = T1;
+  jit_rshi (hi, val, 7);
+  emit_sp_set_u64 (j, dst, val, hi);
+#endif
 }
 
 static void
-compile_s16_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_s16_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load a signed 16-bit value from the pointer in slot PTR plus the
+     offset in slot IDX and store it, sign-extended, in slot DST.  VAL
+     reuses the base register.  */
+  jit_gpr_t base = T0, off = T1, val = T0;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+  jit_ldxr_s (val, base, off);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_set_s64 (j, dst, val);
+#else
+  /* The upper word is the loaded value shifted right arithmetically
+     by 15, i.e. its sign replicated.  */
+  jit_gpr_t hi = T1;
+  jit_rshi (hi, val, 15);
+  emit_sp_set_u64 (j, dst, val, hi);
+#endif
 }
 
 static void
-compile_s32_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_s32_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load a signed 32-bit value from the pointer in slot PTR plus the
+     offset in slot IDX and store it, sign-extended, in slot DST.  VAL
+     reuses the base register.  */
+  jit_gpr_t base = T0, off = T1, val = T0;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+  jit_ldxr_i (val, base, off);
+#if SIZEOF_UINTPTR_T >= 8
+  emit_sp_set_s64 (j, dst, val);
+#else
+  /* The upper word is the loaded value shifted right arithmetically
+     by 31, i.e. its sign replicated.  */
+  jit_gpr_t hi = T1;
+  jit_rshi (hi, val, 31);
+  emit_sp_set_u64 (j, dst, val, hi);
+#endif
 }
 
 static void
-compile_s64_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_s64_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  compile_u64_ref (j, dst, ptr, idx);  /* Delegates to the u64 emitter with the same operands; a full 64-bit load needs no extension.  */
 }
 
 static void
-compile_s8_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_s8_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  compile_u8_set (j, ptr, idx, v);  /* Delegates to the u8 emitter with the same operands.  */
 }
 
 static void
-compile_s16_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_s16_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  compile_u16_set (j, ptr, idx, v);  /* Delegates to the u16 emitter with the same operands.  */
 }
 
 static void
-compile_s32_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_s32_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  compile_u32_set (j, ptr, idx, v);  /* Delegates to the u32 emitter with the same operands.  */
 }
 
 static void
-compile_s64_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_s64_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  compile_u64_set (j, ptr, idx, v);  /* Delegates to the u64 emitter with the same operands.  */
 }
 
 static void
-compile_f32_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_f32_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load a single-precision float from the pointer in slot PTR plus
+     the offset in slot IDX, widen it to double and store it in slot
+     DST as an f64.  */
+  jit_gpr_t base = T0, off = T1;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+  jit_ldxr_f (JIT_F0, base, off);
+  jit_extr_f_d (JIT_F0, JIT_F0);
+  emit_sp_set_f64 (j, dst, JIT_F0);
 }
 
 static void
-compile_f64_ref (scm_jit_state *j, uint8_t dst, uint8_t a, uint8_t b)
+compile_f64_ref (scm_jit_state *j, uint8_t dst, uint8_t ptr, uint8_t idx)
 {
+  /* Load a double from the pointer in slot PTR plus the offset in slot
+     IDX and store it in slot DST as an f64.  */
+  jit_gpr_t base = T0, off = T1;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+  jit_ldxr_d (JIT_F0, base, off);
+  emit_sp_set_f64 (j, dst, JIT_F0);
 }
 
 static void
-compile_f32_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_f32_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  /* Narrow the f64 in slot V to single precision and store it at the
+     pointer in slot PTR plus the offset in slot IDX.  */
+  emit_sp_ref_ptr (j, T0, ptr);
+  emit_sp_ref_sz (j, T1, idx);
+  emit_sp_ref_f64 (j, JIT_F0, v);
+  jit_extr_d_f (JIT_F0, JIT_F0);
+  /* After the conversion JIT_F0 holds a float, so it must go out with
+     the float store; the double store would write eight bytes of the
+     wrong representation.  */
+  jit_stxr_f (T0, T1, JIT_F0);
 }
 
 static void
-compile_f64_set (scm_jit_state *j, uint8_t a, uint8_t b, uint8_t c)
+compile_f64_set (scm_jit_state *j, uint8_t ptr, uint8_t idx, uint8_t v)
 {
+  /* Store the f64 in slot V at the pointer in slot PTR plus the offset
+     in slot IDX.  */
+  jit_gpr_t base = T0, off = T1;
+
+  emit_sp_ref_ptr (j, base, ptr);
+  emit_sp_ref_sz (j, off, idx);
+  emit_sp_ref_f64 (j, JIT_F0, v);
+  jit_stxr_d (base, off, JIT_F0);
 }
 
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]