[PATCH 5/5] x86_64: add 64-bit syscall entry point


From: Luca Dariz
Subject: [PATCH 5/5] x86_64: add 64-bit syscall entry point
Date: Mon, 27 Feb 2023 21:45:01 +0100

While in theory we could still use the same call gate as for 32-bit
userspace, that approach doesn't seem very common, and gcc does not
seem to encode the instruction properly. Instead we use syscall/sysret,
as other kernels do (e.g. XNU, Linux). This version still has some
limitations, but it should be enough to start working on the 64-bit
user space.
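
For illustration, a hand-written user-space stub equivalent to the
kernel_trap macro below could look like the following sketch (the -26
trap number for mach_reply_port follows mach/syscall_sw.h; the helper
name is illustrative and not part of this patch):

    /* Sketch: enter the kernel via syscall, as the kernel_trap macro
     * does.  syscall clobbers RCX (return RIP) and R11 (RFLAGS), so
     * both must be declared clobbered. */
    static inline long mach_trap64(long trap_number)
    {
        long ret;
        __asm__ volatile ("syscall"
                          : "=a" (ret)
                          : "a" (trap_number)
                          : "rcx", "r11", "memory");
        return ret;
    }

    /* e.g. mach_port_t reply = (mach_port_t) mach_trap64(-26); */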

* i386/i386/i386asm.sym: add more constants to fill pcb->iss
* i386/i386/ldt.c: configure 64-bit syscall entry point
* i386/i386/ldt.h: swap CS/DS segments order if !USER32 as required by
  sysret
* i386/i386/locore.h: add syscall64 and MSR definitions
* i386/include/mach/i386/syscall_sw.h: add a simple entry point from
  user space. This is just for simple tests; glibc does not seem to
  use it
* x86_64/locore.S: implement syscall64 entry point
---
 i386/i386/i386asm.sym               |  11 +++
 i386/i386/ldt.c                     |  15 ++-
 i386/i386/ldt.h                     |   7 +-
 i386/i386/locore.h                  |  29 ++++++
 i386/include/mach/i386/syscall_sw.h |  16 ++--
 x86_64/locore.S                     | 136 ++++++++++++++++++++++++++++
 6 files changed, 204 insertions(+), 10 deletions(-)

diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
index 8317db6c..733cc4eb 100644
--- a/i386/i386/i386asm.sym
+++ b/i386/i386/i386asm.sym
@@ -52,6 +52,8 @@ expr  CALL_SINGLE_FUNCTION_BASE
 
 offset ApicLocalUnit           lu      apic_id         APIC_ID
 
+offset pcb                     pcb     iss
+
 offset thread                  th      pcb
 offset thread                  th      task
 offset thread                  th      recover
@@ -82,9 +84,15 @@ size i386_kernel_state       iks
 
 size   i386_exception_link     iel
 
+offset i386_saved_state        r       gs
+offset i386_saved_state        r       fs
 offset i386_saved_state        r       cs
 offset i386_saved_state        r       uesp
 offset i386_saved_state        r       eax
+offset i386_saved_state        r       ebx
+offset i386_saved_state        r       ecx
+offset i386_saved_state        r       edx
+offset i386_saved_state        r       ebp
 offset i386_saved_state        r       trapno
 offset i386_saved_state        r       err
 offset i386_saved_state        r       efl             R_EFLAGS
@@ -92,6 +100,9 @@ offset       i386_saved_state        r       eip
 offset i386_saved_state        r       cr2
 offset i386_saved_state        r       edi
 #ifdef __x86_64__
+offset i386_saved_state        r       r12
+offset i386_saved_state        r       r13
+offset i386_saved_state        r       r14
 offset i386_saved_state        r       r15
 #endif
 
diff --git a/i386/i386/ldt.c b/i386/i386/ldt.c
index b86a0e3c..61a03d65 100644
--- a/i386/i386/ldt.c
+++ b/i386/i386/ldt.c
@@ -31,6 +31,7 @@
 #include <mach/xen.h>
 
 #include <intel/pmap.h>
+#include <kern/debug.h>
 
 #include "vm_param.h"
 #include "seg.h"
@@ -65,10 +66,22 @@ ldt_fill(struct real_descriptor *myldt, struct real_descriptor *mygdt)
                                ACC_PL_K|ACC_LDT, 0);
 #endif /* MACH_PV_DESCRIPTORS */
 
-       /* Initialize the 32bit LDT descriptors.  */
+       /* Initialize the syscall entry point */
+#if defined(__x86_64__) && ! defined(USER32)
+        if (!(CPU_HAS_FEATURE(CPU_FEATURE_MSR) && CPU_HAS_FEATURE(CPU_FEATURE_SEP)))
+            panic("syscall support is missing on 64 bit");
+        /* Enable 64-bit syscalls */
+        wrmsr(MSR_REG_EFER, rdmsr(MSR_REG_EFER) | MSR_EFER_SCE);
+        wrmsr(MSR_REG_LSTAR, (vm_offset_t)syscall64);
+        wrmsr(MSR_REG_STAR, ((((long)USER_CS - 16) << 16) | (long)KERNEL_CS) << 32);
+        wrmsr(MSR_REG_FMASK, 0);  /* TODO: decide which RFLAGS bits to mask on entry; 0 masks none */
+#else /* defined(__x86_64__) && ! defined(USER32) */
        fill_ldt_gate(myldt, USER_SCALL,
                      (vm_offset_t)&syscall, KERNEL_CS,
                      ACC_PL_U|ACC_CALL_GATE, 0);
+#endif /* defined(__x86_64__) && ! defined(USER32) */
+
+       /* Initialize the 32bit LDT descriptors.  */
        fill_ldt_descriptor(myldt, USER_CS,
                            VM_MIN_USER_ADDRESS,
                            VM_MAX_USER_ADDRESS-VM_MIN_USER_ADDRESS-4096,
diff --git a/i386/i386/ldt.h b/i386/i386/ldt.h
index b15f11a5..4490f99f 100644
--- a/i386/i386/ldt.h
+++ b/i386/i386/ldt.h
@@ -45,9 +45,14 @@
 #define        USER_SCALL      0x07            /* system call gate */
 #ifdef __x86_64__
 /* Call gate needs two entries */
-#endif
+
+/* The sysret instruction puts some constraints on the user segment indexes */
+#define        USER_CS         0x1f            /* user code segment */
+#define        USER_DS         0x17            /* user data segment */
+#else
 #define        USER_CS         0x17            /* user code segment */
 #define        USER_DS         0x1f            /* user data segment */
+#endif
 
 #define        LDTSZ           4
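
As context for the swapped selectors above: in 64-bit mode, sysret
reloads CS from IA32_STAR[63:48] + 16 and SS from IA32_STAR[63:48] + 8,
so USER_CS must sit 8 bytes above USER_DS, and ldt.c programs STAR with
"USER_CS - 16" accordingly. A minimal sketch of the resulting STAR
layout (assuming KERNEL_CS is 0x08 as in i386/i386/seg.h; not part of
this patch):

    #include <stdint.h>

    #define KERNEL_CS 0x08  /* assumed kernel code selector */
    #define USER_CS   0x1f  /* selectors from the ldt.h hunk above */
    #define USER_DS   0x17

    static uint64_t star_value(void)
    {
        uint64_t base = (uint64_t)USER_CS - 16;  /* 0x0f */
        /* sysretq loads CS = base + 16 (= USER_CS) and SS = base + 8
         * (= USER_DS); syscall loads CS from STAR[47:32] (= KERNEL_CS)
         * and SS = KERNEL_CS + 8. */
        return (base << 48) | ((uint64_t)KERNEL_CS << 32);
    }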
 
diff --git a/i386/i386/locore.h b/i386/i386/locore.h
index a8807dbf..39545ff5 100644
--- a/i386/i386/locore.h
+++ b/i386/i386/locore.h
@@ -57,6 +57,7 @@ extern int inst_fetch (int eip, int cs);
 extern void cpu_shutdown (void);
 
 extern int syscall (void);
+extern int syscall64 (void);
 
 extern unsigned int cpu_features[2];
 
@@ -93,5 +94,33 @@ extern unsigned int cpu_features[2];
 
#define CPU_HAS_FEATURE(feature) (cpu_features[(feature) / 32] & (1 << ((feature) % 32)))
 
+#define MSR_REG_EFER  0xC0000080
+#define MSR_REG_STAR  0xC0000081
+#define MSR_REG_LSTAR 0xC0000082
+#define MSR_REG_CSTAR 0xC0000083
+#define MSR_REG_FMASK 0xC0000084
+
+#define MSR_EFER_SCE  0x00000001
+
+static inline void wrmsr(uint32_t regaddr, uint64_t value)
+{
+    uint32_t low=(uint32_t)value, high=((uint32_t)(value >> 32));
+    asm volatile("wrmsr\n"                              \
+                 :                                      \
+                 : "c" (regaddr), "a" (low), "d" (high) \
+                 : "memory"                             \
+        );
+}
+
+static inline uint64_t rdmsr(uint32_t regaddr)
+{
+    uint32_t low, high;
+    asm volatile("rdmsr\n"                              \
+                 : "=a" (low), "=d" (high)              \
+                 : "c" (regaddr)                        \
+        );
+    return ((uint64_t)high << 32) | low;
+}
+
 #endif /* _MACHINE__LOCORE_H_ */
 
diff --git a/i386/include/mach/i386/syscall_sw.h b/i386/include/mach/i386/syscall_sw.h
index 86f6ff2f..20ef7c13 100644
--- a/i386/include/mach/i386/syscall_sw.h
+++ b/i386/include/mach/i386/syscall_sw.h
@@ -29,16 +29,16 @@
 
 #include <mach/machine/asm.h>
 
-#if BSD_TRAP
-#define kernel_trap(trap_name,trap_number,number_args) \
-ENTRY(trap_name) \
-       movl    $ trap_number,%eax; \
-       SVC; \
-       jb LCL(cerror); \
-       ret; \
+#if defined(__x86_64__) && ! defined(USER32)
+#define kernel_trap(trap_name,trap_number,number_args)  \
+ENTRY(trap_name)                                       \
+       movq    $ trap_number,%rax;                     \
+       movq    %rcx,%r10;      /* syscall clobbers RCX: arg 4 goes in R10 */ \
+       syscall;                                        \
+       ret;                                            \
 END(trap_name)
 #else
-#define kernel_trap(trap_name,trap_number,number_args) \
+#define kernel_trap(trap_name,trap_number,number_args)  \
 ENTRY(trap_name) \
        movl    $ trap_number,%eax; \
        SVC; \
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 47d9085c..fdf7300b 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -1281,6 +1281,142 @@ DATA(cpu_features_ecx)
 
 END(syscall)
 
+
+/* Entry point for 64-bit syscalls.
+ * On entry we're still on the user stack, so better not use it. Instead we
+ * save the thread state immediately in thread->pcb->iss, then try to invoke
+ * the syscall.
+ * TODO:
+     - for now we assume the return address is canonical, but apparently there
+       can be cases where it's not (see how Linux handles this). Does it apply
+       here?
+     - do we need to check for ast on syscalls? Maybe checking on interrupts is enough
+     - check that the case where a task is suspended, and later returns via
+       iretq from return_from_trap, works fine in all combinations
+     - emulated syscalls - are they used anywhere?
+ */
+ENTRY(syscall64)
+       /* RFLAGS[32:63] are reserved, so combine syscall num (32 bit) and
+        * eflags in RAX to allow using r11 as temporary register */
+       shlq    $32,%r11
+       shlq    $32,%rax        /* make sure bits 32:63 of %rax are zero */
+       shrq    $32,%rax
+       or      %r11,%rax
+
+       /* Save thread state in pcb->iss, as on exception entry.
+        * Since this is triggered synchronously from userspace, we can
+        * save only the callee-preserved status according to the C ABI,
+        * plus RIP and EFLAGS for sysret */
+       CPU_NUMBER(%r11)
+       movq    CX(EXT(active_threads),%r11),%r11 /* point to current thread */
+       movq    TH_PCB(%r11),%r11               /* point to pcb */
+       addq    $ PCB_ISS,%r11                  /* point to saved state */
+
+       mov     %gs,R_GS(%r11)
+       mov     %fs,R_FS(%r11)
+       mov     %rsp,R_UESP(%r11)       /* callee-preserved register */
+       mov     %rcx,R_EIP(%r11)        /* syscall places user RIP in RCX */
+       mov     %rbx,R_EBX(%r11)        /* callee-preserved register */
+       mov     %rax,%rbx               /* Now we can unpack eflags again */
+       shr     $32,%rbx
+       mov     %rbx,R_EFLAGS(%r11)     /* ... and save them in pcb as well */
+       mov     %rbp,R_EBP(%r11)        /* callee-preserved register */
+       mov     %r12,R_R12(%r11)        /* callee-preserved register */
+       mov     %r13,R_R13(%r11)        /* callee-preserved register */
+       mov     %r14,R_R14(%r11)        /* callee-preserved register */
+       mov     %r15,R_R15(%r11)        /* callee-preserved register */
+       mov     %r11,%rbx               /* prepare for error handling */
+       mov     %r10,%rcx               /* fix arg3 location according to C ABI */
+
+       /* switch to kernel stack */
+       CPU_NUMBER(%r11)
+       movq    CX(EXT(kernel_stack),%r11),%rsp
+
+       /* Now we have saved state and args 1-6 are in place.
+        * Before invoking the syscall we do some bounds checking and,
+        * if we have more than 6 arguments, we need to copy the
+        * remaining ones to the kernel stack, handling page faults when
+        * accessing the user stack.
+        */
+       shlq    $32,%rax                /* make sure bits 32:63 of %rax are zero */
+       shrq    $32,%rax
+       negl    %eax                    /* get system call number */
+       jl      _syscall64_range        /* out of range if it was positive */
+       cmpl    EXT(mach_trap_count),%eax       /* check system call table bounds */
+       jg      _syscall64_range        /* error if out of range */
+       shll    $5,%eax                 /* manual indexing of mach_trap_t */
+
+       /* check if we need to place some arguments on the stack */
+_syscall64_args_stack:
+       mov     EXT(mach_trap_table)(%rax),%r10 /* get number of arguments */
+       subq    $6,%r10                 /* the first 6 args are already in place */
+       jle     _syscall64_call         /* skip argument copy if no args on the stack */
+
+       movq    R_UESP(%rbx),%r11       /* get user stack pointer */
+       addq    $8,%r11                 /* Skip user return address */
+
+       mov     $USER_DS,%r12           /* use user data segment for accesses */
+       mov     %r12,%fs
+
+       lea     (%r11,%r10,8),%r11      /* point past last argument */
+       xorq    %r12,%r12
+
+0:     subq    $8,%r11
+       RECOVER(_syscall64_addr_push)
+       mov     %fs:(%r11),%r12
+       pushq   %r12                    /* push argument on stack */
+       dec     %r10
+       jnz     0b                      /* loop for all remaining arguments */
+
+_syscall64_call:
+       call    *EXT(mach_trap_table)+8(%rax)  /* call procedure */
+       // XXX: check ast on exit?
+
+       /* avoid leaking information in callee-clobbered registers */
+       mov     $0,%rdi
+       mov     $0,%rsi
+       mov     $0,%rdx
+       mov     $0,%r10
+       mov     $0,%r9
+       mov     $0,%r8
+
+       /* restore thread state and return to user using sysret */
+       CPU_NUMBER(%r11)
+       movq    CX(EXT(active_threads),%r11),%r11 /* point to current thread */
+       movq    TH_PCB(%r11),%r11               /* point to pcb */
+       addq    $ PCB_ISS,%r11                  /* point to saved state */
+
+       mov     R_GS(%r11),%gs
+       mov     R_FS(%r11),%fs
+       mov     R_UESP(%r11),%rsp       /* callee-preserved register,
+                                        * switch to user stack */
+       mov     R_EIP(%r11),%rcx        /* sysret convention */
+       mov     R_EBX(%r11),%rbx        /* callee-preserved register */
+       mov     R_EBP(%r11),%rbp        /* callee-preserved register */
+       mov     R_R12(%r11),%r12        /* callee-preserved register */
+       mov     R_R13(%r11),%r13        /* callee-preserved register */
+       mov     R_R14(%r11),%r14        /* callee-preserved register */
+       mov     R_R15(%r11),%r15        /* callee-preserved register */
+       mov     R_EFLAGS(%r11),%r11     /* sysret convention */
+
+       sysretq         /* fast return to user-space, the thread didn't block */
+
+/* Error handling fragments, from here we jump directly to the trap handler */
+_syscall64_addr_push:
+       movq    %rbx,%rsp               /* clean parameters from stack */
+       movq    %r11,R_CR2(%rbx)        /* set fault address */
+       movq    $(T_PAGE_FAULT),R_TRAPNO(%rbx)  /* set page-fault trap */
+       movq    $(T_PF_USER),R_ERR(%rbx) /* set error code - read user space */
+       jmp     _take_trap              /* treat as a trap */
+
+_syscall64_range:
+       movq    $(T_INVALID_OPCODE),R_TRAPNO(%rbx)
+                                       /* set invalid-opcode trap */
+       movq    $0,R_ERR(%rbx)          /* clear error code */
+       jmp     _take_trap              /* treat as a trap */
+
+END(syscall64)
+
 /* Discover what kind of cpu we have; return the family number
    (3, 4, 5, 6, for 386, 486, 586, 686 respectively).  */
 ENTRY(discover_x86_cpu_type)
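
To follow the bound check in syscall64: user space passes the negative
Mach trap number in RAX, negl makes it positive, anything that ends up
negative or above mach_trap_count takes the _syscall64_range path, and
shll $5 turns the number into a byte offset because each mach_trap_t
entry is 32 bytes. A C model of just this check (the function name and
the example trap count are illustrative, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Returns the byte offset into mach_trap_table, or -1 for the
     * out-of-range path (_syscall64_range above). */
    static int trap_table_offset(int64_t user_rax, int mach_trap_count)
    {
        int num = -(int32_t)user_rax;   /* negl %eax */
        if (num < 0)                    /* jl _syscall64_range */
            return -1;
        if (num > mach_trap_count)      /* jg _syscall64_range */
            return -1;
        return num << 5;                /* shll $5: 32-byte entries */
    }

    int main(void)
    {
        /* mach_msg_trap is -25 in mach/syscall_sw.h */
        printf("%d\n", trap_table_offset(-25, 60));  /* prints 800 */
        return 0;
    }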
-- 
2.30.2



