/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/* All Rights Reserved */
/* Copyright (c) 1987, 1988 Microsoft Corporation */
/* All Rights Reserved */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/psw.h>
#include <sys/x86_archext.h>
#if defined(__lint)
#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>
#else /* __lint */
#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/panic.h>
#include "assym.h"
#endif /* __lint */
/*
 * We implement two flavours of system call entry points:
*
* - {int,lcall}/iret (i386)
* - sysenter/sysexit (Pentium II and beyond)
*
 * The basic pattern used in the handlers is to check whether we can
 * do the fast (simple) version of the system call; if we can't, we
 * fall back to various C routines that handle corner cases and debugging.
*
* To reduce the amount of assembler replication, yet keep the system call
* implementations vaguely comprehensible, the common code in the body
* of the handlers is broken up into a set of preprocessor definitions
* below.
*/
/*
* When we have SYSCALLTRACE defined, we sneak an extra
* predicate into a couple of tests.
*/
#if defined(SYSCALLTRACE)
#define ORL_SYSCALLTRACE(r32) \
orl syscalltrace, r32
#else
#define ORL_SYSCALLTRACE(r32)
#endif
/*
 * This check is false whenever we want to go fast, i.e.
*
* if (code >= NSYSCALL ||
* t->t_pre_sys || (t->t_proc_flag & TP_WATCHPT) != 0)
* do full version
* #ifdef SYSCALLTRACE
* if (syscalltrace)
* do full version
* #endif
*
* Preconditions:
* - t curthread
* - code contains the syscall number
* Postconditions:
* - %ecx and %edi are smashed
* - condition code flag ZF is cleared if pre-sys is too complex
*/
#define CHECK_PRESYS_NE(t, code) \
movzbl T_PRE_SYS(t), %edi; \
movzwl T_PROC_FLAG(t), %ecx; \
andl $TP_WATCHPT, %ecx; \
orl %ecx, %edi; \
cmpl $NSYSCALL, code; \
setae %cl; \
movzbl %cl, %ecx; \
orl %ecx, %edi; \
ORL_SYSCALLTRACE(%edi)
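/*
 * MSTATE_TRANSITION below is just a cdecl call of the C microstate
 * accounting routine; in C it is simply:
 *
 *	syscall_mstate(from, to);
 */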
#define MSTATE_TRANSITION(from, to) \
pushl $to; \
pushl $from; \
call syscall_mstate; \
addl $0x8, %esp
/*
 * aka CPU_STATS_ADDQ(CPU, sys.syscall, 1)
 * This must be called with interrupts or preemption disabled: the
 * add/add-with-carry pair is a non-atomic 64-bit increment of a
 * per-CPU counter, so the thread must not migrate off the CPU
 * between the two instructions.
 */
#define CPU_STATS_SYS_SYSCALL_INC \
addl $1, %gs:CPU_STATS_SYS_SYSCALL; \
adcl $0, %gs:CPU_STATS_SYS_SYSCALL+4;
#if !defined(__lint)
/*
* ASSERT(lwptoregs(lwp) == rp);
*
* this may seem obvious, but very odd things happen if this
* assertion is false
*
* Preconditions:
* -none-
* Postconditions (if assertion is true):
* %esi and %edi are smashed
*/
#if defined(DEBUG)
__lwptoregs_msg:
.string "%M%:%d lwptoregs(%p) [%p] != rp [%p]"
#define ASSERT_LWPTOREGS(t, rp) \
movl T_LWP(t), %esi; \
movl LWP_REGS(%esi), %edi; \
cmpl rp, %edi; \
je 7f; \
pushl rp; \
pushl %edi; \
pushl %esi; \
pushl $__LINE__; \
pushl $__lwptoregs_msg; \
call panic; \
7:
#else
#define ASSERT_LWPTOREGS(t, rp)
#endif
#endif /* __lint */
/*
* This is an assembler version of this fragment:
*
* lwp->lwp_state = LWP_SYS;
* lwp->lwp_ru.sysc++;
* lwp->lwp_eosys = NORMALRETURN;
* lwp->lwp_ap = argp;
*
* Preconditions:
* -none-
* Postconditions:
* -none-
*/
#define SET_LWP(lwp, argp) \
movb $LWP_SYS, LWP_STATE(lwp); \
addl $1, LWP_RU_SYSC(lwp); \
adcl $0, LWP_RU_SYSC+4(lwp); \
movb $NORMALRETURN, LWP_EOSYS(lwp); \
movl argp, LWP_AP(lwp)
/*
* Set up the thread, lwp, find the handler, and copy
* in the arguments from userland to the kernel stack.
*
* Preconditions:
* - %eax contains the syscall number
* Postconditions:
* - %eax contains a pointer to the sysent structure
* - %ecx is zeroed
* - %esi, %edi are smashed
* - %esp is SYS_DROPped ready for the syscall
*/
#define SIMPLE_SYSCALL_PRESYS(t, faultlabel) \
movl T_LWP(t), %esi; \
movw %ax, T_SYSNUM(t); \
subl $SYS_DROP, %esp; \
shll $SYSENT_SIZE_SHIFT, %eax; \
SET_LWP(%esi, %esp); \
leal sysent(%eax), %eax; \
movzbl SY_NARG(%eax), %ecx; \
testl %ecx, %ecx; \
jz 4f; \
movl %esp, %edi; \
movl SYS_DROP + REGOFF_UESP(%esp), %esi; \
movl $faultlabel, T_LOFAULT(t); \
addl $4, %esi; \
rep; \
smovl; \
movl %ecx, T_LOFAULT(t); \
4:
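/*
 * In C, the argument-copy portion of SIMPLE_SYSCALL_PRESYS is
 * approximately (a sketch; faults during the copy vector through
 * t_lofault to faultlabel):
 *
 *	t->t_sysnum = code;
 *	callp = &sysent[code];
 *	if (callp->sy_narg != 0)
 *		copy sy_narg 32-bit args from just above the return
 *		    address at rp->r_uesp into lwp->lwp_ap;
 */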
/*
 * Check to see if a simple return is possible, i.e.
*
* if ((t->t_post_sys_ast | syscalltrace) != 0)
* do full version;
*
* Preconditions:
* - t is curthread
* Postconditions:
* - condition code NE is set if post-sys is too complex
* - rtmp is zeroed if it isn't (we rely on this!)
*/
#define CHECK_POSTSYS_NE(t, rtmp) \
xorl rtmp, rtmp; \
ORL_SYSCALLTRACE(rtmp); \
orl T_POST_SYS_AST(t), rtmp; \
cmpl $0, rtmp
/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * - zwreg contains zero
 * Postconditions:
 * - %esp has been unSYS_DROPped
 * - %esi is smashed (points to lwp)
 * - PS_C (carry, which userland reads as the syscall error flag) is
 *   cleared in the saved EFLAGS
 */
#define SIMPLE_SYSCALL_POSTSYS(t, zwreg) \
movl T_LWP(t), %esi; \
addl $SYS_DROP, %esp; \
movw zwreg, T_SYSNUM(t); \
movb $LWP_USER, LWP_STATE(%esi); \
andb $_CONST(0xffff - PS_C), REGOFF_EFL(%esp)
/*
* System call handler. This is the destination of both the call
* gate (lcall 0x27) _and_ the interrupt gate (int 0x91). For our purposes,
* there are two significant differences between an interrupt gate and a call
* gate:
*
* 1) An interrupt gate runs the handler with interrupts disabled, whereas a
* call gate runs the handler with whatever EFLAGS settings were in effect at
* the time of the call.
*
* 2) An interrupt gate pushes the contents of the EFLAGS register at the time
* of the interrupt onto the stack, whereas a call gate does not.
*
 * Because we use the following code sequence to handle system calls made from
 * _both_ a call gate _and_ an interrupt gate, these two differences must be
 * respected. With regard to number 1) above, the handler must ensure that a
 * sane EFLAGS snapshot is stored on the stack so that when the kernel returns
 * to the user via iret (which returns to user with the EFLAGS value saved on
 * the stack), interrupts are re-enabled.
 *
 * With regard to number 2) above, the handler must always put a current
 * snapshot of EFLAGS onto the stack in the appropriate place. If we came in
 * via an interrupt gate, we will be clobbering the EFLAGS value that was
 * pushed by the interrupt gate. This is OK, as the only bit that was changed
 * by the hardware was the IF (interrupt enable) bit, which for an interrupt
 * gate is now off. If we were to do nothing, the stack would contain an
 * EFLAGS value with IF off, and we would eventually return to the user with
 * interrupts disabled. The solution is to turn the IF bit back on in the
 * EFLAGS value saved on the stack.
*
* Another subtlety which deserves mention is the difference between the two
* descriptors. The call gate descriptor is set to instruct the hardware to copy
* one parameter from the user stack to the kernel stack, whereas the interrupt
* gate descriptor doesn't use the parameter passing mechanism at all. The
* kernel doesn't actually use the parameter that is copied by the hardware; the
* only reason it does this is so that there is a space on the stack large
* enough to hold an EFLAGS register value, which happens to be in the correct
* place for use by iret when we go back to userland. How convenient.
*
* Stack frame description in syscall() and callees.
*
* |------------|
* | regs | +(8*4)+4 registers
* |------------|
* | 8 args | <- %esp MAXSYSARGS (currently 8) arguments
* |------------|
*
*/
#define SYS_DROP _CONST(_MUL(MAXSYSARGS, 4))
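/*
 * For comparison with the sysenter example later in this file, a
 * userland stub typically reaches sys_call like this (an illustrative
 * sketch only; the real stubs live in libc):
 *
 *	<args on the stack>
 *	movl	$SYS_callnum, %eax
 *	lcall	$0x27, $0	/ via the call gate; or: int $0x91
 */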
#if defined(__lint)
/*ARGSUSED*/
void
sys_call()
{}
void
_allsyscalls()
{}
size_t _allsyscalls_size;
#else /* __lint */
ENTRY_NP2(sys_call, _allsyscalls)
/ on entry eax = system call number
/ set up the stack to look as in reg.h
subl $8, %esp / pad the stack with ERRCODE and TRAPNO
SYSCALL_PUSH
#ifdef TRAPTRACE
TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSCALL) / Uses labels "8" and "9"
TRACE_REGS(%edi, %esp, %ebx, %ecx) / Uses label "9"
pushl %eax
TRACE_STAMP(%edi) / Clobbers %eax, %edx, uses "9"
popl %eax
movl %eax, TTR_SYSNUM(%edi)
#endif
_watch_do_syscall:
movl %esp, %ebp
pushl %eax / preserve across mstate call
MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
popl %eax
movl %gs:CPU_THREAD, %ebx
/ Interrupts may be enabled here (if we came in through the call
/ gate), so we must make sure this thread doesn't migrate off the
/ CPU while it updates the CPU stats.
addb $1, T_PREEMPT(%ebx)
CPU_STATS_SYS_SYSCALL_INC
subb $1, T_PREEMPT(%ebx)
/ Set EFLAGS to standard kernel settings.
ENABLE_INTR_FLAGS
ASSERT_LWPTOREGS(%ebx, %esp)
CHECK_PRESYS_NE(%ebx, %eax)
jne _full_syscall_presys
SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)
_syslcall_call:
call *SY_CALLC(%eax)
_syslcall_done:
CHECK_POSTSYS_NE(%ebx, %ecx)
jne _full_syscall_postsys
SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
movl %eax, REGOFF_EAX(%esp)
movl %edx, REGOFF_EDX(%esp)
MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
/
/ get back via iret
/
cli
jmp set_user_regs
_full_syscall_presys:
movl T_LWP(%ebx), %esi
subl $SYS_DROP, %esp
movb $LWP_SYS, LWP_STATE(%esi)
pushl %esp
pushl %ebx
call syscall_entry
addl $8, %esp
jmp _syslcall_call
_full_syscall_postsys:
addl $SYS_DROP, %esp
pushl %edx
pushl %eax
pushl %ebx
call syscall_exit
addl $12, %esp
MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
jmp sys_rtt_syscall
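/*
 * We arrive here via t_lofault if the argument copyin in
 * SIMPLE_SYSCALL_PRESYS faults; report EFAULT and return zeroed
 * rvals, just as syscall_err() would.
 */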
_syscall_fault:
push $0xe / EFAULT
call set_errno
addl $4, %esp
xorl %eax, %eax / fake syscall_err()
xorl %edx, %edx
jmp _syslcall_done
SET_SIZE(sys_call)
#endif /* __lint */
/*
* System call handler via the sysenter instruction
*
* Here's how syscall entry usually works (see sys_call for details).
*
* There, the caller (lcall or int) in userland has arranged that:
*
* - %eax contains the syscall number
* - the user stack contains the args to the syscall
*
* Normally the lcall instruction into the call gate causes the processor
* to push %ss, %esp, <top-of-stack>, %cs, %eip onto the kernel stack.
* The sys_call handler then leaves space for r_trapno and r_err, and
* pusha's {%eax, %ecx, %edx, %ebx, %esp, %ebp, %esi, %edi}, followed
* by %ds, %es, %fs and %gs to capture a 'struct regs' on the stack.
* Then the kernel sets %ds, %es and %gs to kernel selectors, and finally
* extracts %efl and puts it into r_efl (which happens to live at the offset
* that <top-of-stack> was copied into). Note that the value in r_efl has
* the IF (interrupt enable) flag turned on. (The int instruction into the
* interrupt gate does essentially the same thing, only instead of
* <top-of-stack> we get eflags - see comment above.)
*
* In the sysenter case, things are a lot more primitive.
*
* The caller in userland has arranged that:
*
* - %eax contains the syscall number
* - %ecx contains the user %esp
* - %edx contains the return %eip
* - the user stack contains the args to the syscall
*
* e.g.
* <args on the stack>
* mov $SYS_callnum, %eax
* mov $1f, %edx / return %eip
* mov %esp, %ecx / return %esp
* sysenter
* 1:
*
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
*
* - %eip is pointing to sys_sysenter (below).
* - %cs and %ss are set to kernel text and stack (data) selectors.
* - %esp is pointing at the lwp's stack
* - Interrupts have been disabled.
*
* The task for the sysenter handler is:
*
* - recreate the same regs structure on the stack and the same
* kernel state as if we'd come in on an lcall
* - do the normal work of a syscall
* - execute the system call epilogue, use sysexit to return to userland.
*
* Note that we are unable to return both "rvals" to userland with this
* call, as %edx is used by the sysexit instruction.
*/
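/*
 * (A pseudo-C sketch of the "privileged initialization" mentioned
 * above. The real work is done in the CPU startup code; the CS/EIP
 * constants are assumed siblings of the MSR_INTC_SEP_ESP name used
 * further below:
 *
 *	wrmsr(MSR_INTC_SEP_CS, KCS_SEL);
 *	wrmsr(MSR_INTC_SEP_ESP, the lwp's kernel stack pointer);
 *	wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
 *
 * sep_save/sep_restore at the bottom of this file keep the stack
 * pointer MSR current as threads are switched on and off cpus.)
 */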
#if defined(__lint)
void
sys_sysenter()
{}
#else /* __lint */
ENTRY_NP(sys_sysenter)
/
/ do what the call gate would've done to the stack ..
/
pushl $UDS_SEL / (really %ss, but it's the same ..)
pushl %ecx / userland makes this a copy of %esp
pushfl
orl $PS_IE, (%esp) / turn interrupts on when we return to user
pushl $UCS_SEL
pushl %edx / userland makes this a copy of %eip
/
/ done. finish building the stack frame
/
subl $8, %esp / leave space for ERR and TRAPNO
SYSENTER_PUSH
#ifdef TRAPTRACE
TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSENTER) / uses labels 8 and 9
TRACE_REGS(%edi, %esp, %ebx, %ecx) / uses label 9
pushl %eax
TRACE_STAMP(%edi) / clobbers %eax, %edx, uses label 9
popl %eax
movl %eax, TTR_SYSNUM(%edi)
#endif
movl %esp, %ebp
	CPU_STATS_SYS_SYSCALL_INC	/ safe: interrupts are still off here
ENABLE_INTR_FLAGS
pushl %eax / preserve across mstate call
MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
popl %eax
movl %gs:CPU_THREAD, %ebx
ASSERT_LWPTOREGS(%ebx, %esp)
CHECK_PRESYS_NE(%ebx, %eax)
jne _full_syscall_presys
SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)
_sysenter_call:
call *SY_CALLC(%eax)
_sysenter_done:
CHECK_POSTSYS_NE(%ebx, %ecx)
jne _full_syscall_postsys
SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
/
/ sysexit uses %edx to restore %eip, so we can't use it
/ to return a value, sigh.
/
movl %eax, REGOFF_EAX(%esp)
/ movl %edx, REGOFF_EDX(%esp)
/ Interrupts will be turned on by the 'sti' executed just before
/ sysexit. The following ensures that restoring the user's EFLAGS
/ doesn't enable interrupts too soon.
andl $_BITNOT(PS_IE), REGOFF_EFL(%esp)
MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
cli
SYSCALL_POP
popl %edx / sysexit: %edx -> %eip
addl $4, %esp / get CS off the stack
popfl / EFL
popl %ecx / sysexit: %ecx -> %esp
	sti			/ sti's effect is delayed one instruction ..
	sysexit			/ .. so interrupts come back on in userland
SET_SIZE(sys_sysenter)
/*
* Declare a uintptr_t which covers the entire pc range of syscall
* handlers for the stack walkers that need this.
*/
.align CPTRSIZE
.globl _allsyscalls_size
.type _allsyscalls_size, @object
_allsyscalls_size:
.NWORD . - _allsyscalls
SET_SIZE(_allsyscalls_size)
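/*
 * A stack walker can then decide whether a given pc lies in a syscall
 * handler roughly as follows (a sketch; 'pc' is hypothetical):
 *
 *	if (pc - (uintptr_t)_allsyscalls < _allsyscalls_size)
 *		pc is inside a syscall handler
 */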
#endif /* __lint */
/*
* These are the thread context handlers for lwps using sysenter/sysexit.
*/
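/*
 * They are typically hung off the thread with installctx() so that
 * they run at every context switch, e.g. (a sketch only; the trailing
 * fork/exit handler arguments are elided):
 *
 *	installctx(t, the lwp's kernel sp, sep_save, sep_restore, ...);
 */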
#if defined(__lint)
/*ARGSUSED*/
void
sep_save(void *ksp)
{}
/*ARGSUSED*/
void
sep_restore(void *ksp)
{}
#else /* __lint */
/*
 * Setting this value to zero as we switch away causes the
 * stack-pointer-on-sysenter to be NULL, ensuring that we
 * don't silently corrupt another (preempted) thread's stack
 * when running an lwp that (somehow) didn't get sep_restore'd.
*/
ENTRY_NP(sep_save)
	xorl	%edx, %edx		/ wrmsr writes %edx:%eax ..
	xorl	%eax, %eax		/ .. so this writes zero ..
	movl	$MSR_INTC_SEP_ESP, %ecx	/ .. to the sysenter-%esp MSR
	wrmsr
ret
SET_SIZE(sep_save)
/*
* Update the kernel stack pointer as we resume onto this cpu.
*/
ENTRY_NP(sep_restore)
	movl	4(%esp), %eax		/* per-lwp kernel sp */
	xorl	%edx, %edx		/ wrmsr writes %edx:%eax; zero the
	movl	$MSR_INTC_SEP_ESP, %ecx	/ high half, select the MSR,
	wrmsr				/ and load the new stack pointer
ret
SET_SIZE(sep_restore)
#endif /* __lint */
/*
* Call syscall(). Called from trap() on watchpoint at lcall 0,7
*/
#if defined(__lint)
void
watch_syscall(void)
{}
#else /* __lint */
ENTRY_NP(watch_syscall)
movl %gs:CPU_THREAD, %ebx
movl T_STACK(%ebx), %esp / switch to the thread stack
movl REGOFF_EAX(%esp), %eax / recover original syscall#
jmp _watch_do_syscall
SET_SIZE(watch_syscall)
#endif /* __lint */