/* syscall.c revision efd37614a1e214f502001b0e6cfa90b747abc5b9 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/privregs.h>
#include <sys/ucontext.h>
#include <sys/aio_impl.h>
#include <sys/tnf_probe.h>
/*
 * When the kernel is built with SYSCALLTRACE, syscalltrace is a
 * patchable flag that enables the syscall trace printfs below;
 * otherwise it is a compile-time constant zero so the compiler can
 * optimize the tracing code away entirely.
 *
 * Fix: the variable definition must live inside the #ifdef — defining
 * it unconditionally and then shadowing it with the macro leaves a
 * dead global in non-SYSCALLTRACE builds.
 */
#ifdef SYSCALLTRACE
int syscalltrace = 0;
#else
#define	syscalltrace	0
#endif /* SYSCALLTRACE */
/* Forward declarations for routines defined later in this file. */
int pre_syscall(void);
static void deferred_singlestep_trap(caddr_t);
/*
 * Select the sysent table appropriate to the lwp's data model when the
 * kernel carries a 32-bit compatibility syscall table; otherwise there
 * is only one table.
 *
 * Fix: the previous definition ended with a bare backslash
 * line-continuation, which spliced the following `#else` directive
 * into the macro replacement list and destroyed the conditional.
 *
 * NOTE(review): the _SYSCALL32_IMPL branch is reconstructed from the
 * conventional form of this macro — confirm against source control.
 */
#ifdef _SYSCALL32_IMPL
#define	LWP_GETSYSENT(lwp)	\
	(lwp_getdatamodel(lwp) == DATAMODEL_NATIVE ? sysent : sysent32)
#else
#define	LWP_GETSYSENT(lwp)	sysent
#endif
/*
 * Arrange for the real time profiling signal to be dispatched.
 */
/*
 * NOTE(review): this definition is truncated in this copy of the file —
 * the function name, parameter list, and the conditions guarding the
 * early returns and lock sections are missing. Only the p_lock
 * enter/exit pairs, the lwp_lastfault reset, and the psig() call
 * survive. Restore from source control before building.
 */
void
{
proc_t *p;
return;
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
return;
}
lwp->lwp_lastfault = 0;
mutex_exit(&p->p_lock);
/* Deliver the pending signal (presumably SIGPROF) to the lwp. */
psig();
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
}
/*
 * If watchpoints are active, don't make copying in of
 * system call arguments take a read watchpoint trap.
 */
/*
 * NOTE(review): truncated — the function name (presumably copyin_args),
 * parameters, and entire body are missing from this copy.
 */
static int
{
}
#if defined(_SYSCALL32_IMPL)
/*
 * NOTE(review): truncated 32-bit argument copyin helper — the name
 * (presumably copyin_args32), parameters, the while-loop body, and the
 * surrounding control flow are missing; the dangling braces below do
 * not balance. Restore from source control.
 */
static int
{
int rc;
while (nargs--)
}
return (rc);
}
/* On _SYSCALL32_IMPL kernels, 32-bit syscalls use the 32-bit copyin. */
#define COPYIN_ARGS32 copyin_args32
#else
/* Otherwise both paths share the native copyin routine. */
#define COPYIN_ARGS32 copyin_args
#endif
/*
 * Error handler for system calls where arg copy gets fault.
 */
/*
 * NOTE(review): truncated — the name (presumably syscall_err) and
 * parameter list are missing; only the trivial return remains.
 */
static longlong_t
{
return (0);
}
/*
 * Corresponding sysent entry to allow syscall_entry caller
 * to invoke syscall_err.
 */
/* NOTE(review): the initializer fields have been stripped. */
static struct sysent sysent_err = {
};
/*
 * Called from syscall() when a non-trivial 32-bit system call occurs.
 * Sets up the args and returns a pointer to the handler.
 */
/*
 * NOTE(review): truncated — the function name (syscall_entry per the
 * comments), parameters (thread, lwp, args), the lwp_ap setup code,
 * the argument-fetch code, and the sysent lookup are all missing from
 * this copy. Only the pre_syscall() gate survives intact.
 */
struct sysent *
{
unsigned int code;
int error = 0;
/*
 * Set lwp_ap to point to the args, even if none are needed for this
 * system call. This is for the loadable-syscall case where the
 * number of args won't be known until the system call is loaded, and
 * also maintains a non-NULL lwp_ap setup for get_syscall_args(). Note
 * that lwp_ap MUST be set to a non-NULL value _BEFORE_ t_sysnum is
 * set to non-zero; otherwise get_syscall_args(), seeing a non-zero
 * t_sysnum for this thread, will charge ahead and dereference lwp_ap.
 */
if ((t->t_pre_sys | syscalltrace) != 0) {
error = pre_syscall();
/*
 * Reset lwp_ap so that the args will be refetched if
 * the lwp stopped for /proc purposes in pre_syscall().
 */
lwp->lwp_argsaved = 0;
if (error)
return (&sysent_err); /* use dummy handler */
}
/*
 * Fetch the system call arguments.
 * Note: for loadable system calls the number of arguments required
 * may not be known at this point, and will be zero if the system call
 * was never loaded. Once the system call has been loaded, the number
 * of args is not allowed to be changed.
 */
return (&sysent_err); /* use dummy handler */
}
return (callp); /* return sysent entry for caller */
}
/*
 * NOTE(review): truncated — the function name (presumably
 * syscall_exit), parameters, and the normal-return value-setting code
 * are missing; only the post-syscall dispatch skeleton remains.
 */
void
{
/*
 * Handle signals and other post-call events if necessary.
 */
if ((t->t_post_sys_ast | syscalltrace) == 0) {
/*
 * Normal return.
 * Clear error indication and set return values.
 */
} else
t->t_sysnum = 0; /* invalidate args */
}
/*
 * Perform pre-system-call processing, including stopping for tracing,
 * auditing, etc.
 *
 * This routine is called only if the t_pre_sys flag is set. Any condition
 * requiring pre-syscall handling must set the t_pre_sys flag. If the
 * condition is persistent, this routine will repost t_pre_sys.
 */
/*
 * NOTE(review): heavily truncated — the function header (pre_syscall),
 * local declarations (t, lwp, p), the DEBUG assertions, the credential
 * refresh, the /proc entry-stop condition checks, and the auditing and
 * tracing bodies are all missing; many braces below do not balance.
 * Returns 0 to proceed with the syscall, 1 to abort it.
 */
int
{
int repost;
#if defined(DEBUG)
/*
 * On the i386 kernel, lwp_ap points at the piece of the thread
 * stack that we copy the users arguments into.
 *
 * On the amd64 kernel, the syscall arguments in the rdi..r9
 * registers should be pointed at by lwp_ap. If the args need to
 * be copied so that those registers can be changed without losing
 * the ability to get the args for /proc, they can be saved by
 * save_syscall_args(), and lwp_ap will be restored by post_syscall().
 */
#if defined(_LP64)
} else {
#endif
}
#endif /* DEBUG */
/*
 * Make sure the thread is holding the latest credentials for the
 * process. The credentials in the process right now apply to this
 * thread for the entire system call.
 */
/*
 * DTrace accesses t_cred in probe context. t_cred must
 * always be either NULL, or point to a valid, allocated cred
 * structure.
 */
}
/*
 * From the proc(4) manual page:
 * When entry to a system call is being traced, the traced process
 * stops after having begun the call to the system but before the
 * system call arguments have been fetched from the process.
 */
mutex_enter(&p->p_lock);
/*
 * Recheck stop condition, now that lock is held.
 */
#if defined(_LP64)
/*
 * Must refetch args since they were
 * possibly modified by /proc.
 * Indicate that a valid copy is in registers.
 */
lwp->lwp_argsaved = 0;
}
#endif
}
mutex_exit(&p->p_lock);
}
repost = 1;
}
if (lwp->lwp_sysabort) {
/*
 * lwp_sysabort may have been set via /proc while the process
 * was stopped on PR_SYSENTRY. If so, abort the system call.
 * Override any error from the copyin() of the arguments.
 */
lwp->lwp_sysabort = 0;
return (1); /* don't do system call, return EINTR */
}
#ifdef C2_AUDIT
if (audit_active) { /* begin auditing for this syscall */
int error;
return (1);
}
repost = 1;
}
#endif /* C2_AUDIT */
#ifndef NPROBE
/* Kernel probe */
if (tnf_tracing_active) {
repost = 1;
}
#endif /* NPROBE */
#ifdef SYSCALLTRACE
if (syscalltrace) {
int i;
long *ap;
char *cp;
char *sysname;
else
(void) save_syscall_args();
else {
}
cp = "(";
cp = ", ";
}
if (i)
printf(")");
}
#endif /* SYSCALLTRACE */
/*
 * If there was a continuing reason for pre-syscall processing,
 * set the t_pre_sys flag for the next system call.
 */
if (repost)
t->t_pre_sys = 1;
return (0);
}
/*
 * Post-syscall processing. Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring pre-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 */
/*
 * NOTE(review): heavily truncated — the function header (post_syscall),
 * local declarations (t, lwp, p, code, error, rp, etc.), the
 * single-step flag handling, error/return-value storage, the vfork
 * wait, the ustack refresh conditions, and numerous if-conditions are
 * missing; braces below do not balance. Treat this copy as a skeleton
 * only and restore from source control before building.
 */
void
{
int repost = 0;
int proc_stop = 0; /* non-zero if stopping */
int sigprof = 0; /* non-zero if sending SIGPROF */
t->t_post_sys = 0;
/*
 * Code can be zero if this is a new LWP returning after a forkall(),
 * other than the one which matches the one in the parent which called
 * forkall(). In these LWPs, skip most of post-syscall activity.
 */
if (code == 0)
goto sig_check;
/*
 * If the trace flag is set, mark the lwp to take a single-step trap
 * on return to user level (below). The x86 lcall interface and
 * sysenter has already done this, and turned off the flag, but
 * amd64 syscall interface has not.
 */
}
#ifdef C2_AUDIT
if (audit_active) { /* put out audit record for this syscall */
/* XX64 -- truncation of 64-bit return values? */
repost = 1;
}
#endif /* C2_AUDIT */
uprintf("%s", m);
}
/*
 * If we're going to stop for /proc tracing, set the flag and
 * save the arguments so that the return values don't smash them.
 */
(void) save_syscall_args();
proc_stop = 1;
}
repost = 1;
}
/*
 * Similarly check to see if SIGPROF might be sent.
 */
(void) save_syscall_args();
sigprof = 1;
}
if (error == 0) {
#ifdef SYSCALLTRACE
if (syscalltrace) {
"%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
}
#endif /* SYSCALLTRACE */
} else {
int sig;
#ifdef SYSCALLTRACE
if (syscalltrace) {
printf("%d: error=%d, id 0x%p\n",
}
#endif /* SYSCALLTRACE */
}
}
/*
 * From the proc(4) manual page:
 * When exit from a system call is being traced, the traced process
 * stops on completion of the system call just prior to checking for
 * signals and returning to user level. At this point all return
 * values have been stored into the traced process's saved registers.
 */
if (proc_stop) {
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
}
/*
 * If we are the parent returning from a successful
 * vfork, wait for the child to exec or exit.
 * This code must be here and not in the bowels of the system
 * so that /proc can intercept exit from vfork in a timely way.
 */
/*
 * If profiling is active, bill the current PC in user-land
 * and keep reposting until profiling is disabled.
 */
if (lwp->lwp_oweupc)
repost = 1;
}
/*
 * Reset flag for next time.
 * We must do this after stopping on PR_SYSEXIT
 * because /proc uses the information in lwp_eosys.
 */
/*
 * If a single-step trap occurred on a syscall (see trap())
 * recognize it now. Do this before checking for signals
 * because deferred_singlestep_trap() may generate a SIGTRAP to
 * the LWP or may otherwise mark the LWP to call issig(FORREAL).
 */
if (t->t_astflag | t->t_sig_check) {
/*
 * Turn off the AST flag before checking all the conditions that
 * may have caused an AST. This flag is on whenever a signal or
 * unusual condition should be handled after the next trap or
 * syscall.
 */
astoff(t);
t->t_sig_check = 0;
/*
 * The following check is legal for the following reasons:
 * 1) The thread we are checking, is ourselves, so there is
 * no way the proc can go away.
 * 2) The only time we need to be protected by the
 * lock is if the binding is changed.
 *
 * Note we will still take the lock and check the binding
 * if the condition was true without the lock held. This
 * prevents lock contention among threads owned by the
 * same proc.
 */
mutex_enter(&p->p_lock);
}
mutex_exit(&p->p_lock);
}
/*
 * for kaio requests on the special kaio poll queue,
 * copyout their results to user memory.
 */
if (p->p_aio)
aio_cleanup(0);
/*
 * If this LWP was asked to hold, call holdlwp(), which will
 * stop. holdlwps() sets this up and calls pokelwps() which
 * sets the AST flag.
 *
 * Also check TP_EXITLWP, since this is used by fresh new LWPs
 * through lwp_rtt(). That flag is set if the lwp_create(2)
 * syscall failed after creating the LWP.
 */
holdlwp();
/*
 * All code that sets signals and makes ISSIG_PENDING
 * evaluate true must set t_sig_check afterwards.
 */
if (ISSIG_PENDING(t, lwp, p)) {
psig();
}
if (sigprof) {
}
/*
 * If a performance counter overflow interrupt was
 * delivered *during* the syscall, then re-enable the
 * AST so that we take a trip through trap() to cause
 * the SIGEMT to be delivered.
 */
aston(t);
/*
 * because that could race with the call gate used by
 * system calls via "lcall". If that happened, an
 * invalid EFLAGS would result. prstep()/prnostep()
 * therefore schedule an AST for the purpose.
 */
}
}
}
#ifndef NPROBE
/* Kernel probe */
if (tnf_tracing_active) {
repost = 1;
}
#endif /* NPROBE */
/*
 * Set state to LWP_USER here so preempt won't give us a kernel
 * priority if it occurs after this point. Call CL_TRAPRET() to
 * restore the user-level priority.
 *
 * It is important that no locks (other than spinlocks) be entered
 * after this point before returning to user mode (unless lwp_state
 * is set back to LWP_SYS).
 *
 * XXX Sampled times past this point are charged to the user.
 */
if (t->t_trapret) {
t->t_trapret = 0;
thread_lock(t);
CL_TRAPRET(t);
thread_unlock(t);
}
if (CPU->cpu_runrun)
preempt();
/*
 * The thread lock must be held in order to clear sysnum and reset
 * lwp_ap atomically with respect to other threads in the system that
 * may be looking at the args via lwp_ap from get_syscall_args().
 */
thread_lock(t);
t->t_sysnum = 0; /* no longer in a system call */
#if defined(_LP64)
/*
 * In case the args were copied to the lwp, reset the
 * pointer so the next syscall will have the right
 * lwp_ap pointer.
 */
} else {
#endif
}
thread_unlock(t);
lwp->lwp_argsaved = 0;
/*
 * If there was a continuing reason for post-syscall processing,
 * set the t_post_sys flag for the next system call.
 */
if (repost)
t->t_post_sys = 1;
/*
 * If there is a ustack registered for this lwp, and the stack rlimit
 * has been altered, read in the ustack. If the saved stack rlimit
 * matches the bounds of the ustack, update the ustack to reflect
 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
 * stack checking by setting the size to 0.
 */
mutex_enter(&p->p_lock);
top = p->p_usrstack;
mutex_exit(&p->p_lock);
new_size = 0;
sizeof (stack_t)) == 0 &&
sizeof (stack_t));
}
lwp->lwp_old_stk_ctl = 0;
}
}
/*
 * Called from post_syscall() when a deferred singlestep is to be taken.
 */
/*
 * NOTE(review): truncated — the function name (deferred_singlestep_trap
 * per the forward declaration above), its caddr_t parameter, the
 * NORMAL_STEP/WATCH_STEP dispatch conditions, and the fault-delivery
 * code are missing from this copy.
 */
static void
{
/*
 * If both NORMAL_STEP and WATCH_STEP are in
 * effect, give precedence to NORMAL_STEP.
 * If neither is set, user must have set the
 * PS_T bit in %efl; treat this as NORMAL_STEP.
 */
(void) undo_watch_step(NULL);
} else {
}
if (fault) {
/*
 * Remember the fault and fault adddress
 * for real-time (SIGPROF) profiling.
 */
/*
 * If a debugger has declared this fault to be an
 * event of interest, stop the lwp. Otherwise just
 * deliver the associated signal.
 */
}
}
/*
 * nonexistent system call-- signal lwp (may want to handle it)
 * flag error if lwp won't see signal immediately
 */
/*
 * NOTE(review): implicit-int K&R definition with an empty body — the
 * original implementation (signal the lwp, then return an ENOSYS
 * error) appears stripped. As written it falls off the end without a
 * return value, which is undefined behavior for callers such as
 * indir() below. Restore before building.
 */
nosys()
{
}
/*
 * Execute a 32-bit system call on behalf of the current thread.
 */
/*
 * NOTE(review): truncated — the syscall_entry() call that fills
 * syscall_args, the handler invocation that consumed arguments 0..6,
 * and the syscall_exit() call are missing; the dangling
 * "syscall_args[7]);" below is the tail of a stripped call expression.
 */
void
dosyscall(void)
{
/*
 * Need space on the stack to store syscall arguments.
 */
long syscall_args[MAXSYSARGS];
/*
 * syscall_entry() copied all 8 arguments into syscall_args.
 */
syscall_args[7]);
}
/*
 * Get the arguments to the current system call. See comment atop
 * save_syscall_args() regarding lwp_ap usage.
 */
/*
 * NOTE(review): truncated — the function header (get_syscall_args per
 * the comments), parameters (lwp, argp, nargsp), the code/nargs
 * extraction under the thread lock, and the copy-out loop body are
 * missing from this copy.
 */
{
long *ap;
int nargs;
#if defined(_LP64)
mask = 0xfffffffffffffffful;
#endif
/*
 * The thread lock must be held while looking at the arguments to ensure
 * they don't go away via post_syscall().
 * get_syscall_args() is the only routine to read them which is callable
 * outside the LWP in question and hence the only one that must be
 * synchronized in this manner.
 */
thread_lock(t);
thread_unlock(t);
while (nargs-- > 0)
} else {
*nargsp = 0;
}
return (code);
}
#ifdef _SYSCALL32_IMPL
/*
 * Get the arguments to the current 32-bit system call.
 */
/*
 * NOTE(review): truncated — the function header (presumably
 * get_syscall32_args), its parameters, the get_syscall_args() call,
 * and the loop body narrowing each long arg to 32 bits are missing.
 */
{
long args[MAXSYSARGS];
for (i = 0; i != *nargsp; i++)
return (code);
}
#endif
/*
 * Save the system call arguments in a safe place.
 *
 * On the i386 kernel:
 *
 * Copy the users args prior to changing the stack or stack pointer.
 * This is so /proc will be able to get a valid copy of the
 * args from the user stack even after the user stack has been changed.
 * Note that the kernel stack copy of the args may also have been
 * changed by a system call handler which takes C-style arguments.
 *
 * Note that this may be called by stop() from trap(). In that case
 * t_sysnum will be zero (syscall_exit clears it), so no args will be
 * copied.
 *
 * On the amd64 kernel:
 *
 * For 64-bit applications, lwp->lwp_ap normally points to %rdi..%r9
 * in the reg structure. If the user is going to change the argument
 * registers, rax, or the stack and might want to get the args (for
 * /proc tracing), it must copy the args elsewhere via save_syscall_args().
 *
 * For 32-bit applications, lwp->lwp_ap normally points to a copy of
 * the system call arguments on the kernel stack made from the user
 * stack. Copy the args prior to change the stack or stack pointer.
 * This is so /proc will be able to get a valid copy of the args
 * from the user stack even after that stack has been changed.
 *
 * This may be called from stop() even when we're not in a system call.
 * Since there's no easy way to tell, this must be safe (not panic).
 * If the copyins get data faults, return non-zero.
 */
/*
 * NOTE(review): truncated — the function header (save_syscall_args),
 * local declarations, the early-exit condition, the sysent lookup that
 * determines nargs, and the copyin calls whose failures produce the
 * -1 returns below are all missing from this copy.
 */
int
{
return (0); /* args already saved or not needed */
nargs = 0; /* illegal syscall */
} else {
/*
 * Find out how many arguments the system
 * call uses.
 *
 * We have the property that loaded syscalls
 * never change the number of arguments they
 * use after they've been loaded once. This
 * allows us to stop for /proc tracing without
 * holding the module lock.
 * /proc is assured that sy_narg is valid.
 */
}
}
/*
 * Fetch the system call arguments.
 */
if (nargs == 0)
goto out;
#if defined(_LP64)
return (-1);
} else {
#endif
return (-1);
}
out:
return (0);
}
/*
 * NOTE(review): body appears elided in this copy of the file —
 * presumably it resets the lwp argument pointer state; confirm against
 * source control.
 */
void
reset_syscall_args(void)
{
}
/*
 * Call a system call which takes a pointer to the user args struct and
 * a pointer to the return values. This is a bit slower than the standard
 * C arg-passing method in some cases.
 */
/*
 * NOTE(review): truncated — local declarations, the amd64
 * save_syscall_args() call, the handler invocation, and the body of
 * the trailing "if (error)" are missing from this copy.
 */
syscall_ap(void)
{
#if defined(__amd64)
/*
 * If the arguments don't fit in registers %rdi-%r9, make sure they
 * have been copied to the lwp_arg array.
 */
#endif
if (error)
}
/*
 * Load system call module.
 * Returns with pointer to held read lock for module.
 */
/*
 * NOTE(review): truncated — the function name (presumably
 * lock_syscall), parameters, the rw_enter on the module lock, the
 * modload() call whose result feeds "id", and the mod_hold_by_name
 * retry logic are missing; the loop below is a skeleton only.
 */
static krwlock_t *
{
int id;
/*
 * Optimization to only call modload if we don't have a loaded
 * syscall.
 */
if (LOADED_SYSCALL(callp))
return (module_lock);
for (;;) {
break;
/*
 * If we loaded successfully at least once, the modctl
 * will still be valid, so we try to grab it by filename.
 * If this call fails, it's because the mod_filename
 * was changed after the call to modload() (mod_hold_by_name()
 * is the likely culprit). We can safely just take
 * another lap if this is the case; the modload() will
 * change the mod_filename back to one by which we can
 * find the modctl.
 */
continue;
if (!modp->mod_installed) {
continue;
}
break;
}
if (id != -1)
return (module_lock);
}
/*
 * Loadable syscall support.
 * If needed, load the module, then reserve it by holding a read
 * lock for the duration of the call.
 * Later, if the syscall is not unloadable, it could patch the vector.
 */
/*
 * NOTE(review): truncated — the function header (presumably
 * loadable_syscall), its argument list, the lock_syscall() call that
 * pairs with the THREAD_KPRI_RELEASE below, the rw_exit, and the
 * native/32-bit dispatch conditions are missing; braces do not balance.
 */
/*ARGSUSED*/
{
/*
 * Try to autoload the system call if necessary
 */
THREAD_KPRI_RELEASE(); /* drop priority given by rw_enter */
/*
 * we've locked either the loaded syscall or nosys
 */
#if defined(_LP64)
} else
rval = syscall_ap();
} else {
#endif
/*
 * Now that it's loaded, make sure enough args were copied.
 */
if (error) {
} else
rval = syscall_ap();
}
THREAD_KPRI_REQUEST(); /* regain priority from read lock */
return (rval);
}
/*
 * Indirect syscall handled in libc on x86 architectures
 */
/*
 * NOTE(review): implicit-int definition; delegates to nosys() above,
 * whose body is empty in this copy (see note there).
 */
indir()
{
return (nosys());
}
/*
 * set_errno - set an error return from the current system call.
 * This could be a macro.
 * This returns the value it is passed, so that the caller can
 * use tail-recursion-elimination and do return (set_errno(ERRNO));
 */
/*
 * NOTE(review): the function header (set_errno) and its body — setting
 * lwp error state and returning the error — are missing from this copy;
 * only an anonymous empty brace pair remains.
 */
{
}
/*
 * set_proc_pre_sys - Set pre-syscall processing for entire process.
 */
/*
 * NOTE(review): truncated — the function header (set_proc_pre_sys),
 * the thread-list initialization of t, the do/while termination
 * clause, and the closing brace are missing.
 */
void
{
kthread_t *t;
do {
t->t_pre_sys = 1;
}
/*
 * set_proc_post_sys - Set post-syscall processing for entire process.
 */
/*
 * NOTE(review): truncated — the function header (set_proc_post_sys),
 * the thread-list initialization of t, the do/while termination
 * clause, and the closing brace are missing.
 */
void
{
kthread_t *t;
do {
t->t_post_sys = 1;
}
/*
 * set_proc_sys - Set pre- and post-syscall processing for entire process.
 */
/*
 * NOTE(review): truncated — the thread-list initialization of t, the
 * do/while termination clause, and the closing brace are missing.
 */
void
set_proc_sys(proc_t *p)
{
kthread_t *t;
do {
t->t_pre_sys = 1;
t->t_post_sys = 1;
}
/*
 * set_all_proc_sys - set pre- and post-syscall processing flags for all
 * user processes.
 *
 * This is needed when auditing, tracing, or other facilities which affect
 * all processes are turned on.
 */
/*
 * NOTE(review): truncated — the function header (set_all_proc_sys),
 * the all-threads list initialization of t, the do/while termination
 * clause, and the closing brace are missing.
 */
void
{
kthread_t *t;
do {
t->t_pre_sys = 1;
t->t_post_sys = 1;
}
/*
 * set_proc_ast - Set asynchronous service trap (AST) flag for all
 * threads in process.
 */
/*
 * NOTE(review): truncated — the thread-list initialization of t, the
 * do/while termination clause, and the closing brace are missing; the
 * file appears to end mid-function.
 */
void
set_proc_ast(proc_t *p)
{
kthread_t *t;
do {
aston(t);
}