/* fork.c revision c6939658adb0a356a77bc28f7df252ceb4a8f6cc */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/sysmacros.h>
#include <sys/tuneable.h>
#include <sys/ucontext.h>
#include <sys/shm_impl.h>
#include <sys/door_data.h>
#include <sys/schedctl.h>
#include <sys/resource.h>
#include <sys/contract_impl.h>
/* Cleanup helper invoked when duplicating the parent's lwps fails (defined below). */
static void forklwp_fail(proc_t *);
/* NOTE(review): not referenced in the visible code; presumably flags a recorded fork failure -- confirm against the rest of the file. */
int fork_fail_pending;
/* NOTE(review): presumably the kmem cache from which proc structures are allocated -- confirm; defined elsewhere. */
extern struct kmem_cache *process_cache;
/*
 * forkall(2) entry point: duplicate the calling process together with
 * all of its lwps.  All the work is delegated to the common fork
 * engine with both isvfork and isfork1 clear.
 */
forkall(void)
{
	return cfork(0, 0);
}
/*
 * The parent is stopped until the child invokes relvm().
 */
vfork(void)
{
/*
 * NOTE(review): the body appears to have been lost in this revision --
 * as written, vfork() creates no child and falls off the end without a
 * return value.  Presumably it invokes the common fork engine with
 * isvfork set (compare forkall()/fork1() nearby); confirm against
 * upstream before relying on this text.
 */
}
/*
 * fork1(2) entry point: duplicate only the calling lwp.
 * Delegates to the common fork engine with isfork1 set.
 */
fork1(void)
{
	return cfork(0, 1);
}
/* ARGSUSED */
static int64_t
/*
 * NOTE(review): the declarator line naming this function -- the common
 * fork engine, called as cfork(isvfork, isfork1) by the wrappers above
 * -- is missing from this revision, along with a large number of
 * interior statements (conditional headers, loop headers, assignments
 * and labels).  The surviving code below is preserved byte-for-byte;
 * comments added here are review notes only.  Reconstruct from
 * upstream before attempting any behavioral change.
 */
{
kthread_t *t;
rval_t r;
int error;
int i;
/*
 * fork is not supported for the /proc agent lwp.
 */
goto forkerr;
}
goto forkerr;
/*
 * If the calling lwp is doing a fork1() then the
 * other lwps in this process are not duplicated and
 * don't need to be held where their kernel stacks can be
 * cloned. If doing forkall(), the process is held with
 * SHOLDFORK, so that the lwps are at a point where their
 * stacks can be copied which is on entry or exit from
 * the kernel.
 */
goto forkerr;
}
#if defined(__sparc)
/*
 * Ensure that the user stack is fully constructed
 * before creating the child process structure.
 */
(void) flush_user_windows_to_stack(NULL);
#endif
mutex_enter(&p->p_lock);
/*
 * If this is vfork(), cancel any suspend request we might
 * have gotten from some other thread via lwp_suspend().
 * Otherwise we could end up with a deadlock on return
 * from the vfork() in both the parent and the child.
 */
if (isvfork)
/*
 * Prevent our resource set associations from being changed during fork.
 */
mutex_exit(&p->p_lock);
/*
 * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
 */
mutex_enter(&p->p_lock);
continuelwps(p);
mutex_exit(&p->p_lock);
goto forkerr;
}
/*
 * Assign an address space to child
 */
if (isvfork) {
/*
 * Clear any watched areas and remember the
 * watched pages for restoring in vfwait().
 */
sizeof (struct watched_page),
}
} else {
/*
 * We need to hold P_PR_LOCK until the address space has
 * been duplicated and we've had a chance to remove from the
 * child any DTrace probes that were in the parent. Holding
 * P_PR_LOCK prevents any new probes from being added and any
 * extant probes from being removed.
 */
mutex_enter(&p->p_lock);
sprlock_proc(p);
mutex_exit(&p->p_lock);
if (error != 0) {
if (cp->p_psibling)
mutex_enter(&p->p_lock);
continuelwps(p);
sprunlock(p);
/*
 * Preserve ENOMEM error condition but
 * map all others to EAGAIN.
 */
goto forkerr;
}
/* Duplicate parent's shared memory */
if (p->p_segacct)
/*
 * Remove all DTrace tracepoints from the child process. We
 * need to do this _before_ duplicating USDT providers since
 * any associated probes may be immediately enabled.
 */
if (p->p_dtrace_count > 0)
dtrace_fasttrap_fork(p, cp);
/*
 * Duplicate any helper actions and providers. The SFORKING
 * we set above informs the code to enable USDT probes that
 * sprlock() may fail because the child is being forked.
 */
if (p->p_dtrace_helpers != NULL) {
(*dtrace_helpers_fork)(p, cp);
}
mutex_enter(&p->p_lock);
sprunlock(p);
}
/*
 * Duplicate parent's resource controls.
 */
dup_set = rctl_set_create();
for (;;) {
break;
}
e.rcep_t = RCENTITY_PROCESS;
RCD_DUP | RCD_CALLBACK);
/*
 * Allocate the child's lwp directory and lwpid hash table.
 */
if (isfork1)
else
/*
 * Duplicate parent's lwps.
 * Mutual exclusion is not needed because the process is
 * in the hold state and only the current lwp is running.
 */
if (isfork1) {
goto forklwperr;
/*
 * Inherit only the lwp_wait()able flag,
 * Daemon threads should not call fork1(), but oh well...
 */
} else {
/* this is forkall(), no one can be in lwp_wait() */
/* for each entry in the parent's lwp directory... */
continue;
goto forklwperr;
/*
 * Inherit lwp_wait()able and daemon flags.
 */
ct->t_proc_flag |=
/*
 * Keep track of the clone of curthread to
 * post return values through lwp_setrval().
 * Mark other threads for special treatment
 * by lwp_rtt() / post_syscall().
 */
if (t == curthread)
else
} else {
/*
 * Replicate zombie lwps in the child.
 */
}
}
}
/*
 * Put new process in the parent's process contract, or put it
 * in a new one if there is an active process template. Send a
 * fork event (if requested) to whatever contract the child is
 * a member of. Fails if the parent has been SIGKILLed.
 */
goto forklwperr;
/*
 * No fork failures occur beyond this point.
 */
if (!isfork1) {
/*
 * If the parent's lwp ids have wrapped around, so have the
 * child's.
 */
}
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
/*
 * Duplicate process context ops, if any.
 */
if (p->p_pctx)
#ifdef __sparc
#endif
/*
 * If the child process has been marked to stop on exit
 * from this fork, arrange for all other lwps to stop in
 * sympathy with the active lwp.
 */
do {
t->t_proc_flag |= TP_PRSTOP;
aston(t); /* so TP_PRSTOP will be seen */
}
/*
 * If the parent process has been marked to stop on exit
 * from this fork, and its asynchronous-stop flag has not
 * been set, arrange for all other lwps to stop before
 * they return back to user level.
 */
mutex_enter(&p->p_lock);
t = p->p_tlist;
do {
t->t_proc_flag |= TP_PRSTOP;
aston(t); /* so TP_PRSTOP will be seen */
mutex_exit(&p->p_lock);
}
if (PROC_IS_BRANDED(p))
else
/* set return values for parent */
r.r_val2 = 0;
/*
 * pool_barrier_exit() can now be called because the child process has:
 * - all identifying features cloned or set (p_pid, p_task, p_pool)
 * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
 * - any other fields set which are used in resource set binding.
 */
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
/*
 * Now that there are lwps and threads attached, add the new
 * process to the process group.
 */
/*
 * We are now done with all the lwps in the child process.
 */
do {
/*
 * Set the lwp_suspend()ed lwps running.
 * They will suspend properly at syscall exit.
 */
if (t->t_proc_flag & TP_HOLDLWP)
lwp_create_done(t);
else {
/* set TS_CREATE to allow continuelwps() to work */
thread_lock(t);
t->t_schedflag |= TS_CREATE;
thread_unlock(t);
}
if (isvfork) {
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
/*
 * Grab child's p_lock before dropping pidlock to ensure
 * the process will not disappear before we set it running.
 */
sigdefault(cp);
} else {
/*
 * It is CL_FORKRET's job to drop pidlock.
 * If we do it here, the process could be set running
 * and disappear before CL_FORKRET() is called.
 */
}
return (r.r_vals);
/*
 * NOTE(review): the forkerr/forklwperr labels targeted by the goto
 * statements above are absent from this revision; the lines below are
 * the surviving error-path cleanup code.
 */
if (isvfork) {
if (avl_numnodes(&p->p_wpage) != 0) {
/* restore watchpoints to parent */
sizeof (struct watched_page),
}
} else {
}
}
cp->p_lwpdir_sz = 0;
cp->p_tidhash_sz = 0;
/*
 * Detach failed child from task.
 */
if (cp->p_psibling)
mutex_enter(&p->p_lock);
continuelwps(p);
mutex_exit(&p->p_lock);
}
/*
 * Free allocated resources from getproc() if a fork failed.
 */
static void
/*
 * NOTE(review): the declarator line is missing from this revision --
 * presumably this is fork_fail(), which newproc() below calls with a
 * proc_t * -- along with the statements controlled by the if tests
 * below.  Code preserved byte-for-byte; confirm against upstream.
 */
{
/*
 * single threaded, so no locking needed here
 */
if (u.u_rdir)
if (u.u_cwd)
refstr_rele(u.u_cwd);
}
/*
 * Clean up the lwps already created for this child process.
 * The fork failed while duplicating all the lwps of the parent
 * and those lwps already created must be freed.
 * This process is invisible to the rest of the system,
 * so we don't need to hold p->p_lock to protect the list.
 */
static void
forklwp_fail(proc_t *p)
{
kthread_t *t;
/*
 * NOTE(review): the loop header that walks the child's thread list
 * and several statements (list unlinking, pidlock acquisition, state
 * change to TS_FREE) are missing from this revision.  Surviving code
 * preserved byte-for-byte.
 */
/*
 * First remove the lwp from the process's p_tlist.
 */
if (t != t->t_forw)
else
p->p_lwpcnt--;
p->p_zone->zone_nlwps--;
}
lwp_ctmpl_clear(ttolwp(t));
/*
 * Remove the thread from the all threads list.
 * We need to hold pidlock for this.
 */
CL_EXIT(t); /* tell the scheduler that we're exiting */
/*
 * Let the lgroup load averages know that this thread isn't
 * going to show up (i.e. un-do what was done on behalf of
 * this thread by the earlier lgrp_move_thread()).
 */
/*
 * The thread was created TS_STOPPED.
 * We change it to TS_FREE to avoid an
 * ASSERT() panic in thread_free().
 */
thread_rele(t);
thread_free(t);
}
}
/*
 * fork a kernel process.
 */
int
/*
 * NOTE(review): the declarator line is missing from this revision --
 * presumably this is newproc() -- along with the conditional headers
 * that select the kernel-process vs. init-process paths below.
 * Surviving code preserved byte-for-byte; confirm against upstream.
 */
{
proc_t *p;
if (getproc(&p, 1) < 0)
return (EAGAIN);
init_set = rctl_set_create();
/*
 * kernel processes do not inherit /proc tracing flags.
 */
sigemptyset(&p->p_sigmask);
premptyset(&p->p_fltmask);
mutex_enter(&p->p_lock);
e.rcep_t = RCENTITY_PROCESS;
init_gp);
mutex_exit(&p->p_lock);
} else {
if (getproc(&p, 0) < 0)
return (EAGAIN);
/*
 * init creates a new task, distinct from the task
 * containing kernel "processes".
 */
init_set = rctl_set_create();
mutex_enter(&p->p_lock);
task_detach(p);
task_begin(tk, p);
e.rcep_t = RCENTITY_PROCESS;
init_gp);
mutex_exit(&p->p_lock);
}
fork_fail(p);
mutex_enter(&p->p_lock);
task_detach(p);
mutex_exit(&p->p_lock);
pid_exit(p);
return (EAGAIN);
}
B_FALSE);
}
p->p_lwpid = 1;
mutex_enter(&p->p_lock);
mutex_exit(&p->p_lock);
return (0);
}
/*
 * create a child proc struct.
 */
static int
/*
 * NOTE(review): the declarator line is missing from this revision --
 * this is getproc(), called as getproc(&p, kernel) by newproc() above
 * -- along with a large number of interior statements.  Surviving
 * code preserved byte-for-byte; confirm against upstream.
 */
{
return (-1);
return (-1); /* no point in starting new processes */
/*
 * Make proc entry for child process
 */
#if defined(__x86)
#endif
}
goto bad;
}
/*
 * If not privileged make sure that this user hasn't exceeded
 * v.v_maxup processes, and that users collectively haven't
 * exceeded v.v_maxupttl processes.
 */
(nproc >= v.v_maxupttl ||
secpolicy_newproc(cr) != 0) {
"out of per-user processes for uid %d", ruid);
goto bad;
}
/*
 * Everything is cool, put the new proc on the active process list.
 * It is already on the pid list and in /proc.
 * Increment the per uid process count (upcount).
 */
nproc++;
if (PROC_IS_BRANDED(pp))
/*
 * Initialize watchpoint structures
 */
/*
 * Initialize immediate resource control values.
 */
/*
 * Link up to parent-child-sibling chain. No need to lock
 * in general since only a call to freeproc() (done by the
 * same parent as newproc()) diddles with the child chain.
 */
/*
 * Inherit profiling state; do not inherit REALPROF profiling state.
 */
/*
 * Inherit pool pointer from the parent. Kernel processes are
 * always bound to the default pool.
 */
if (kernel) {
} else {
}
/*
 * Add the child process to the current task. Kernel processes
 * are always attached to task0.
 */
if (kernel)
else
/*
 * Duplicate any audit information kept in the process table
 */
#ifdef C2_AUDIT
if (audit_active) /* copy audit data to cp */
#endif
/*
 * Bump up the counts on the file structures pointed at by the
 * parent's file table since the child will point at them too.
 */
if (u.u_rdir)
if (u.u_cwd)
refstr_hold(u.u_cwd);
/*
 * copy the parent's uarea.
 */
/*
 * If inherit-on-fork, copy /proc tracing flags to child.
 */
} else {
}
/*
 * If microstate accounting is being inherited, mark child
 */
/*
 * Inherit fixalignment flag from the parent
 */
return (0);
bad:
#if defined(__x86)
#endif
if (newpid != -1) {
}
/*
 * We most likely got into this situation because some process is
 * forking out of control. As punishment, put it to sleep for a
 * bit so it can't eat the machine alive. Sleep interval is chosen
 * to allow no more than one fork failure per cpu per clock tick
 * on average (yes, I just made this up). This has two desirable
 * properties: (1) it sets a constant limit on the fork failure
 * rate, and (2) the busier the system is, the harsher the penalty
 * for abusing it becomes.
 */
return (-1); /* out of memory or proc slots */
}
/*
 * Release virtual memory.
 * In the case of vfork(), the child was given exclusive access to its
 * parent's address space. The parent is waiting in vfwait() for the
 * child to release its exclusive claim via relvm().
 */
void
relvm()
{
/*
 * NOTE(review): many statements are missing from this revision --
 * including the local declarations, the SVFORK test selecting the
 * vfork path, and the try_again label targeted by the goto below.
 * Surviving code preserved byte-for-byte; confirm against upstream.
 */
prrelvm(); /* inform /proc */
/*
 * The child process is either exec'ing or exit'ing.
 * The child is now separated from the parent's address
 * space. The parent process is made dispatchable.
 *
 * This is a delicate locking maneuver, involving
 * both the parent's p_lock and the child's p_lock.
 * As soon as the SVFORK flag is turned off, the
 * parent is free to run, but it must not run until
 * we wake it up using its p_cv because it might
 * exit and we would be referencing invalid memory.
 * Therefore, we hold the parent with its p_lock
 * while protecting our p_flags with our own p_lock.
 */
prbarrier(p); /* make sure /proc is blocked out */
/*
 * Check if parent is locked by /proc.
 */
/*
 * Delay until /proc is done with the parent.
 * We must drop our (the child's) p->p_lock, wait
 * via prbarrier() on the parent, then start over.
 */
mutex_exit(&p->p_lock);
goto try_again;
}
/*
 * notify hat of change in thread's address space
 */
/*
 * child sizes are copied back to parent because
 * child may have grown.
 */
/*
 * The parent is no longer waiting for the vfork()d child.
 * Restore the parent's watched pages, if any. This is
 * safe because we know the parent is not locked by /proc
 */
sizeof (struct watched_page),
}
mutex_exit(&p->p_lock);
} else {
if (p->p_segacct)
shmexit(p);
/*
 * We grab p_lock for the benefit of /proc
 */
mutex_enter(&p->p_lock);
prbarrier(p); /* make sure /proc is blocked out */
mutex_exit(&p->p_lock);
/*
 * notify hat of change in thread's address space
 */
}
}
}
/*
 * Wait for child to exec or exit.
 * Called by parent of vfork'ed process.
 * See important comments in relvm(), above.
 */
void
/*
 * NOTE(review): the declarator line is missing from this revision --
 * presumably this is vfwait() taking the child's pid -- along with
 * many statements (pidlock/p_lock handling, the SVFORK checks that
 * the break statements below belong to, and the cv_wait calls).
 * Surviving code preserved byte-for-byte; confirm against upstream.
 */
{
int signalled = 0;
/*
 * Wait for child to exec or exit.
 */
for (;;) {
/*
 * Child has exit()ed.
 */
break;
}
/*
 * Grab the child's p_lock before releasing pidlock.
 * Otherwise, the child could exit and we would be
 * referencing invalid memory.
 */
/*
 * Child has exec()ed or is exit()ing.
 */
break;
}
/*
 * We might be waked up spuriously from the cv_wait().
 * We have to do the whole operation over again to be
 * sure the child's SVFORK flag really is turned off.
 * We cannot make reference to the child because it can
 * exit before we return and we would be referencing
 * invalid memory.
 *
 * Because this is potentially a very long-term wait,
 * we call cv_wait_sig() (for its jobcontrol and /proc
 * side-effects) unless there is a current signal, in
 * which case we use cv_wait() because we cannot return
 * from this function until the child has released the
 * address space. Calling cv_wait_sig() with a current
 * signal would lead to an indefinite loop here because
 * cv_wait_sig() returns immediately in this case.
 */
if (signalled)
else
}
/* restore watchpoints to parent */
if (pr_watch_active(pp)) {
}
}