/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <sys/auxv.h>
#include <sys/bitmap.h>
#include <sys/brand.h>
#include <sys/inttypes.h>
#include <sys/lwp.h>
#include <sys/syscall.h>
#include <sys/systm.h>
#include <sys/utsname.h>
#include <fcntl.h>
#include <brand_misc.h>
#include <sys/brand.h>
extern brand_sysent_table_t brand_sysent_table[];
/*LINTED: static unused*/
static volatile int brand_abort_err;
/*LINTED: static unused*/
static volatile const char *brand_abort_msg;
/*LINTED: static unused*/
static volatile const char *brand_abort_file;
/*LINTED: static unused*/
static volatile int brand_abort_line;
/*
* Principles of emulation 101.
*
*
* *** Setting errno
*
* Just don't do it. This emulation library is loaded onto a
* seperate link map from the application who's address space we're
* running in. We have our own private copy of libc, so there for,
* the errno value accessible from here is is also private and changing
* it will not affect any errno value that the processes who's address
* space we are running in will see. To return an error condition we
* should return the errno value we'd like the system to return.
* For more information about this see the comments in brand_misc.h.
* Basically, when we return to the caller that initiated the system
* call it's their responsibility to set errno.
*
*
* *** Recursion Considerations
*
* When emulating system calls we need to be very careful about what
* library calls we invoke. Library calls should be kept to a minimum.
* One issue is that library calls can invoke system calls, so if we're
* emulating a system call and we invoke a library call that depends on
* that system call we will probably enter a recursive loop, which would
* be bad.
*
*
* *** Return Values.
*
* See brand_misc.h.
*
* *** Agent lwp considerations
*
* It is currently impossible to do any emulation for these system call
* when they are being invoked on behalf of an agent lwp. To understand why
* it's impossible you have to understand how agent lwp syscalls work.
*
* The agent lwp syscall process works as follows:
* 1 The controlling process stops the target.
* 2 The controlling process injects an agent lwp which is also stopped.
* This agent lwp assumes the userland stack and register values
* of another stopped lwp in the current process.
* 3 The controlling process configures the agent lwp to start
* executing the requested system call.
* 4 The controlling process configure /proc to stop the agent lwp when
* it enters the requested system call.
* 5 The controlling processes allows the agent lwp to start executing.
* 6 The agent lwp traps into the kernel to perform the requested system
* call and immediately stop.
* 7 The controlling process copies all the arguments for the requested
* system call onto the agent lwp's stack.
* 8 The controlling process configures /proc to stop the agent lwp
* when it completes the requested system call.
* 9 The controlling processes allows the agent lwp to start executing.
* 10 The agent lwp executes the system call and then stop before returning
* to userland.
* 11 The controlling process copies the return value and return arguments
* back from the agent lwps stack.
* 12 The controlling process destroys the agent lwp and restarts
* the target process.
*
* The fundamental problem is that when the agent executes the request
* system call in step 5, if we're emulating that system call then the
* lwp is redirected back to our emulation layer without blocking
* in the kernel. But our emulation layer can't access the arguments
* for the system call because they haven't been copied to the stack
* yet and they still only exist in the controlling processes address
* space. This prevents us from being able to do any emulation of
* agent lwp system calls. Hence, currently our brand trap interposition
* callback (XXX_brand_syscall_callback_common) will detect if a system
* call is being made by an agent lwp, and if this is the case it will
* never redirect the system call to this emulation library.
*
* In the future, if this proves to be a problem the the easiest solution
* would probably be to replace the branded versions of these application
* with their native counterparts. Ie, truss, plimit, and pfiles could be
* replace with wrapper scripts that execute the native versions of these
* applications. In the case of plimit and pfiles this should be pretty
* strait forward. Truss would probably be more tricky since it can
* execute applications which would be branded applications, so in that
* case it might be necessary to create a loadable library which could
* be LD_PRELOADed into truss and this library would interpose on the
* exec() system call to allow truss to correctly execute branded
* processes. It should be pointed out that this solution could work
* because "native agent lwps" (ie, agent lwps created by native
* processes) can be treated differently from "branded aged lwps" (ie,
* agent lwps created by branded processes), since native agent lwps
* would presumably be making native system calls and hence not need
* any interposition.
*
* *** General considerations
*
* One of the differences between the lx brand and the s10
* brand, is that the s10 brand only interposes on syscalls
* that need some kind of emulation, whereas the lx brand interposes
* on _all_ system calls. Lx branded system calls that don't need
* any emulation are then redirected back to the kernel from the
* userland library via the IN_KERNEL_SYSCALL macro. The lx-syscall
* dtrace provider depends on this behavior.
*
*/
/*ARGSUSED*/
void
_brand_abort(int err, const char *msg, const char *file, int line)
{
sysret_t rval;
/* Save the error message into convenient globals */
brand_abort_err = err;
brand_abort_msg = msg;
brand_abort_file = file;
brand_abort_line = line;
/* kill ourselves */
abort();
/* If abort() didn't work, try something stronger. */
(void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGKILL);
}
int
brand_uucopy(const void *from, void *to, size_t size)
{
sysret_t rval;
if (__systemcall(&rval, SYS_uucopy + 1024, from, to, size) != 0)
return (EFAULT);
return (0);
}
/*
* ATTENTION: uucopystr() does NOT ensure that string are null terminated!
*/
int
brand_uucopystr(const void *from, void *to, size_t size)
{
sysret_t rval;
if (__systemcall(&rval, SYS_uucopystr + 1024, from, to, size) != 0)
return (EFAULT);
return (0);
}
/*
* This function is defined to be NOSYS but it won't be called from the
* the kernel since the NOSYS system calls are not enabled in the kernel.
* Thus, the only time this function is called is directly from within the
* indirect system call path.
*/
/*ARGSUSED*/
long
brand_unimpl(sysret_t *rv, uintptr_t p1)
{
sysret_t rval;
/*
* We'd like to print out some kind of error message here like
* "unsupported syscall", but we can't because it's not safe to
* assume that stderr or STDERR_FILENO actually points to something
* that is a terminal, and if we wrote to those files we could
* inadvertantly write to some applications open files, which would
* be bad.
*
* Normally, if an application calls an invalid system call
* it get a SIGSYS sent to it. So we'll just go ahead and send
* ourselves a signal here. Note that this is far from ideal since
* if the application has registered a signal handler, that signal
* handler may recieve a ucontext_t as the third parameter to
* indicate the context of the process when the signal was
* generated, and in this case that context will not be what the
* application is expecting. Hence, we should probably create a
* brandsys() kernel function that can deliver the signal to us
* with the correct ucontext_t.
*/
(void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGSYS);
return (ENOSYS);
}
#if defined(__sparc) && !defined(__sparcv9)
/*
* Yuck. For 32-bit sparc applications, handle indirect system calls.
* Note that we declare this interface to use the maximum number of
* system call arguments. If we recieve a system call that uses less
* arguments, then the additional arguments will be garbage, but they
* will also be ignored so that should be ok.
*/
long
brand_indir(sysret_t *rv, int code,
uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4,
uintptr_t a5, uintptr_t a6, uintptr_t a7)
{
brand_sysent_table_t *sst = &(brand_sysent_table[code]);
brand_assert(code < NSYSCALL);
switch (sst->st_args & NARGS_MASK) {
case 0:
return ((sst->st_callc)(rv));
case 1:
return ((sst->st_callc)(rv, a0));
case 2:
return ((sst->st_callc)(rv, a0, a1));
case 3:
return ((sst->st_callc)(rv, a0, a1, a2));
case 4:
return ((sst->st_callc)(rv, a0, a1, a2, a3));
case 5:
return ((sst->st_callc)(rv, a0, a1, a2, a3, a4));
case 6:
return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5));
case 7:
return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6));
case 8:
return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6, a7));
}
brand_abort(0, "invalid entry in brand_sysent_table");
return (EINVAL);
}
#endif /* __sparc && !__sparcv9 */
/*
* Close a libc file handle, but don't actually close the underlying
* file descriptor.
*/
static void
brand_close_fh(FILE *file)
{
int fd, fd_new;
if (file == NULL)
return;
if ((fd = fileno(file)) < 0)
return;
/*
* We're a branded process but our handler isn't installed yet. We
* can't use the dup() syscall since it no longer exists.
*/
fd_new = fcntl(fd, F_DUPFD, 0);
if (fd_new == -1)
return;
(void) fclose(file);
(void) dup2(fd_new, fd);
(void) close(fd_new);
}
/*ARGSUSED*/
void
brand_pre_init()
{
int i;
/* Sanity check our translation table return value codes */
for (i = 0; i < NSYSCALL; i++) {
brand_sysent_table_t *est = &(brand_sysent_table[i]);
brand_assert(BIT_ONLYONESET(est->st_args & RV_MASK));
}
/*
* We need to shutdown all libc stdio. libc stdio normally goes to
* file descriptors, but since we're actually part of a another
* process we don't own these file descriptors and we can't make
* any assumptions about their state.
*/
brand_close_fh(stdin);
brand_close_fh(stdout);
brand_close_fh(stderr);
}
/*ARGSUSED*/
ulong_t
brand_post_init(int version, int argc, char *argv[], char *envp[])
{
sysret_t rval;
brand_proc_reg_t reg;
brand_elf_data_t sed;
auxv_t *ap;
uintptr_t *p;
int err;
/*
* Register our syscall emulation table with the kernel.
* Note that we don't have to do invoke (syscall_number + 1024)
* until we've actually establised a syscall emulation callback
* handler address, which is what we're doing with this brand
* syscall.
*/
reg.sbr_version = version;
#ifdef __x86
reg.sbr_handler = (caddr_t)brand_handler_table;
#else /* !__x86 */
reg.sbr_handler = (caddr_t)brand_handler;
#endif /* !__x86 */
if ((err = __systemcall(&rval, SYS_brand, B_REGISTER, &reg)) != 0) {
brand_abort(err, "Failed to brand current process");
/*NOTREACHED*/
}
/* Get data about the executable we're running from the kernel. */
if ((err = __systemcall(&rval, SYS_brand + 1024,
B_ELFDATA, (void *)&sed)) != 0) {
brand_abort(err,
"Failed to get required brand ELF data from the kernel");
/*NOTREACHED*/
}
/*
* Find the aux vector on the stack.
*/
p = (uintptr_t *)envp;
while (*p != NULL)
p++;
/*
* p is now pointing at the 0 word after the environ pointers.
* After that is the aux vectors.
*
* The aux vectors are currently pointing to the brand emulation
* library and associated linker. We're going to change them to
* point to the brand executable and associated linker (or to no
* linker for static binaries). This matches the process data
* stored within the kernel and visible from /proc, which was
* all setup in sn1_elfexec(). We do this so that when a debugger
* attaches to the process it sees the process as a normal solaris
* process, this brand emulation library and everything on it's
* link map will not be visible, unless our librtld_db plugin
* is used. Note that this is very different from how Linux
* branded processes are implemented within lx branded zones.
* In that situation, the primary linkmap of the process is the
* brand emulation libraries linkmap, not the Linux applications
* linkmap.
*
* We also need to clear the AF_SUN_NOPLM flag from the AT_SUN_AUXFLAGS
* aux vector. This flag told our linker that we don't have a
* primary link map. Now that our linker is done initializing, we
* want to clear this flag before we transfer control to the
* applications copy of the linker, since we want that linker to have
* a primary link map which will be the link map for the application
* we're running.
*/
p++;
for (ap = (auxv_t *)p; ap->a_type != AT_NULL; ap++) {
switch (ap->a_type) {
case AT_BASE:
/* Hide AT_BASE if static binary */
if (sed.sed_base == NULL) {
ap->a_type = AT_IGNORE;
ap->a_un.a_val = NULL;
} else {
ap->a_un.a_val = sed.sed_base;
}
break;
case AT_ENTRY:
ap->a_un.a_val = sed.sed_entry;
break;
case AT_PHDR:
ap->a_un.a_val = sed.sed_phdr;
break;
case AT_PHENT:
ap->a_un.a_val = sed.sed_phent;
break;
case AT_PHNUM:
ap->a_un.a_val = sed.sed_phnum;
break;
case AT_SUN_AUXFLAGS:
ap->a_un.a_val &= ~AF_SUN_NOPLM;
break;
case AT_SUN_EMULATOR:
/*
* ld.so.1 inspects AT_SUN_EMULATOR to see if
* if it is the linker for the brand emulation
* library. Hide AT_SUN_EMULATOR, as the
* linker we are about to jump to is the linker
* for the binary.
*/
ap->a_type = AT_IGNORE;
ap->a_un.a_val = NULL;
break;
case AT_SUN_LDDATA:
/* Hide AT_SUN_LDDATA if static binary */
if (sed.sed_lddata == NULL) {
ap->a_type = AT_IGNORE;
ap->a_un.a_val = NULL;
} else {
ap->a_un.a_val = sed.sed_lddata;
}
break;
default:
break;
}
}
return (sed.sed_ldentry);
}