/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2009, Intel Corporation
* All rights reserved.
*/
/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/* All Rights Reserved */
/* Copyright (c) 1987, 1988 Microsoft Corporation */
/* All Rights Reserved */
/*
* Copyright 2016 Joyent, Inc.
*/
#include <sys/asm_linkage.h>
#if defined(__lint)
#else /* __lint */
#include "assym.h"
#endif /* __lint */
/*
 * Non-temporal access (NTA) alignment requirement
*/
/*
* With the introduction of Broadwell, Intel has introduced supervisor mode
* access protection -- SMAP. SMAP forces the kernel to set certain bits to
 * enable access of user pages (AC in rflags, defined as PS_ACHK in
 * <sys/psw.h>). One complication is that many of the userland copy
 * routines directly use the kernel ones. For example, copyin and
* copyout simply go and jump to the do_copy_fault label and traditionally let
* those deal with the return for them. In fact, changing that is a can of frame
* pointers.
*
* Rules and Constraints:
*
* 1. For anything that's not in copy.s, we have it do explicit calls to the
* smap related code. It usually is in a position where it is able to. This is
 * restricted to the following three places: DTrace, resume() in swtch.s and
 * the on_trap()/on_fault() handling code. If you want to add it somewhere
 * else, we should be thinking twice.
*
* 2. We try to toggle this at the smallest window possible. This means that if
* we take a fault, need to try to use a copyop in copyin() or copyout(), or any
* other function, we will always leave with SMAP enabled (the kernel cannot
* access user pages).
*
 * 3. None of the *_noerr() copy routines should toggle SMAP themselves. They are
 * explicitly only allowed to be called while in an on_fault()/no_fault() handler,
 * which already takes care of ensuring that SMAP is enabled and disabled. Note
 * this means that when under an on_fault()/no_fault() handler, one must not
 * call the non-*_noerr() routines (a usage sketch follows this block comment).
*
* 4. The first thing we should do after coming out of an lofault handler is to
* make sure that we call smap_enable again to ensure that we are safely
* protected, as more often than not, we will have disabled smap to get there.
*
* 5. The SMAP functions, smap_enable and smap_disable may not touch any
* registers beyond those done by the call and ret. These routines may be called
* from arbitrary contexts in copy.s where we have slightly more special ABIs in
* place.
*
* 6. For any inline user of SMAP, the appropriate SMAP_ENABLE_INSTR and
* SMAP_DISABLE_INSTR macro should be used (except for smap_enable() and
* smap_disable()). If the number of these is changed, you must update the
* constants SMAP_ENABLE_COUNT and SMAP_DISABLE_COUNT below.
*
* 7. Note, at this time SMAP is not implemented for the 32-bit kernel. There is
* no known technical reason preventing it from being enabled.
*
* 8. Generally this .s file is processed by a K&R style cpp. This means that it
* really has a lot of feelings about whitespace. In particular, if you have a
* macro FOO with the arguments FOO(1, 3), the second argument is in fact ' 3'.
*
* 9. The smap_enable and smap_disable functions should not generally be called.
* They exist such that DTrace and on_trap() may use them, that's it.
*
* 10. In general, the kernel has its own value for rflags that gets used. This
* is maintained in a few different places which vary based on how the thread
* comes into existence and whether it's a user thread. In general, when the
 * kernel takes a trap, it will always set rflags to a known set of flags,
* mainly as part of ENABLE_INTR_FLAGS and F_OFF and F_ON. These ensure that
* PS_ACHK is cleared for us. In addition, when using the sysenter instruction,
 * we mask off PS_ACHK via the AMD_SFMASK MSR. See init_cpu_syscall() for
* where that gets masked off.
*/
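/*
 * As a concrete illustration of rule 3 above, a minimal sketch (not part of
 * this file, hypothetical variable names) of how a kernel caller is expected
 * to use one of the *_noerr() routines: only inside an on_fault()/no_fault()
 * window, which per the rules above owns the SMAP state for the duration.
 *
 *	label_t ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);
 *	}
 *	copyin_noerr(uaddr, kbuf, len);
 *	no_fault();
 *	return (0);
 */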
/*
* The optimal 64-bit bcopy and kcopy for modern x86 processors uses
* "rep smovq" for large sizes. Performance data shows that many calls to
* these small sizes unrolled code is used. For medium sizes loops writing
* 64-bytes per loop are used. Transition points were determined experimentally.
*/
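/*
 * A rough C rendering (illustrative only, not the code below) of the
 * medium-size strategy described above: copy 64 bytes per iteration and hand
 * any tail to the small-size path. Alignment handling and the unrolled
 * small-size and "rep smovq" large-size tiers are omitted.
 *
 *	static void
 *	copy_by_64(const void *from, void *to, size_t n)
 *	{
 *		const uint64_t *f = from;
 *		uint64_t *t = to;
 *
 *		while (n >= 64) {
 *			t[0] = f[0]; t[1] = f[1]; t[2] = f[2]; t[3] = f[3];
 *			t[4] = f[4]; t[5] = f[5]; t[6] = f[6]; t[7] = f[7];
 *			f += 8;
 *			t += 8;
 *			n -= 64;
 *		}
 *		if (n != 0)
 *			bcopy(f, t, n);
 *	}
 */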
/*
* Copy a block of storage, returning an error code if `from' or
* `to' takes a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
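/*
 * Hypothetical caller sketch: unlike bcopy(), which installs no fault
 * handler, kcopy() reports an unresolved pagefault to its caller through a
 * nonzero return value.
 *
 *	if (kcopy(src, dst, len) != 0)
 *		return (EFAULT);
 */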
/*
* I'm sorry about these macros, but copy.s is unsurprisingly sensitive to
* additional call instructions.
*/
#if defined(__amd64)
#define SMAP_DISABLE_COUNT 0
#define SMAP_ENABLE_COUNT 0
#endif
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jb 0f
jnb 1f
1:
#endif
/*
* pass lofault value as 4th argument to do_copy_fault
*/
/*
* A fault during do_copy_fault is indicated through an errno value
* in %rax and we iretq from the trap handler to here.
*/
#ifdef DEBUG
jb 0f
jnb 1f
0: pushl $.kcopy_panic_msg
#endif
/*
* A fault during do_copy_fault is indicated through an errno value
* in %eax and we iret from the trap handler to here.
*/
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/*
* Copy a block of storage. Similar to kcopy but uses non-temporal
* instructions.
*/
/* ARGSUSED */
int
{ return (0); }
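/*
 * Hypothetical caller sketch; the assumption here (not stated in this file)
 * is that the fourth argument selects whether an ordinary cached copy may be
 * used instead of the non-temporal path.
 *
 *	if (kcopy_nta(src, dst, len, 0) != 0)
 *		return (EFAULT);
 */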
#else /* __lint */
#if defined(__amd64)
/* Copy 16 bytes per loop. Uses %rax and %r8 */
#ifdef DEBUG
jb 0f
jnb 1f
1:
#endif
/*
* pass lofault value as 4th argument to do_copy_fault
*/
/*
* Make sure cnt is >= KCOPY_MIN_SIZE
*/
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
/*
* COPY_LOOP_BODY uses %rax and %r8
*/
jnz 2b
/*
* kcopy_nta is not implemented for 32-bit as no performance
* improvement was shown. We simply jump directly to kcopy
 * and discard the 4th argument.
*/
/* COPY_LOOP_BODY needs to use %esi */
jnz 1b
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/* ARGSUSED */
void
{}
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jz 1f
jb 0f
jnb 1f
1:
#endif
/*
* bcopy_altentry() is called from kcopy, i.e., do_copy_fault.
* kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy
* uses these registers in future they must be saved and restored.
*/
#define L(s) .bcopy/**/s
/*
 * Performance data shows many callers copy small buffers. So for
* best perf for these sizes unrolled code is used. Store data without
* worrying about alignment.
*/
.p2align 4
L(fwdPxQx):
.p2align 4
L(P0Q9):
L(P0Q8):
L(P0Q7):
L(P0Q6):
L(P0Q5):
L(P0Q4):
L(P0Q3):
L(P0Q2):
L(P0Q1):
L(P0Q0):
.p2align 4
L(P1Q9):
L(P1Q8):
L(P1Q7):
L(P1Q6):
L(P1Q5):
L(P1Q4):
L(P1Q3):
L(P1Q2):
L(P1Q1):
L(P1Q0):
.p2align 4
L(P2Q9):
L(P2Q8):
L(P2Q7):
L(P2Q6):
L(P2Q5):
L(P2Q4):
L(P2Q3):
L(P2Q2):
L(P2Q1):
L(P2Q0):
.p2align 4
L(P3Q9):
L(P3Q8):
L(P3Q7):
L(P3Q6):
L(P3Q5):
L(P3Q4):
L(P3Q3):
L(P3Q2):
L(P3Q1):
/*
 * These trailing loads/stores have to do all their loads first,
 * then do the stores.
 */
L(P3Q0):
.p2align 4
L(P4Q9):
L(P4Q8):
L(P4Q7):
L(P4Q6):
L(P4Q5):
L(P4Q4):
L(P4Q3):
L(P4Q2):
L(P4Q1):
L(P4Q0):
.p2align 4
L(P5Q9):
L(P5Q8):
L(P5Q7):
L(P5Q6):
L(P5Q5):
L(P5Q4):
L(P5Q3):
L(P5Q2):
L(P5Q1):
L(P5Q0):
.p2align 4
L(P6Q9):
L(P6Q8):
L(P6Q7):
L(P6Q6):
L(P6Q5):
L(P6Q4):
L(P6Q3):
L(P6Q2):
L(P6Q1):
L(P6Q0):
.p2align 4
L(P7Q9):
L(P7Q8):
L(P7Q7):
L(P7Q6):
L(P7Q5):
L(P7Q4):
L(P7Q3):
L(P7Q2):
L(P7Q1):
L(P7Q0):
/*
* For large sizes rep smovq is fastest.
* Transition point determined experimentally as measured on
* Intel Xeon processors (incl. Nehalem and previous generations) and
* AMD Opteron. The transition value is patched at boot time to avoid
* memory reference hit.
*/
.p2align 4
/*
 * Align to an 8-byte boundary. Avoids penalties from unaligned stores
* as well as from stores spanning cachelines.
*/
jz L(aligned_loop)
jz 2f
2:
jz 4f
4:
jz L(aligned_loop)
/*
* Copy 64-bytes per loop
*/
.p2align 4
L(aligned_loop):
jae L(aligned_loop)
/*
* Copy remaining bytes (0-63)
*/
L(do_remainder):
/*
* Use rep smovq. Clear remainder via unrolled code
*/
.p2align 4
L(use_rep):
jnz L(do_remainder)
#undef L
#ifdef DEBUG
/*
* Setup frame on the run-time stack. The end of the input argument
 * area must be aligned on a 16 byte boundary. The stack pointer %rsp
* always points to the end of the latest allocated stack frame.
* panic(const char *format, ...) is a varargs function. When a
* function taking variable arguments is called, %rax must be set
* to eight times the number of floating point parameters passed
* to the function in SSE registers.
*/
#endif
#ifdef DEBUG
jz 1f
jb 0f
jnb 1f
1:
#endif
#endif /* __i386 */
#endif /* __lint */
/*
* Zero a block of storage, returning an error code if we
* take a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
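/*
 * Hypothetical caller sketch: kzero() reports an unresolved pagefault via a
 * nonzero return value, whereas bzero() assumes its range is safely mapped.
 *
 *	if (kzero(kaddr, len) != 0)
 *		return (EFAULT);
 */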
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jnb 0f
0:
#endif
/*
* pass lofault value as 3rd argument for fault return
*/
/*
* A fault during bzero is indicated through an errno value
* in %rax when we iretq to here.
*/
#ifdef DEBUG
jnb 0f
#endif
sstol /* %ecx contains words to clear (%eax=0) */
sstob /* %ecx contains residual bytes to clear */
/*
* A fault during kzero is indicated through an errno value
* in %eax when we iret to here.
*/
#endif /* __i386 */
#endif /* __lint */
/*
* Zero a block of storage.
*/
#if defined(__lint)
/* ARGSUSED */
void
{}
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jnb 0f
0:
#endif
#define L(s) .bzero/**/s
/*
 * Performance data shows many callers are zeroing small buffers. So
* for best perf for these sizes unrolled code is used. Store zeros
* without worrying about alignment.
*/
.p2align 4
L(setPxQx):
.p2align 4
L(P0Q0):
.p2align 4
.p2align 4
.p2align 4
.p2align 4
.p2align 4
.p2align 4
.p2align 4
/*
* Align to a 16-byte boundary. Avoids penalties from unaligned stores
* as well as from stores spanning cachelines. Note 16-byte alignment
 * is better in the case where rep sstoq is used.
*/
.p2align 4
L(ck_align):
jz L(aligned_now)
jz 2f
2:
jz 4f
4:
jz 8f
8:
jz L(aligned_now)
/*
* For large sizes rep sstoq is fastest.
* Transition point determined experimentally as measured on
* Intel Xeon processors (incl. Nehalem) and AMD Opteron.
*/
L(aligned_now):
/*
* zero 64-bytes per loop
*/
.p2align 4
L(bzero_loop):
jae L(bzero_loop)
/*
 * Clear any remaining bytes.
*/
9:
/*
* Use rep sstoq. Clear any remainder via unrolled code
*/
.p2align 4
L(use_rep):
sstoq /* %rcx = words to clear (%rax=0) */
jnz 9b
#undef L
#ifdef DEBUG
jnb 0f
0:
#endif
#endif /* __i386 */
#endif /* __lint */
/*
* Transfer data to and from user space -
* Note that these routines can cause faults
* It is assumed that the kernel has nothing at
* less than KERNELBASE in the virtual address space.
*
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
* Sigh.
*
* So there's two extremely similar routines - xcopyin_nta() and
* xcopyout_nta() which return the errno that we've faithfully computed.
* This allows other callers (e.g. uiomove(9F)) to work correctly.
* Given that these are used pretty heavily, we expand the calling
* sequences inline for all flavours (rather than making wrappers).
*/
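/*
 * A short sketch of the two return conventions discussed above (hypothetical
 * variable names): copyin(9F)/copyout(9F) collapse any failure to -1, while
 * xcopyin_nta()/xcopyout_nta() hand back the errno they computed so that
 * callers such as uiomove(9F) can propagate it. The final xcopyin_nta()
 * argument is assumed here to select cached vs. non-temporal copying and is
 * passed as 0 purely for illustration.
 *
 *	int	error;
 *
 *	if (copyin(uaddr, kbuf, len) != 0)
 *		return (EFAULT);
 *
 *	error = xcopyin_nta(uaddr, kbuf, len, 0);
 *	if (error != 0)
 *		return (error);
 */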
/*
* Copy user data to kernel space.
*/
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
/*
* pass lofault value as 4th argument to do_copy_fault
*/
3:
jz 2f
/*
* reload args for the copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jmp 3f
3:
jz 2f
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
* %rcx is consumed in this routine so we don't need to save
* it.
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jae 4f
/*
* pass lofault value as 4th argument to do_copy_fault
*/
/*
* Make sure cnt is >= XCOPY_MIN_SIZE bytes
*/
jae 5f
6:
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
5:
jnz 6b
4:
jmp 3f
/*
* A fault during do_copy_fault or do_copy_fault_nta is
* indicated through an errno value in %rax and we iret from the
* trap handler to here.
*/
3:
jz 2f
/*
* reload args for the copyop
*/
2: leave
jae 4f
/*
* Make sure cnt is >= XCOPY_MIN_SIZE bytes
*/
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
4:
jmp 3f
/*
* A fault during do_copy_fault or do_copy_fault_nta is
* indicated through an errno value in %eax and we iret from the
* trap handler to here.
*/
3:
jz 2f
/* AMD Software Optimization Guide - Section 6.2 */
#endif /* __i386 */
#endif /* __lint */
/*
* Copy kernel data to user space.
*/
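/*
 * Hypothetical ioctl-style sketch of the common usage: kernel data is staged
 * in a local buffer and handed to copyout(), which fails cleanly if the user
 * address is bad. The structure and argument names are made up.
 *
 *	struct my_info info;
 *
 *	bzero(&info, sizeof (info));
 *	info.version = 1;
 *	if (copyout(&info, (void *)arg, sizeof (info)) != 0)
 *		return (EFAULT);
 *	return (0);
 */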
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
/*
* pass lofault value as 4th argument to do_copy_fault
*/
3:
jz 2f
/*
* reload args for the copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jmp 3f
3:
jz 2f
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jae 4f
/*
* pass lofault value as 4th argument to do_copy_fault
*/
jnz 6f
/*
* Make sure cnt is >= XCOPY_MIN_SIZE bytes
*/
jae 5f
6:
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
5:
jnz 6b
4:
jmp 3f
/*
* A fault during do_copy_fault or do_copy_fault_nta is
* indicated through an errno value in %rax and we iret from the
* trap handler to here.
*/
3:
jz 2f
/*
* reload args for the copyop
*/
2: leave
jae 4f
/*
* Make sure cnt is >= XCOPY_MIN_SIZE bytes
*/
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
4:
jmp 3f
/*
* A fault during do_copy_fault or do_copy_fault_nta is
* indicated through an errno value in %eax and we iret from the
* trap handler to here.
*/
3:
jz 2f
/* AMD Software Optimization Guide - Section 6.2 */
#endif /* __i386 */
#endif /* __lint */
/*
* Copy a null terminated string from one point to another in
* the kernel address space.
*/
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jb 0f
jnb 1f
1:
#endif
/* 5th argument to do_copystr */
/* as a non-ABI 6th arg */
#ifdef DEBUG
jb 0f
jnb 1f
0: pushl $.copystr_panic_msg
#endif
/* get the current lofault address */
#endif /* __i386 */
#endif /* __lint */
/*
* Copy a null terminated string from the user address space into
* the kernel address space.
*/
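/*
 * Hypothetical caller sketch: copying a NUL-terminated path from user space.
 * On success the length copied (including the terminating NUL) is returned
 * through the last argument; on failure the errno is returned directly.
 *
 *	char	path[MAXPATHLEN];
 *	size_t	copied;
 *	int	error;
 *
 *	if ((error = copyinstr(upath, path, sizeof (path), &copied)) != 0)
 *		return (error);
 */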
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
/*
* pass lofault value as 5th argument to do_copystr
* do_copystr expects whether or not we need smap in %r10d
*/
jae 4f
4:
jmp 3f
3:
jz 2f
/*
* reload args for the copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jmp 3f
3:
jz 2f
#endif /* __i386 */
#endif /* __lint */
/*
* Copy a null terminated string from the kernel
* address space to the user address space.
*/
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
/*
* pass lofault value as 5th argument to do_copystr
* pass one as 6th argument to do_copystr in %r10d
*/
jae 4f
4:
jmp 3f
3:
jz 2f
/*
* reload args for the copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jmp 3f
3:
jz 2f
#endif /* __i386 */
#endif /* __lint */
/*
* Since all of the fuword() variants are so similar, we have a macro to spit
* them out. This allows us to create DTrace-unobservable functions easily.
*/
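/*
 * Sketch of how the generated routines are typically used (hypothetical
 * variable names): each fetches or stores a single word in user space,
 * returning 0 on success and -1 on a fault.
 *
 *	uint64_t val;
 *
 *	if (fuword64(uaddr, &val) != 0)
 *		return (EFAULT);
 *	if (suword64(uaddr, val + 1) != 0)
 *		return (EFAULT);
 */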
#if defined(__lint)
#if defined(__amd64)
/* ARGSUSED */
int
{ return (0); }
#endif
/* ARGSUSED */
int
{ return (0); }
/* ARGSUSED */
int
{ return (0); }
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* Note that we don't save and reload the arguments here
* because their values are not altered in the copy path.
* Additionally, when successful, the smap_enable jmp will
* actually return us to our original caller.
*/
jae 1f; \
ret; \
1: \
jz 2f; \
2: \
ret; \
jae 1f; \
ret; \
1: \
jz 2f; \
2: \
ret; \
#endif /* __i386 */
#endif /* __lint */
/*
* Set user word.
*/
#if defined(__lint)
#if defined(__amd64)
/* ARGSUSED */
int
{ return (0); }
#endif
/* ARGSUSED */
int
{ return (0); }
/* ARGSUSED */
int
{ return (0); }
/* ARGSUSED */
int
{ return (0); }
#else /* lint */
#if defined(__amd64)
/*
* Note that we don't save and reload the arguments here
* because their values are not altered in the copy path.
*/
jae 1f; \
ret; \
1: \
jz 3f; \
3: \
ret; \
jae 1f; \
ret; \
1: \
jz 3f; \
3: \
ret; \
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
#if defined(__amd64)
/*ARGSUSED*/
void
{}
#endif
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
void
{}
#else /* __lint */
#if defined(__amd64)
ret; \
jb 1f; \
ret; \
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
#if defined(__amd64)
/*ARGSUSED*/
void
{}
#endif
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
void
{}
#else /* lint */
#if defined(__amd64)
ret; \
jb 1f; \
1: \
ret; \
#endif /* __i386 */
#endif /* lint */
#if defined(__lint)
/*ARGSUSED*/
int
{ return (0); }
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
int
{ return (0); }
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
int
{ return (0); }
/*ARGSUSED*/
void
{}
#else
#if defined(__amd64)
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
/* ARGSUSED */
void
{}
/* ARGSUSED */
void
{}
/*
* Zero a block of storage in user space
*/
/* ARGSUSED */
void
{}
/*
* copy a block of storage in user space
*/
/* ARGSUSED */
void
{}
/*
* copy a string in user space
*/
/* ARGSUSED */
void
{}
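/*
 * Minimal sketch (hypothetical caller) of the contract stated above: these
 * routines install no fault handler of their own, so they may only be used
 * inside an on_fault()/no_fault() window.
 *
 *	label_t ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);
 *	}
 *	uzero(uaddr, len);
 *	ucopy(usrc, udst, len);
 *	no_fault();
 *	return (0);
 */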
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jae 1f
1:
#endif
#ifdef DEBUG
jae 1f
1:
#endif
/*
 * Note, the frame pointer is required here because do_copystr expects
* to be able to pop it off!
*/
/* do_copystr expects lofault address in %r8 */
/* do_copystr expects whether or not we need smap in %r10 */
#ifdef DEBUG
jae 1f
1:
#endif
#ifdef DEBUG
jae 1f
1:
#endif
jb 1f
1:
jb 1f
1:
jb 2f
2:
/* do_copystr expects the lofault address in %eax */
#endif /* __i386 */
#ifdef DEBUG
.data
.string "kcopy: arguments below kernelbase"
.string "bcopy: arguments below kernelbase"
.string "kzero: arguments below kernelbase"
.string "bzero: arguments below kernelbase"
.string "copyin: kaddr argument below kernelbase"
.string "xcopyin: kaddr argument below kernelbase"
.string "copyout: kaddr argument below kernelbase"
.string "xcopyout: kaddr argument below kernelbase"
.string "copystr: arguments in user space"
.string "copyinstr: kaddr argument not in kernel address space"
.string "copyoutstr: kaddr argument not in kernel address space"
.string "copyin_noerr: argument not in kernel address space"
.string "copyout_noerr: argument not in kernel address space"
#endif
#endif /* __lint */
/*
* These functions are used for SMAP, supervisor mode access protection. They
 * are hotpatched to become real instructions when the system starts up, which
 * is done in mlsetup() as a part of enabling the other CR4 related features.
*
 * Generally speaking, smap_disable() is a stac instruction and smap_enable()
 * is a clac instruction. It's safe to call these any number of times, and in
 * fact, out of paranoia, the kernel will likely call them at several points.
*/
#if defined(__lint)
void
smap_enable(void)
{}
void
smap_disable(void)
{}
#else
#endif /* __amd64 || __i386 */
#endif /* __lint */
#ifndef __lint
.data
.align 4
.long SMAP_ENABLE_COUNT
.long SMAP_DISABLE_COUNT
#endif /* __lint */