/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2009, Intel Corporation
* All rights reserved.
*/
/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/* All Rights Reserved */
/* Copyright (c) 1987, 1988 Microsoft Corporation */
/* All Rights Reserved */
/*
* Copyright 2016 Joyent, Inc.
*/
#include <sys/asm_linkage.h>
#if defined(__lint)
#else /* __lint */
#include "assym.h"
#endif /* __lint */
/*
 * Non-temporal access (NTA) alignment requirement
*/
/*
* With the introduction of Broadwell, Intel has introduced supervisor mode
* access protection -- SMAP. SMAP forces the kernel to set certain bits to
 * enable access of user pages (AC in rflags, defined as PS_ACHK in
 * <sys/psw.h>). One complication is that many of the userland copy
 * routines directly use the kernel ones. For example, copyin and
* copyout simply go and jump to the do_copy_fault label and traditionally let
* those deal with the return for them. In fact, changing that is a can of frame
* pointers.
*
* Rules and Constraints:
*
* 1. For anything that's not in copy.s, we have it do explicit calls to the
* smap related code. It usually is in a position where it is able to. This is
 * restricted to the following three places: DTrace, resume() in swtch.s and
 * the on_trap()/on_fault() handling code. If you want to add it somewhere
 * else, we should be thinking twice.
*
* 2. We try to toggle this at the smallest window possible. This means that if
* we take a fault, need to try to use a copyop in copyin() or copyout(), or any
* other function, we will always leave with SMAP enabled (the kernel cannot
* access user pages).
*
 * 3. None of the *_noerr() copy routines should toggle SMAP themselves. They are
 * explicitly only allowed to be called while in an on_fault()/no_fault() handler,
 * which already takes care of ensuring that SMAP is enabled and disabled. Note
 * this means that when under an on_fault()/no_fault() handler, one must not
 * call the non-*_noerr() routines (a usage sketch follows this block comment).
*
* 4. The first thing we should do after coming out of an lofault handler is to
* make sure that we call smap_enable again to ensure that we are safely
* protected, as more often than not, we will have disabled smap to get there.
*
* 5. The SMAP functions, smap_enable and smap_disable may not touch any
* registers beyond those done by the call and ret. These routines may be called
* from arbitrary contexts in copy.s where we have slightly more special ABIs in
* place.
*
* 6. For any inline user of SMAP, the appropriate SMAP_ENABLE_INSTR and
* SMAP_DISABLE_INSTR macro should be used (except for smap_enable() and
* smap_disable()). If the number of these is changed, you must update the
* constants SMAP_ENABLE_COUNT and SMAP_DISABLE_COUNT below.
*
* 7. Note, at this time SMAP is not implemented for the 32-bit kernel. There is
* no known technical reason preventing it from being enabled.
*
* 8. Generally this .s file is processed by a K&R style cpp. This means that it
* really has a lot of feelings about whitespace. In particular, if you have a
* macro FOO with the arguments FOO(1, 3), the second argument is in fact ' 3'.
*
* 9. The smap_enable and smap_disable functions should not generally be called.
* They exist such that DTrace and on_trap() may use them, that's it.
*
* 10. In general, the kernel has its own value for rflags that gets used. This
* is maintained in a few different places which vary based on how the thread
* comes into existence and whether it's a user thread. In general, when the
 * kernel takes a trap, it will always set rflags to a known set of flags,
* mainly as part of ENABLE_INTR_FLAGS and F_OFF and F_ON. These ensure that
* PS_ACHK is cleared for us. In addition, when using the sysenter instruction,
 * we mask off PS_ACHK via the AMD_SFMASK MSR. See init_cpu_syscall() for
* where that gets masked off.
*/
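/*
 * As a concrete illustration of rule 3 above, a minimal sketch (not part of
 * this file, hypothetical variable names) of how a kernel caller is expected
 * to use one of the *_noerr() routines: only inside an on_fault()/no_fault()
 * window, which per the rules above owns the SMAP state for the duration.
 *
 *	label_t ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);
 *	}
 *	copyin_noerr(uaddr, kbuf, len);
 *	no_fault();
 *	return (0);
 */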
/*
* The optimal 64-bit bcopy and kcopy for modern x86 processors uses
* "rep smovq" for large sizes. Performance data shows that many calls to
* these small sizes unrolled code is used. For medium sizes loops writing
* 64-bytes per loop are used. Transition points were determined experimentally.
*/
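/*
 * A rough C rendering (illustrative only, not the code below) of the
 * medium-size strategy described above: copy 64 bytes per iteration and hand
 * any tail to the small-size path. Alignment handling and the unrolled
 * small-size and "rep smovq" large-size tiers are omitted.
 *
 *	static void
 *	copy_by_64(const void *from, void *to, size_t n)
 *	{
 *		const uint64_t *f = from;
 *		uint64_t *t = to;
 *
 *		while (n >= 64) {
 *			t[0] = f[0]; t[1] = f[1]; t[2] = f[2]; t[3] = f[3];
 *			t[4] = f[4]; t[5] = f[5]; t[6] = f[6]; t[7] = f[7];
 *			f += 8;
 *			t += 8;
 *			n -= 64;
 *		}
 *		if (n != 0)
 *			bcopy(f, t, n);
 *	}
 */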
/*
* Copy a block of storage, returning an error code if `from' or
* `to' takes a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
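/*
 * Hypothetical caller sketch: unlike bcopy(), which installs no fault
 * handler, kcopy() reports an unresolved pagefault to its caller through a
 * nonzero return value.
 *
 *	if (kcopy(src, dst, len) != 0)
 *		return (EFAULT);
 */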
/*
* I'm sorry about these macros, but copy.s is unsurprisingly sensitive to
* additional call instructions.
*/
#if defined(__amd64)
#define SMAP_DISABLE_COUNT 0
#define SMAP_ENABLE_COUNT 0
#endif
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jb 0f
jnb 1f
1:
#endif
/*
* pass lofault value as 4th argument to do_copy_fault
*/
/*
* A fault during do_copy_fault is indicated through an errno value
* in %rax and we iretq from the trap handler to here.
*/
#ifdef DEBUG
jb 0f
jnb 1f
0: pushl $.kcopy_panic_msg
#endif
/*
* A fault during do_copy_fault is indicated through an errno value
* in %eax and we iret from the trap handler to here.
*/
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/*
* Copy a block of storage. Similar to kcopy but uses non-temporal
* instructions.
*/
/* ARGSUSED */
int
{ return (0); }
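/*
 * Hypothetical caller sketch; the assumption here (not stated in this file)
 * is that the fourth argument selects whether an ordinary cached copy may be
 * used instead of the non-temporal path.
 *
 *	if (kcopy_nta(src, dst, len, 0) != 0)
 *		return (EFAULT);
 */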
#else /* __lint */
#if defined(__amd64)
/* Copy 16 bytes per loop. Uses %rax and %r8 */
#ifdef DEBUG
jb 0f
jnb 1f
1:
#endif
/*
* pass lofault value as 4th argument to do_copy_fault
*/
/*
* Make sure cnt is >= KCOPY_MIN_SIZE
*/
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
/*
* COPY_LOOP_BODY uses %rax and %r8
*/
jnz 2b
/*
* kcopy_nta is not implemented for 32-bit as no performance
* improvement was shown. We simply jump directly to kcopy
 * and discard the 4th argument.
*/
/* COPY_LOOP_BODY needs to use %esi */
jnz 1b
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/* ARGSUSED */
void
{}
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jz 1f
jb 0f
jnb 1f
1:
#endif
/*
* bcopy_altentry() is called from kcopy, i.e., do_copy_fault.
* kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy
* uses these registers in future they must be saved and restored.
*/
#define L(s) .bcopy/**/s
/*
 * Performance data shows many callers copy small buffers. So for
* best perf for these sizes unrolled code is used. Store data without
* worrying about alignment.
*/
.p2align 4
L(fwdPxQx):
.p2align 4
L(P0Q9):
L(P0Q8):
L(P0Q7):
L(P0Q6):
L(P0Q5):
L(P0Q4):
L(P0Q3):
L(P0Q2):
L(P0Q1):
L(P0Q0):
.p2align 4
L(P1Q9):
L(P1Q8):
L(P1Q7):
L(P1Q6):
L(P1Q5):
L(P1Q4):
L(P1Q3):
L(P1Q2):
L(P1Q1):
L(P1Q0):
.p2align 4
L(P2Q9):
L(P2Q8):
L(P2Q7):
L(P2Q6):
L(P2Q5):
L(P2Q4):
L(P2Q3):
L(P2Q2):
L(P2Q1):
L(P2Q0):
.p2align 4
L(P3Q9):
L(P3Q8):
L(P3Q7):
L(P3Q6):
L(P3Q5):
L(P3Q4):
L(P3Q3):
L(P3Q2):
L(P3Q1):
/*
 * These trailing loads/stores have to do all their loads first,
 * then do the stores.
 */
L(P3Q0):
.p2align 4
L(P4Q9):
L(P4Q8):
L(P4Q7):
L(P4Q6):
L(P4Q5):
L(P4Q4):
L(P4Q3):
L(P4Q2):
L(P4Q1):
L(P4Q0):
.p2align 4
L(P5Q9):
L(P5Q8):
L(P5Q7):
L(P5Q6):
L(P5Q5):
L(P5Q4):
L(P5Q3):
L(P5Q2):
L(P5Q1):
L(P5Q0):
.p2align 4
L(P6Q9):
L(P6Q8):
L(P6Q7):
L(P6Q6):
L(P6Q5):
L(P6Q4):
L(P6Q3):
L(P6Q2):
L(P6Q1):
L(P6Q0):
.p2align 4
L(P7Q9):
L(P7Q8):
L(P7Q7):
L(P7Q6):
L(P7Q5):
L(P7Q4):
L(P7Q3):
L(P7Q2):
L(P7Q1):
L(P7Q0):
/*
* For large sizes rep smovq is fastest.
* Transition point determined experimentally as measured on
* Intel Xeon processors (incl. Nehalem and previous generations) and
* AMD Opteron. The transition value is patched at boot time to avoid
* memory reference hit.
*/
.p2align 4
/*
 * Align to an 8-byte boundary. Avoids penalties from unaligned stores
* as well as from stores spanning cachelines.
*/
jz L(aligned_loop)
jz 2f
2:
jz 4f
4:
jz L(aligned_loop)
/*
* Copy 64-bytes per loop
*/
.p2align 4
L(aligned_loop):
jae L(aligned_loop)
/*
* Copy remaining bytes (0-63)
*/
L(do_remainder):
/*
* Use rep smovq. Clear remainder via unrolled code
*/
.p2align 4
L(use_rep):
jnz L(do_remainder)
#undef L
#ifdef DEBUG
/*
* Setup frame on the run-time stack. The end of the input argument
 * area must be aligned on a 16 byte boundary. The stack pointer %rsp
* always points to the end of the latest allocated stack frame.
* panic(const char *format, ...) is a varargs function. When a
* function taking variable arguments is called, %rax must be set
* to eight times the number of floating point parameters passed
* to the function in SSE registers.
*/
#endif
#ifdef DEBUG
jz 1f
jb 0f
jnb 1f
1:
#endif
#endif /* __i386 */
#endif /* __lint */
/*
* Zero a block of storage, returning an error code if we
* take a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
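/*
 * Hypothetical caller sketch: kzero() reports an unresolved pagefault via a
 * nonzero return value, whereas bzero() assumes its range is safely mapped.
 *
 *	if (kzero(kaddr, len) != 0)
 *		return (EFAULT);
 */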
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jnb 0f
0:
#endif
/*
* pass lofault value as 3rd argument for fault return
*/
/*
* A fault during bzero is indicated through an errno value
* in %rax when we iretq to here.
*/
#ifdef DEBUG
jnb 0f
#endif
sstol /* %ecx contains words to clear (%eax=0) */
sstob /* %ecx contains residual bytes to clear */
/*
* A fault during kzero is indicated through an errno value
* in %eax when we iret to here.
*/
#endif /* __i386 */
#endif /* __lint */
/*
* Zero a block of storage.
*/
#if defined(__lint)
/* ARGSUSED */
void
{}
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jnb 0f
0:
#endif
#define L(s) .bzero/**/s
/*
 * Performance data shows many callers are zeroing small buffers. So
* for best perf for these sizes unrolled code is used. Store zeros
* without worrying about alignment.
*/
.p2align 4
L(setPxQx):
.p2align 4
L(P0Q0):
.p2align 4
.p2align 4
.p2align 4
.p2align 4
.p2align 4
.p2align 4
.p2align 4
/*
* Align to a 16-byte boundary. Avoids penalties from unaligned stores
* as well as from stores spanning cachelines. Note 16-byte alignment
 * is better in the case where rep sstoq is used.
*/
.p2align 4
L(ck_align):
jz L(aligned_now)
jz 2f
2:
jz 4f
4:
jz 8f
8:
jz L(aligned_now)
/*
* For large sizes rep sstoq is fastest.
* Transition point determined experimentally as measured on
* Intel Xeon processors (incl. Nehalem) and AMD Opteron.
*/
L(aligned_now):
/*
* zero 64-bytes per loop
*/
.p2align 4
L(bzero_loop):
jae L(bzero_loop)
/*
 * Clear any remaining bytes.
*/
9:
/*
* Use rep sstoq. Clear any remainder via unrolled code
*/
.p2align 4
L(use_rep):
sstoq /* %rcx = words to clear (%rax=0) */
jnz 9b
#undef L
#ifdef DEBUG
jnb 0f
0:
#endif
#endif /* __i386 */
#endif /* __lint */
/*
* Transfer data to and from user space -
* Note that these routines can cause faults
* It is assumed that the kernel has nothing at
* less than KERNELBASE in the virtual address space.
*
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
* Sigh.
*
* So there's two extremely similar routines - xcopyin_nta() and
* xcopyout_nta() which return the errno that we've faithfully computed.
* This allows other callers (e.g. uiomove(9F)) to work correctly.
* Given that these are used pretty heavily, we expand the calling
* sequences inline for all flavours (rather than making wrappers).
*/
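/*
 * A short sketch of the two return conventions discussed above (hypothetical
 * variable names): copyin(9F)/copyout(9F) collapse any failure to -1, while
 * xcopyin_nta()/xcopyout_nta() hand back the errno they computed so that
 * callers such as uiomove(9F) can propagate it. The final xcopyin_nta()
 * argument is assumed here to select cached vs. non-temporal copying and is
 * passed as 0 purely for illustration.
 *
 *	int	error;
 *
 *	if (copyin(uaddr, kbuf, len) != 0)
 *		return (EFAULT);
 *
 *	error = xcopyin_nta(uaddr, kbuf, len, 0);
 *	if (error != 0)
 *		return (error);
 */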
/*
* Copy user data to kernel space.
*/
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
/*
* pass lofault value as 4th argument to do_copy_fault
*/
3:
jz 2f
/*
* reload args for the copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jmp 3f
3:
jz 2f
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
* %rcx is consumed in this routine so we don't need to save
* it.
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jae 4f
/*
* pass lofault value as 4th argument to do_copy_fault
*/
/*
* Make sure cnt is >= XCOPY_MIN_SIZE bytes
*/
jae 5f
6:
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
5:
jnz 6b
4:
jmp 3f
/*
* A fault during do_copy_fault or do_copy_fault_nta is
* indicated through an errno value in %rax and we iret from the
* trap handler to here.
*/
3:
jz 2f
/*
* reload args for the copyop
*/
2: leave
jae 4f
/*
* Make sure cnt is >= XCOPY_MIN_SIZE bytes
*/
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
4:
jmp 3f
/*
* A fault during do_copy_fault or do_copy_fault_nta is
* indicated through an errno value in %eax and we iret from the
* trap handler to here.
*/
3:
jz 2f
/* AMD Software Optimization Guide - Section 6.2 */
#endif /* __i386 */
#endif /* __lint */
/*
* Copy kernel data to user space.
*/
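/*
 * Hypothetical ioctl-style sketch of the common usage: kernel data is staged
 * in a local buffer and handed to copyout(), which fails cleanly if the user
 * address is bad. The structure and argument names are made up.
 *
 *	struct my_info info;
 *
 *	bzero(&info, sizeof (info));
 *	info.version = 1;
 *	if (copyout(&info, (void *)arg, sizeof (info)) != 0)
 *		return (EFAULT);
 *	return (0);
 */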
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
/*
* pass lofault value as 4th argument to do_copy_fault
*/
3:
jz 2f
/*
* reload args for the copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jmp 3f
3:
jz 2f
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jae 4f
/*
* pass lofault value as 4th argument to do_copy_fault
*/
jnz 6f
/*
* Make sure cnt is >= XCOPY_MIN_SIZE bytes
*/
jae 5f
6:
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
5:
jnz 6b
4:
jmp 3f
/*
* A fault during do_copy_fault or do_copy_fault_nta is
* indicated through an errno value in %rax and we iret from the
* trap handler to here.
*/
3:
jz 2f
/*
* reload args for the copyop
*/
2: leave
jae 4f
/*
* Make sure cnt is >= XCOPY_MIN_SIZE bytes
*/
/*
* Make sure src and dst are NTA_ALIGN_SIZE aligned,
* count is COUNT_ALIGN_SIZE aligned.
*/
4:
jmp 3f
/*
* A fault during do_copy_fault or do_copy_fault_nta is
* indicated through an errno value in %eax and we iret from the
* trap handler to here.
*/
3:
jz 2f
/* AMD Software Optimization Guide - Section 6.2 */
#endif /* __i386 */
#endif /* __lint */
/*
* Copy a null terminated string from one point to another in
* the kernel address space.
*/
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jb 0f
jnb 1f
1:
#endif
/* 5th argument to do_copystr */
/* as a non-ABI 6th arg */
#ifdef DEBUG
jb 0f
jnb 1f
0: pushl $.copystr_panic_msg
#endif
/* get the current lofault address */
#endif /* __i386 */
#endif /* __lint */
/*
* Copy a null terminated string from the user address space into
* the kernel address space.
*/
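/*
 * Hypothetical caller sketch: copying a NUL-terminated path from user space.
 * On success the length copied (including the terminating NUL) is returned
 * through the last argument; on failure the errno is returned directly.
 *
 *	char	path[MAXPATHLEN];
 *	size_t	copied;
 *	int	error;
 *
 *	if ((error = copyinstr(upath, path, sizeof (path), &copied)) != 0)
 *		return (error);
 */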
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
/*
* pass lofault value as 5th argument to do_copystr
* do_copystr expects whether or not we need smap in %r10d
*/
jae 4f
4:
jmp 3f
3:
jz 2f
/*
* reload args for the copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jmp 3f
3:
jz 2f
#endif /* __i386 */
#endif /* __lint */
/*
* Copy a null terminated string from the kernel
* address space to the user address space.
*/
#if defined(__lint)
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* save args in case we trap and need to rerun as a copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
/*
* pass lofault value as 5th argument to do_copystr
* pass one as 6th argument to do_copystr in %r10d
*/
jae 4f
4:
jmp 3f
3:
jz 2f
/*
* reload args for the copyop
*/
#ifdef DEBUG
jnb 1f
1:
#endif
jmp 3f
3:
jz 2f
#endif /* __i386 */
#endif /* __lint */
/*
* Since all of the fuword() variants are so similar, we have a macro to spit
* them out. This allows us to create DTrace-unobservable functions easily.
*/
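/*
 * Sketch of how the generated routines are typically used (hypothetical
 * variable names): each fetches or stores a single word in user space,
 * returning 0 on success and -1 on a fault.
 *
 *	uint64_t val;
 *
 *	if (fuword64(uaddr, &val) != 0)
 *		return (EFAULT);
 *	if (suword64(uaddr, val + 1) != 0)
 *		return (EFAULT);
 */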
#if defined(__lint)
#if defined(__amd64)
/* ARGSUSED */
int
{ return (0); }
#endif
/* ARGSUSED */
int
{ return (0); }
/* ARGSUSED */
int
{ return (0); }
/* ARGSUSED */
int
{ return (0); }
#else /* __lint */
#if defined(__amd64)
/*
* Note that we don't save and reload the arguments here
* because their values are not altered in the copy path.
* Additionally, when successful, the smap_enable jmp will
* actually return us to our original caller.
*/
jae 1f; \
ret; \
1: \
jz 2f; \
2: \
ret; \
jae 1f; \
ret; \
1: \
jz 2f; \
2: \
ret; \
#endif /* __i386 */
#endif /* __lint */
/*
* Set user word.
*/
#if defined(__lint)
#if defined(__amd64)
/* ARGSUSED */
int
{ return (0); }
#endif
/* ARGSUSED */
int
{ return (0); }
/* ARGSUSED */
int
{ return (0); }
/* ARGSUSED */
int
{ return (0); }
#else /* lint */
#if defined(__amd64)
/*
* Note that we don't save and reload the arguments here
* because their values are not altered in the copy path.
*/
jae 1f; \
ret; \
1: \
jz 3f; \
3: \
ret; \
jae 1f; \
ret; \
1: \
jz 3f; \
3: \
ret; \
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
#if defined(__amd64)
/*ARGSUSED*/
void
{}
#endif
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
void
{}
#else /* __lint */
#if defined(__amd64)
ret; \
jb 1f; \
ret; \
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
#if defined(__amd64)
/*ARGSUSED*/
void
{}
#endif
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
void
{}
#else /* lint */
#if defined(__amd64)
ret; \
jb 1f; \
1: \
ret; \
#endif /* __i386 */
#endif /* lint */
#if defined(__lint)
/*ARGSUSED*/
int
{ return (0); }
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
int
{ return (0); }
/*ARGSUSED*/
void
{}
/*ARGSUSED*/
int
{ return (0); }
/*ARGSUSED*/
void
{}
#else
#if defined(__amd64)
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
/* ARGSUSED */
void
{}
/* ARGSUSED */
void
{}
/*
* Zero a block of storage in user space
*/
/* ARGSUSED */
void
{}
/*
* copy a block of storage in user space
*/
/* ARGSUSED */
void
{}
/*
* copy a string in user space
*/
/* ARGSUSED */
void
{}
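/*
 * Minimal sketch (hypothetical caller) of the contract stated above: these
 * routines install no fault handler of their own, so they may only be used
 * inside an on_fault()/no_fault() window.
 *
 *	label_t ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);
 *	}
 *	uzero(uaddr, len);
 *	ucopy(usrc, udst, len);
 *	no_fault();
 *	return (0);
 */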
#else /* __lint */
#if defined(__amd64)
#ifdef DEBUG
jae 1f
1:
#endif
#ifdef DEBUG
jae 1f
1:
#endif
/*
 * Note, the frame pointer is required here because do_copystr expects
* to be able to pop it off!
*/
/* do_copystr expects lofault address in %r8 */
/* do_copystr expects whether or not we need smap in %r10 */
#ifdef DEBUG
jae 1f
1:
#endif
#ifdef DEBUG
jae 1f
1:
#endif
jb 1f
1:
jb 1f
1:
jb 2f
2:
/* do_copystr expects the lofault address in %eax */
#endif /* __i386 */
#ifdef DEBUG
.data
.string "kcopy: arguments below kernelbase"
.string "bcopy: arguments below kernelbase"
.string "kzero: arguments below kernelbase"
.string "bzero: arguments below kernelbase"
.string "copyin: kaddr argument below kernelbase"
.string "xcopyin: kaddr argument below kernelbase"
.string "copyout: kaddr argument below kernelbase"
.string "xcopyout: kaddr argument below kernelbase"
.string "copystr: arguments in user space"
.string "copyinstr: kaddr argument not in kernel address space"
.string "copyoutstr: kaddr argument not in kernel address space"
.string "copyin_noerr: argument not in kernel address space"
.string "copyout_noerr: argument not in kernel address space"
#endif
#endif /* __lint */
/*
* These functions are used for SMAP, supervisor mode access protection. They
 * are hotpatched to become real instructions when the system starts up, which
 * is done in mlsetup() as a part of enabling the other CR4 related features.
*
 * Generally speaking, smap_disable() is a stac instruction and smap_enable()
 * is a clac instruction. It's safe to call these any number of times, and in
 * fact, out of paranoia, the kernel will likely call them at several points.
*/
#if defined(__lint)
void
smap_enable(void)
{}
void
smap_disable(void)
{}
#else
#endif /* __amd64 || __i386 */
#endif /* __lint */
#ifndef __lint
.data
.align 4
.long SMAP_ENABLE_COUNT
.long SMAP_DISABLE_COUNT
#endif /* __lint */