/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#if defined(__lint)
#include <sys/types.h>
#include <sys/archsystm.h>
#else
#include "assym.h"
#endif
/*
* Do block operations using Streaming SIMD extensions
*/
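/*
 * All of these routines end with an mfence so that their non-temporal
 * (cache-bypassing) movntdq/movnti stores are globally visible before
 * they return.  The XMM-using routines borrow the FPU without doing a
 * full context save, so the caller must have kernel preemption
 * disabled, and %cr0.TS tells us whether the current thread has live
 * FPU state to preserve.  A rough sketch of the shared protocol in
 * illustrative C (the names are placeholders; the real work is done
 * by the macros and routines below):
 *
 *	cr0 = getcr0();
 *	clts();				.. XMM use may no longer trap
 *	if (!(cr0 & CR0_TS))
 *		save_xmm_regs();	.. live user state: preserve it
 *	.. non-temporal store loop using the %xmm registers ..
 *	if (!(cr0 & CR0_TS))
 *		restore_xmm_regs();
 *	setcr0(cr0);			.. restore the original TS
 *	mfence();
 */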
#if defined(DEBUG)
#if defined(__amd64)
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg) \
movq %gs:CPU_THREAD, t; \
movsbl T_PREEMPT(t), r32; \
testl r32, r32; \
jne 5f; \
pushq %rbp; \
movq %rsp, %rbp; \
leaq msg(%rip), %rdi; \
xorl %eax, %eax; \
call panic; \
5:
#elif defined(__i386)
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg) \
movl %gs:CPU_THREAD, t; \
movsbl T_PREEMPT(t), r32; \
testl r32, r32; \
jne 5f; \
pushl %ebp; \
movl %esp, %ebp; \
pushl $msg; \
call panic; \
5:
#endif /* __i386 */
#else /* DEBUG */
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif /* DEBUG */
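/*
 * On DEBUG kernels the macro above is roughly the following C,
 * panicking if the caller left kernel preemption enabled:
 *
 *	if (curthread->t_preempt == 0)
 *		panic(msg);
 *
 * On non-DEBUG kernels it expands to nothing.
 */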
#define BLOCKSHIFT 6
#define BLOCKSIZE 64 /* (1 << BLOCKSHIFT) */
#define BLOCKMASK 63 /* (BLOCKSIZE - 1) */
#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error "mucked up constants"
#endif
#if defined(__lint)
/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}
#else /* __lint */
#if defined(__amd64)
#define ADD addq
#define SUB subq
#else
#define ADD addl
#define SUB subl
#endif
#define SAVE_XMM0(r) \
SAVE_XMM_PROLOG(r, 1); \
movdqa %xmm0, (r)
#define ZERO_LOOP_INIT_XMM(dst) \
pxor %xmm0, %xmm0
#define ZERO_LOOP_BODY_XMM(dst, cnt) \
movntdq %xmm0, (dst); \
movntdq %xmm0, 0x10(dst); \
movntdq %xmm0, 0x20(dst); \
movntdq %xmm0, 0x30(dst); \
ADD $BLOCKSIZE, dst; \
SUB $1, cnt
#define ZERO_LOOP_FINI_XMM(dst) \
mfence
#define RSTOR_XMM0(r) \
movdqa 0x0(r), %xmm0; \
RSTOR_XMM_EPILOG(r, 1)
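/*
 * ZERO_LOOP_INIT_XMM ignores its dst argument; pxor simply zeroes
 * %xmm0.  Each ZERO_LOOP_BODY_XMM iteration then issues four 16-byte
 * movntdq non-temporal stores (one full 64-byte block) and decrements
 * the count, and the sub leaves the flags that the caller's jnz
 * tests.  The mfence in ZERO_LOOP_FINI_XMM makes the weakly-ordered
 * stores globally visible.  SAVE_XMM_PROLOG/RSTOR_XMM_EPILOG (from
 * the included headers) provide a 16-byte-aligned save area in r,
 * which movdqa requires.
 */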
#if defined(__amd64)
/*
* %rdi dst
* %rsi size
* %rax saved %cr0 (#if DEBUG then %eax is t->t_preempt)
* %r8 pointer to %xmm register save area
*/
ENTRY(hwblkclr)
pushq %rbp
movq %rsp, %rbp
testl $BLOCKMASK, %edi /* address must be BLOCKSIZE aligned */
jne .dobzero
cmpq $BLOCKSIZE, %rsi /* size must be at least BLOCKSIZE */
jl .dobzero
testq $BLOCKMASK, %rsi /* .. and be a multiple of BLOCKSIZE */
jne .dobzero
shrq $BLOCKSHIFT, %rsi
ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
	movq	%cr0, %rax		/* remember the original %cr0 */
	clts				/* clear TS so XMM use can't trap */
	testl	$CR0_TS, %eax
	jnz	1f			/* TS was set: no live XMM state */
SAVE_XMM0(%r8)
1: ZERO_LOOP_INIT_XMM(%rdi)
9: ZERO_LOOP_BODY_XMM(%rdi, %rsi)
jnz 9b
ZERO_LOOP_FINI_XMM(%rdi)
	testl	$CR0_TS, %eax
	jnz	2f			/* nothing saved, nothing to restore */
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0		/* restore the original TS setting */
leave
ret
.dobzero:
leave
jmp bzero
SET_SIZE(hwblkclr)
#elif defined(__i386)
/*
* %eax dst
* %ecx size in bytes, loop count
* %ebx saved %cr0 (#if DEBUG then t->t_preempt)
* %edi pointer to %xmm register save area
*/
ENTRY(hwblkclr)
movl 4(%esp), %eax
movl 8(%esp), %ecx
testl $BLOCKMASK, %eax /* address must be BLOCKSIZE aligned */
jne .dobzero
cmpl $BLOCKSIZE, %ecx /* size must be at least BLOCKSIZE */
jl .dobzero
testl $BLOCKMASK, %ecx /* .. and be a multiple of BLOCKSIZE */
jne .dobzero
shrl $BLOCKSHIFT, %ecx
movl 0xc(%esp), %edx
pushl %ebx
pushl %esi
ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
popl %esi
movl %cr0, %ebx
clts
testl $CR0_TS, %ebx
jnz 1f
pushl %edi
SAVE_XMM0(%edi)
1: ZERO_LOOP_INIT_XMM(%eax)
9: ZERO_LOOP_BODY_XMM(%eax, %ecx)
jnz 9b
ZERO_LOOP_FINI_XMM(%eax)
testl $CR0_TS, %ebx
jnz 2f
RSTOR_XMM0(%edi)
popl %edi
2: movl %ebx, %cr0
popl %ebx
ret
.dobzero:
jmp bzero
SET_SIZE(hwblkclr)
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}
#else /* __lint */
#define PREFETCH_START(src) \
prefetchnta 0x0(src); \
prefetchnta 0x40(src)
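/*
 * prefetchnta fetches the first two 64-byte lines of the source with
 * a non-temporal hint, hiding the latency of the initial loads while
 * minimizing cache pollution.
 */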
#define SAVE_XMMS(r) \
SAVE_XMM_PROLOG(r, 8); \
movdqa %xmm0, (r); \
movdqa %xmm1, 0x10(r); \
movdqa %xmm2, 0x20(r); \
movdqa %xmm3, 0x30(r); \
movdqa %xmm4, 0x40(r); \
movdqa %xmm5, 0x50(r); \
movdqa %xmm6, 0x60(r); \
movdqa %xmm7, 0x70(r)
#define COPY_LOOP_INIT_XMM(src) \
prefetchnta 0x80(src); \
prefetchnta 0xc0(src); \
movdqa 0x0(src), %xmm0; \
movdqa 0x10(src), %xmm1; \
movdqa 0x20(src), %xmm2; \
movdqa 0x30(src), %xmm3; \
movdqa 0x40(src), %xmm4; \
movdqa 0x50(src), %xmm5; \
movdqa 0x60(src), %xmm6; \
movdqa 0x70(src), %xmm7; \
ADD $0x80, src
#define COPY_LOOP_BODY_XMM(src, dst, cnt) \
prefetchnta 0x80(src); \
prefetchnta 0xc0(src); \
prefetchnta 0x100(src); \
prefetchnta 0x140(src); \
movntdq %xmm0, (dst); \
movntdq %xmm1, 0x10(dst); \
movntdq %xmm2, 0x20(dst); \
movntdq %xmm3, 0x30(dst); \
movdqa 0x0(src), %xmm0; \
movdqa 0x10(src), %xmm1; \
movntdq %xmm4, 0x40(dst); \
movntdq %xmm5, 0x50(dst); \
movdqa 0x20(src), %xmm2; \
movdqa 0x30(src), %xmm3; \
movntdq %xmm6, 0x60(dst); \
movntdq %xmm7, 0x70(dst); \
movdqa 0x40(src), %xmm4; \
movdqa 0x50(src), %xmm5; \
ADD $0x80, dst; \
movdqa 0x60(src), %xmm6; \
movdqa 0x70(src), %xmm7; \
ADD $0x80, src; \
subl $1, cnt
#define COPY_LOOP_FINI_XMM(dst) \
movntdq %xmm0, 0x0(dst); \
movntdq %xmm1, 0x10(dst); \
movntdq %xmm2, 0x20(dst); \
movntdq %xmm3, 0x30(dst); \
movntdq %xmm4, 0x40(dst); \
movntdq %xmm5, 0x50(dst); \
movntdq %xmm6, 0x60(dst); \
movntdq %xmm7, 0x70(dst)
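/*
 * The three macros above form a software-pipelined copy:
 * COPY_LOOP_INIT_XMM loads the first 128 bytes of the page into
 * %xmm0-%xmm7, each COPY_LOOP_BODY_XMM iteration stores those 128
 * bytes with movntdq while reloading the registers from the next 128
 * bytes (and prefetching further ahead), and COPY_LOOP_FINI_XMM
 * stores the final block.  Interleaving stores with the next block's
 * loads hides the load latency.
 */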
#define RSTOR_XMMS(r) \
movdqa 0x0(r), %xmm0; \
movdqa 0x10(r), %xmm1; \
movdqa 0x20(r), %xmm2; \
movdqa 0x30(r), %xmm3; \
movdqa 0x40(r), %xmm4; \
movdqa 0x50(r), %xmm5; \
movdqa 0x60(r), %xmm6; \
movdqa 0x70(r), %xmm7; \
RSTOR_XMM_EPILOG(r, 8)
#if defined(__amd64)
/*
* %rdi src
* %rsi dst
* %rdx #if DEBUG then curthread
* %ecx loop count
 * %rax saved %cr0 (#if DEBUG then %eax is t->t_preempt)
* %r8 pointer to %xmm register save area
*/
ENTRY(hwblkpagecopy)
pushq %rbp
movq %rsp, %rbp
PREFETCH_START(%rdi)
/*
 * PAGESIZE is 4096; each loop iteration moves 128 bytes, i.e.
 * 4096 / 128 = 32 blocks, but the initial load and final store
 * save us one loop count
 */
movl $_CONST(32 - 1), %ecx
ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
movq %cr0, %rax
clts
testl $CR0_TS, %eax
jnz 3f
SAVE_XMMS(%r8)
3: COPY_LOOP_INIT_XMM(%rdi)
4: COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
jnz 4b
COPY_LOOP_FINI_XMM(%rsi)
testl $CR0_TS, %eax
jnz 5f
RSTOR_XMMS(%r8)
5: movq %rax, %cr0
mfence
leave
ret
SET_SIZE(hwblkpagecopy)
#elif defined(__i386)
/*
* %eax src
* %edx dst
* %ecx loop count
 * %ebx saved %cr0 (#if DEBUG then t->t_preempt)
* %edi pointer to %xmm register save area
* %esi #if DEBUG temporary thread pointer
*/
ENTRY(hwblkpagecopy)
movl 4(%esp), %eax
movl 8(%esp), %edx
PREFETCH_START(%eax)
pushl %ebx
/*
* PAGESIZE is 4096, each loop moves 128 bytes, but the initial
* load and final store save us one loop count
*/
movl $_CONST(32 - 1), %ecx
pushl %esi
ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
popl %esi
movl %cr0, %ebx
clts
testl $CR0_TS, %ebx
jnz 3f
pushl %edi
SAVE_XMMS(%edi)
3: COPY_LOOP_INIT_XMM(%eax)
4: COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
jnz 4b
COPY_LOOP_FINI_XMM(%edx)
testl $CR0_TS, %ebx
jnz 5f
RSTOR_XMMS(%edi)
popl %edi
5: movl %ebx, %cr0
popl %ebx
mfence
ret
SET_SIZE(hwblkpagecopy)
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/*
* Version of hwblkclr which doesn't use XMM registers.
* Note that it requires aligned dst and len.
*
* XXPV This needs to be performance tuned at some point.
* Is 4 the best number of iterations to unroll?
*/
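/*
 * Roughly equivalent C for the amd64 variant (illustrative only; each
 * assignment below is really a movnti non-temporal store, and len
 * must be a multiple of the unrolled stride):
 *
 *	void
 *	block_zero_no_xmm(void *dst, int len)
 *	{
 *		uint64_t *p = dst;
 *
 *		for (; len != 0; len -= 32, p += 4)
 *			p[0] = p[1] = p[2] = p[3] = 0;
 *		mfence();
 *	}
 */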
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{}
#else /* __lint */
#if defined(__amd64)
ENTRY(block_zero_no_xmm)
pushq %rbp
movq %rsp, %rbp
	xorl	%eax, %eax		/* %rax = 0, the pattern we store */
	addq	%rsi, %rdi		/* point dst at the end of the buffer */
	negq	%rsi			/* negative index counts up toward zero */
1:
movnti %rax, (%rdi, %rsi)
movnti %rax, 8(%rdi, %rsi)
movnti %rax, 16(%rdi, %rsi)
movnti %rax, 24(%rdi, %rsi)
addq $32, %rsi
jnz 1b
mfence
leave
ret
SET_SIZE(block_zero_no_xmm)
#elif defined(__i386)
ENTRY(block_zero_no_xmm)
pushl %ebp
movl %esp, %ebp
xorl %eax, %eax
movl 8(%ebp), %edx
movl 12(%ebp), %ecx
addl %ecx, %edx
negl %ecx
1:
movnti %eax, (%edx, %ecx)
movnti %eax, 4(%edx, %ecx)
movnti %eax, 8(%edx, %ecx)
movnti %eax, 12(%edx, %ecx)
addl $16, %ecx
jnz 1b
mfence
leave
ret
SET_SIZE(block_zero_no_xmm)
#endif /* __i386 */
#endif /* __lint */
#if defined(__lint)
/*
* Version of page copy which doesn't use XMM registers.
*
* XXPV This needs to be performance tuned at some point.
* Is 4 the right number of iterations to unroll?
* Is the load/store order optimal? Should it use prefetch?
*/
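/*
 * Roughly equivalent C for the amd64 variant (illustrative only; each
 * store below is really a movnti non-temporal store):
 *
 *	void
 *	page_copy_no_xmm(void *dst, void *src)
 *	{
 *		uint64_t *d = dst;
 *		uint64_t *s = src;
 *		size_t i;
 *
 *		for (i = 0; i < MMU_STD_PAGESIZE / sizeof (uint64_t); i++)
 *			d[i] = s[i];
 *		mfence();
 *	}
 */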
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{}
#else /* __lint */
#if defined(__amd64)
ENTRY(page_copy_no_xmm)
movq $MMU_STD_PAGESIZE, %rcx
	addq	%rcx, %rdi		/* point dst at the end of the page */
	addq	%rcx, %rsi		/* point src at the end of the page */
	negq	%rcx			/* negative index counts up toward zero */
1:
movq (%rsi, %rcx), %rax
movnti %rax, (%rdi, %rcx)
movq 8(%rsi, %rcx), %rax
movnti %rax, 8(%rdi, %rcx)
movq 16(%rsi, %rcx), %rax
movnti %rax, 16(%rdi, %rcx)
movq 24(%rsi, %rcx), %rax
movnti %rax, 24(%rdi, %rcx)
addq $32, %rcx
jnz 1b
mfence
ret
SET_SIZE(page_copy_no_xmm)
#elif defined(__i386)
ENTRY(page_copy_no_xmm)
pushl %esi
movl $MMU_STD_PAGESIZE, %ecx
movl 8(%esp), %edx
movl 12(%esp), %esi
addl %ecx, %edx
addl %ecx, %esi
negl %ecx
1:
movl (%esi, %ecx), %eax
movnti %eax, (%edx, %ecx)
movl 4(%esi, %ecx), %eax
movnti %eax, 4(%edx, %ecx)
movl 8(%esi, %ecx), %eax
movnti %eax, 8(%edx, %ecx)
movl 12(%esi, %ecx), %eax
movnti %eax, 12(%edx, %ecx)
addl $16, %ecx
jnz 1b
mfence
popl %esi
ret
SET_SIZE(page_copy_no_xmm)
#endif /* __i386 */
#endif /* __lint */
#if defined(DEBUG) && !defined(__lint)
.text
.not_disabled:
.string "sseblk: preemption not disabled!"
#endif