/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2008, Intel Corporation
* All rights reserved.
*/
/*
* Portions Copyright 2009 Advanced Micro Devices, Inc.
*/
#include <sys/asm_linkage.h>
#include "cache.h"
#include "proc64_id.h"
#define L(s) .memset/**/s
/*
* memset algorithm overview:
*
* Thresholds used below were determined experimentally.
*
* Pseudo code:
*
* NOTE: On AMD NO_SSE is always set. Performance on Opteron did not improve
* using 16-byte stores. Setting NO_SSE on AMD should be re-evaluated on
* future AMD processors.
*
*
 * If (size <= 144 bytes) {
 *	do unrolled code (primarily 8-byte stores) regardless of alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > largest level cache) {
 *			Use 8-byte non-temporal stores
 *		} else {
 *			if (size >= 2K) {
 *				Use rep sstoq
 *			} else {
 *				Use 8-byte stores (128 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (size <= 192 bytes) {
 *			do unrolled code using primarily 16-byte stores (SSE2)
 *		} else {
 *			If (size > largest level cache) {
 *				Use 16-byte non-temporal stores (SSE2)
 *			} else {
 *				Use 16-byte stores (128 bytes per loop)
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
*/
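/*
 * For reference, the dispatch above can be sketched in C.  This is an
 * illustration of the decision tree only, not the code assembled below;
 * no_sse and largest_level_cache stand in for the values provided by the
 * proc64_id/cache support code, and the larger-than-cache branches are
 * labeled non-temporal to match the L(sse2_nt_move) path below.
 *
 *	#include <stddef.h>
 *
 *	enum memset_path {
 *		PATH_UNROLLED_8BYTE,	// <= 144 bytes, any alignment
 *		PATH_UNROLLED_SSE2,	// <= 192 bytes, 16-byte aligned
 *		PATH_8BYTE_LOOP,	// 8-byte stores, 128 bytes per loop
 *		PATH_REP_SSTOQ,		// rep sstoq for sizes >= 2K
 *		PATH_8BYTE_NT,		// 8-byte non-temporal stores
 *		PATH_SSE2_LOOP,		// 16-byte stores, 128 bytes per loop
 *		PATH_SSE2_NT		// 16-byte non-temporal stores
 *	};
 *
 *	static enum memset_path
 *	memset_pick_path(size_t size, int no_sse, size_t largest_level_cache)
 *	{
 *		if (size <= 144)
 *			return (PATH_UNROLLED_8BYTE);
 *
 *		// destination is 16-byte aligned from here on
 *		if (no_sse) {
 *			if (size > largest_level_cache)
 *				return (PATH_8BYTE_NT);
 *			return (size >= 2048 ?
 *			    PATH_REP_SSTOQ : PATH_8BYTE_LOOP);
 *		}
 *		if (size <= 192)
 *			return (PATH_UNROLLED_SSE2);
 *		if (size > largest_level_cache)
 *			return (PATH_SSE2_NT);
 *		return (PATH_SSE2_LOOP);
 *	}
 */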
L(ck2):
	.balign 16
L(ck_align):
/*
* Align to 16 byte boundary first
*/
	.balign 16
	jmp	L(aligned_now)
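/*
 * For reference, the number of head bytes L(ck_align) above must store to
 * reach a 16-byte boundary can be sketched as (illustration only):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static size_t
 *	bytes_to_align16(const void *dst)
 *	{
 *		// 0 when dst is already 16-byte aligned, else 1..15
 *		return ((16 - ((uintptr_t)dst & 15)) & 15);
 *	}
 */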
L(aligned_now):
/*
* Check memops method
*/
	je	L(Loop8byte_pre)	/* NO_SSE: take the 8-byte store path */
/*
* Use SSE2 instructions
*/
	.balign 16
L(byte32sse2_pre):
	jg	L(sse2_nt_move)	/* size > largest level cache: non-temporal path */
.balign 16
L(byte32sse2):			/* 16-byte SSE2 stores, 128 bytes per loop */
jge L(byte32sse2)
.balign 16
L(sse2_nt_move):		/* 16-byte non-temporal stores (size > largest level cache) */
jge L(sse2_nt_move)
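/*
 * For reference: L(byte32sse2) and L(sse2_nt_move) above store 128 bytes
 * per pass using 16-byte SSE2 stores, with the non-temporal form used when
 * the size exceeds the largest level cache.  A rough C sketch of that loop
 * using SSE2 intrinsics (illustration only, not the code assembled here;
 * the inner loop of eight stores stands in for the stores done per pass,
 * dst is assumed 16-byte aligned and len a multiple of 128):
 *
 *	#include <stddef.h>
 *	#include <emmintrin.h>
 *
 *	static void
 *	sse2_fill(void *dst, int c, size_t len, int use_nt)
 *	{
 *		__m128i v = _mm_set1_epi8((char)c);
 *		char *p = dst;
 *
 *		for (; len >= 128; len -= 128) {
 *			for (int i = 0; i < 8; i++, p += 16) {
 *				if (use_nt)
 *					_mm_stream_si128((__m128i *)p, v);
 *				else
 *					_mm_store_si128((__m128i *)p, v);
 *			}
 *		}
 *		if (use_nt)
 *			_mm_sfence();	// order the streaming stores
 *	}
 */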
/*
* Don't use SSE
*/
.balign 16
L(Loop8byte_pre):
.balign 16
L(Loop8byte):			/* 8-byte stores, 128 bytes per loop */
1:
/*
* Use rep sstoq for sizes > 2K
*/
.balign 16
L(use_rep):
jnz 1b
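/*
 * For reference: "rep sstoq" ("rep stosq" in AT&T/GNU syntax) stores %rax
 * to (%rdi) %rcx times, advancing %rdi by 8 each time.  A rough C sketch
 * of the idiom using GCC-style inline assembly (illustration only; the
 * trailing len & 7 bytes are assumed to be finished separately, as the
 * overview above does via the unrolled code):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	rep_stosq_fill(void *dst, int c, size_t len)
 *	{
 *		// replicate the fill byte into all 8 bytes of a quadword
 *		uint64_t pattern = (uint8_t)c * 0x0101010101010101ULL;
 *		size_t qwords = len >> 3;
 *		void *d = dst;
 *
 *		__asm__ volatile("rep stosq"
 *		    : "+D" (d), "+c" (qwords)	// %rdi and %rcx are updated
 *		    : "a" (pattern)		// %rax holds the pattern
 *		    : "memory");
 *	}
 */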
	.balign 16