/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2008, Intel Corporation
* All rights reserved.
*/
/*
* memcpy.s - copies two blocks of memory
* Implements memcpy() and memmove() libc primitives.
*/
#include <sys/asm_linkage.h>
#include "cache.h"
#include "proc64_id.h"
#define L(s) .memcpy/**/s
/*
* memcpy algorithm overview:
*
* Thresholds used below were determined experimentally.
*
* Pseudo code:
*
* NOTE: On AMD NO_SSE is always set. Performance on Opteron did not improve
* using 16-byte stores. Setting NO_SSE on AMD should be re-evaluated on
* future AMD processors.
*
*
 * If (size <= 128 bytes) {
 *	do unrolled code (primarily 8-byte loads/stores) regardless of
 *	alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > half of the largest level cache) {
 *			Use 8-byte non-temporal stores (64 bytes per loop)
 *		} else {
 *			if (size > 4K && size <= half l1 cache size) {
 *				Use rep movsq
 *			} else {
 *				Use 8-byte loads/stores (64 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (size > half of the largest level cache) {
 *			Use 16-byte non-temporal stores (128 bytes per loop)
 *		} else {
 *			If (both source and destination are aligned) {
 *				Use 16-byte aligned loads and stores
 *				(128 bytes per loop)
 *			} else {
 *				use pairs of xmm registers with SSE2 or SSSE3
 *				instructions to concatenate and shift
 *				appropriately to account for source
 *				unalignment. This enables 16-byte aligned
 *				loads to be done.
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
*
* memmove overview:
 *	memmove is the same as memcpy except for the one case where the
 *	regions overlap and the copy must be done backwards.  The backwards
 *	copy is structured in a similar manner.
*/
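/*
 * Illustrative note: the small-copy path below is a wall of labels of the
 * form P<p>Q<q>; each one copies q quadwords plus p trailing bytes (the
 * unrolled bodies are elided in this listing).  They are reached through a
 * table of self-relative 32-bit offsets, in the same style as
 * L(Fix16EndTable) further down.  The sketch here only shows that dispatch
 * idiom; the table name L(fwd_table) and the register choices are
 * assumptions, not this file's actual code.
 */
#if 0	/* illustrative sketch -- not assembled */
	lea	L(fwd_table)(%rip),%r10	/* base of the offset table */
	movslq	(%r10,%rdx,4),%r9	/* fetch offset for a copy of %rdx bytes */
	lea	(%r9,%r10,1),%r9	/* convert the offset back to an address */
	jmpq	*%r9			/* jump to the matching P?Q? body */

L(fwd_table):				/* one entry per possible size */
	.int	L(P0Q0)-L(fwd_table)	/* 0 bytes: 0 quadwords + 0 bytes */
	.int	L(P1Q0)-L(fwd_table)	/* 1 byte:  0 quadwords + 1 byte  */
	.int	L(P2Q0)-L(fwd_table)	/* 2 bytes: 0 quadwords + 2 bytes */
	/* ... */
#endif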
	jmp	L(CopyForward)		/* no backwards overlap: plain forward copy */
L(CopyForward):
	jg	L(ck_use_sse2)		/* size > 128 bytes: use the bulk-copy paths */
.balign 16
L(ShrtAlignNew):
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
.balign 16
L(P0QG):
L(P0QF):
L(P0QE):
L(P0QD):
L(P0QC):
L(P0QB):
L(P0QA):
L(P0Q9):
L(P0Q8):
L(P0Q7):
L(P0Q6):
L(P0Q5):
L(P0Q4):
L(P0Q3):
L(P0Q2):
L(P0Q1):
L(P0Q0):
.balign 16
L(P1QF):
L(P1QE):
L(P1QD):
L(P1QC):
L(P1QB):
L(P1QA):
L(P1Q9):
L(P1Q8):
L(P1Q7):
L(P1Q6):
L(P1Q5):
L(P1Q4):
L(P1Q3):
L(P1Q2):
L(P1Q1):
L(P1Q0):
.balign 16
L(P2QF):
L(P2QE):
L(P2QD):
L(P2QC):
L(P2QB):
L(P2QA):
L(P2Q9):
L(P2Q8):
L(P2Q7):
L(P2Q6):
L(P2Q5):
L(P2Q4):
L(P2Q3):
L(P2Q2):
L(P2Q1):
L(P2Q0):
.balign 16
L(P3QF):
L(P3QE):
L(P3QD):
L(P3QC):
L(P3QB):
L(P3QA):
L(P3Q9):
L(P3Q8):
L(P3Q7):
L(P3Q6):
L(P3Q5):
L(P3Q4):
L(P3Q3):
L(P3Q2):
L(P3Q1):
	/*
	 * These trailing loads/stores have to do all their loads first,
	 * then do the stores.
	 */
L(P3Q0):
.balign 16
L(P4QF):
L(P4QE):
L(P4QD):
L(P4QC):
L(P4QB):
L(P4QA):
L(P4Q9):
L(P4Q8):
L(P4Q7):
L(P4Q6):
L(P4Q5):
L(P4Q4):
L(P4Q3):
L(P4Q2):
L(P4Q1):
L(P4Q0):
.balign 16
L(P5QF):
L(P5QE):
L(P5QD):
L(P5QC):
L(P5QB):
L(P5QA):
L(P5Q9):
L(P5Q8):
L(P5Q7):
L(P5Q6):
L(P5Q5):
L(P5Q4):
L(P5Q3):
L(P5Q2):
L(P5Q1):
	/*
	 * These trailing loads/stores have to do all their loads first,
	 * then do the stores.
	 */
L(P5Q0):
.balign 16
L(P6QF):
L(P6QE):
L(P6QD):
L(P6QC):
L(P6QB):
L(P6QA):
L(P6Q9):
L(P6Q8):
L(P6Q7):
L(P6Q6):
L(P6Q5):
L(P6Q4):
L(P6Q3):
L(P6Q2):
L(P6Q1):
	/*
	 * These trailing loads/stores have to do all their loads first,
	 * then do the stores.
	 */
L(P6Q0):
.balign 16
L(P7QF):
L(P7QE):
L(P7QD):
L(P7QC):
L(P7QB):
L(P7QA):
L(P7Q9):
L(P7Q8):
L(P7Q7):
L(P7Q6):
L(P7Q5):
L(P7Q4):
L(P7Q3):
L(P7Q2):
L(P7Q1):
	/*
	 * These trailing loads/stores have to do all their loads first,
	 * then do the stores.
	 */
L(P7Q0):
.balign 16
L(ck_use_sse2):
	/*
	 * Align dest to a 16-byte boundary.
	 */
	jnz	L(ShrtAlignNew)		/* dst not yet aligned: fix up the head */
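/*
 * Illustrative sketch of the kind of fix-up the jump above refers to (the
 * real L(ShrtAlignNew) body is elided in this listing): copy just enough
 * leading bytes to bring the destination up to a 16-byte boundary, then
 * rejoin the aligned code.  The register roles (%rdi dst, %rsi src,
 * %rdx remaining count) are assumptions.
 */
#if 0	/* illustrative sketch -- not assembled */
	mov	%rdi,%r8
	and	$0xf,%r8		/* how far dst sits past a 16-byte boundary */
	mov	$0x10,%rcx
	sub	%r8,%rcx		/* bytes needed to reach the boundary */
	sub	%rcx,%rdx		/* they no longer count toward the bulk copy */
9:
	mov	(%rsi),%al		/* byte-at-a-time is fine here: */
	mov	%al,(%rdi)		/* at most 15 bytes are moved */
	inc	%rsi
	inc	%rdi
	dec	%rcx
	jnz	9b
#endif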
L(now_qw_aligned):
	je	L(Loop8byte_pre)	/* NO_SSE method: use the 8-byte loops */

	/*
	 * The fall-through path uses SSE2 16-byte loads/stores.
	 */
	/*
	 * If the current move size is larger than half of the highest level
	 * cache size, then do non-temporal moves.
	 */
	jg	L(sse2_nt_move)		/* too big to be cache friendly */
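/*
 * Illustrative sketch of the check the comment above describes (the real
 * compare is elided): the remaining count is weighed against half of the
 * largest cache level, a value determined when this routine was selected.
 * The symbol name and register choices here are assumptions.
 */
#if 0	/* illustrative sketch -- not assembled */
	mov	L(half_largest_cache)(%rip),%r9	/* hypothetical cached threshold */
	cmp	%r9,%r8			/* remaining count vs. threshold */
	jg	L(sse2_nt_move)		/* too big to be cache friendly */
#endif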
/*
* If both the source and dest are aligned, then use the both aligned
* logic. Well aligned data should reap the rewards.
*/
	jz	1f			/* src is also 16-byte aligned (dst already is) */
1:
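/*
 * Illustrative sketch of the decision described above: the destination is
 * already 16-byte aligned at this point, so only the low four bits of the
 * source address decide between the plain aligned copy and the shifted
 * (concatenating) copy.  A plausible shape of the elided test, not this
 * file's actual instruction sequence.
 */
#if 0	/* illustrative sketch -- not assembled */
	mov	%rsi,%r9
	and	$0xf,%r9		/* low 4 bits of the source address */
	jz	L(pre_both_aligned)	/* zero: src is aligned, take the fast path */
#endif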
	/*
	 * If the src is not 16-byte aligned, fall into the shifted-copy
	 * blocks below (one per possible source offset).
	 */
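/*
 * Illustrative sketch of the concatenate-and-shift idea used by the
 * L(movdqa*) and L(mov3dqa*) blocks that follow (their unrolled bodies are
 * elided in this listing): two 16-byte aligned loads are stitched together
 * so the loads themselves never cross an alignment boundary even though
 * the source pointer does.  A shift count of 8 is shown; each numbered
 * block is specialized for a different source offset.  Here %rsi is
 * assumed to have been rounded down to a 16-byte boundary already, and the
 * register/xmm choices are assumptions.
 */
#if 0	/* illustrative sketch -- not assembled */
	movdqa	(%rsi),%xmm1		/* aligned 16 bytes covering the start */
	movdqa	0x10(%rsi),%xmm3	/* the next aligned 16 bytes */
	movdqa	%xmm3,%xmm2		/* keep a copy for the next round */
	palignr	$8,%xmm1,%xmm3		/* SSSE3: bytes 8-15 of %xmm1, then 0-7 of %xmm3 */
	movdqa	%xmm3,(%rdi)		/* one aligned 16-byte store */
#endif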
.balign 16
.balign 16
L(movdqa1):
jmp L(movdqa_epi)
.balign 16
L(movdqa2):
jmp L(movdqa_epi)
.balign 16
L(movdqa3):
jmp L(movdqa_epi)
.balign 16
L(movdqa4):
jmp L(movdqa_epi)
.balign 16
L(movdqa5):
jmp L(movdqa_epi)
.balign 16
L(movdqa6):
jmp L(movdqa_epi)
.balign 16
L(movdqa7):
jmp L(movdqa_epi)
.balign 16
L(movdqa8):
jmp L(movdqa_epi)
.balign 16
L(movdqa9):
jmp L(movdqa_epi)
.balign 16
L(movdqa10):
jmp L(movdqa_epi)
.balign 16
L(movdqa11):
jmp L(movdqa_epi)
.balign 16
L(movdqa12):
jmp L(movdqa_epi)
.balign 16
L(movdqa13):
jmp L(movdqa_epi)
.balign 16
L(movdqa14):
jmp L(movdqa_epi)
.balign 16
L(movdqa15):
#jmp L(movdqa_epi)
.balign 16
L(movdqa_epi):
.balign 16
L(mov3dqa1):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa2):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa3):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa4):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa5):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa6):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa7):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa9):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa10):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa11):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa12):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa13):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa14):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(mov3dqa15):
jl L(movdqa_epi)
jl L(movdqa_epi)
jmp L(movdqa_epi)
.balign 16
L(sse2_nt_move):
	/*
	 * It doesn't matter whether the source is aligned for data that is
	 * out of cache: the misaligned-load penalty is masked by the
	 * slowness of main memory.  (An illustrative sketch of this kind of
	 * loop follows below.)
	 */
	jge	L(sse2_nt_move)		/* loop while full chunks remain */
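/*
 * Illustrative sketch of a non-temporal copy loop of the kind
 * L(sse2_nt_move) implements (its unrolled body is elided in this
 * listing): unaligned loads, streaming 16-byte stores that bypass the
 * caches, and a final sfence so the weakly-ordered stores are globally
 * visible before returning.  The chunk size and registers here are
 * assumptions; the real loop moves 128 bytes per pass.
 */
#if 0	/* illustrative sketch -- not assembled */
1:
	prefetchnta 0x100(%rsi)		/* start pulling data a few lines ahead */
	movdqu	(%rsi),%xmm0		/* unaligned loads are fine here */
	movdqu	0x10(%rsi),%xmm1
	movntdq	%xmm0,(%rdi)		/* non-temporal 16-byte stores */
	movntdq	%xmm1,0x10(%rdi)
	add	$0x20,%rsi
	add	$0x20,%rdi
	sub	$0x20,%rdx		/* 32 bytes per pass in this sketch */
	jge	1b
	sfence				/* order the streaming stores */
#endif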
.balign 16
L(Fix16EndTable):
.int L(fix16_0)-L(Fix16EndTable)
.int L(fix16_1)-L(Fix16EndTable)
.int L(fix16_2)-L(Fix16EndTable)
.int L(fix16_3)-L(Fix16EndTable)
.balign 16
L(fix16_3):
L(fix16_2):
L(fix16_1):
L(fix16_0):
.balign 16
L(pre_both_aligned):
.balign 16
L(both_aligned):
	/*
	 * 16-byte aligned loads and stores, 128 bytes per iteration.
	 */
	jge	L(both_aligned)		/* loop while full chunks remain */
L(fix_16b):
.balign 16
L(Loop8byte_pre):
	jge	L(byte8_nt_top)		/* > half of the largest cache: 8-byte non-temporal loop */
.balign 16
L(byte8_top):
L(byte8_end):
.balign 16
L(use_rep):
.balign 16
L(byte8_nt_top):
	jge	L(byte8_nt_top)		/* loop while full chunks remain */
.balign 16
L(CopyBackwards):
L(bk_qw_aligned):
.balign 16
L(bk_align):
jle L(bk_qw_aligned)
L(bk_tst2):
L(bk_got2):
L(bk_tst3):
je L(bk_qw_aligned)
L(bk_got3):
jmp L(bk_qw_aligned)
.balign 16
je L(bk_use_rep)
jz L(bk_sse2_cpy)
L(bk_sse2_align):
#jmp L(bk_sse2_cpy)
.balign 16
L(bk_sse2_cpy):
	jge	L(bk_sse2_cpy)		/* loop while full chunks remain */
L(bk_sse2_cpy_end):
.balign 16
L(bk_use_rep):
jz 2f
2:
.balign 16
L(bkP0QI):
L(bkP0QH):
L(bkP0QG):
L(bkP0QF):
L(bkP0QE):
L(bkP0QD):
L(bkP0QC):
L(bkP0QB):
L(bkP0QA):
L(bkP0Q9):
L(bkP0Q8):
L(bkP0Q7):
L(bkP0Q6):
L(bkP0Q5):
L(bkP0Q4):
L(bkP0Q3):
L(bkP0Q2):
L(bkP0Q1):
L(bkP0Q0):
.balign 16
L(bkP1QI):
L(bkP1QH):
L(bkP1QG):
L(bkP1QF):
L(bkP1QE):
L(bkP1QD):
L(bkP1QC):
L(bkP1QB):
L(bkP1QA):
L(bkP1Q9):
L(bkP1Q8):
L(bkP1Q7):
L(bkP1Q6):
L(bkP1Q5):
L(bkP1Q4):
L(bkP1Q3):
L(bkP1Q2):
L(bkP1Q1):
L(bkP1Q0):
.balign 16
L(bkP2QI):
L(bkP2QH):
L(bkP2QG):
L(bkP2QF):
L(bkP2QE):
L(bkP2QD):
L(bkP2QC):
L(bkP2QB):
L(bkP2QA):
L(bkP2Q9):
L(bkP2Q8):
L(bkP2Q7):
L(bkP2Q6):
L(bkP2Q5):
L(bkP2Q4):
L(bkP2Q3):
L(bkP2Q2):
L(bkP2Q1):
L(bkP2Q0):
.balign 16
L(bkP3QI):
L(bkP3QH):
L(bkP3QG):
L(bkP3QF):
L(bkP3QE):
L(bkP3QD):
L(bkP3QC):
L(bkP3QB):
L(bkP3QA):
L(bkP3Q9):
L(bkP3Q8):
L(bkP3Q7):
L(bkP3Q6):
L(bkP3Q5):
L(bkP3Q4):
L(bkP3Q3):
L(bkP3Q2):
L(bkP3Q1):
.balign 16
L(bkP4QI):
L(bkP4QH):
L(bkP4QG):
L(bkP4QF):
L(bkP4QE):
L(bkP4QD):
L(bkP4QC):
L(bkP4QB):
L(bkP4QA):
L(bkP4Q9):
L(bkP4Q8):
L(bkP4Q7):
L(bkP4Q6):
L(bkP4Q5):
L(bkP4Q4):
L(bkP4Q3):
L(bkP4Q2):
L(bkP4Q1):
L(bkP4Q0):
.balign 16
L(bkP5QI):
L(bkP5QH):
L(bkP5QG):
L(bkP5QF):
L(bkP5QE):
L(bkP5QD):
L(bkP5QC):
L(bkP5QB):
L(bkP5QA):
L(bkP5Q9):
L(bkP5Q8):
L(bkP5Q7):
L(bkP5Q6):
L(bkP5Q5):
L(bkP5Q4):
L(bkP5Q3):
L(bkP5Q2):
L(bkP5Q1):
.balign 16
L(bkP6QI):
L(bkP6QH):
L(bkP6QG):
L(bkP6QF):
L(bkP6QE):
L(bkP6QD):
L(bkP6QC):
L(bkP6QB):
L(bkP6QA):
L(bkP6Q9):
L(bkP6Q8):
L(bkP6Q7):
L(bkP6Q6):
L(bkP6Q5):
L(bkP6Q4):
L(bkP6Q3):
L(bkP6Q2):
L(bkP6Q1):
.balign 16
L(bkP7QI):
L(bkP7QH):
L(bkP7QG):
L(bkP7QF):
L(bkP7QE):
L(bkP7QD):
L(bkP7QC):
L(bkP7QB):
L(bkP7QA):
L(bkP7Q9):
L(bkP7Q8):
L(bkP7Q7):
L(bkP7Q6):
L(bkP7Q5):
L(bkP7Q4):
L(bkP7Q3):
L(bkP7Q2):
L(bkP7Q1):
.balign 16