2N/A * The contents of this file are subject to the terms of the 2N/A * Common Development and Distribution License (the "License"). 2N/A * You may not use this file except in compliance with the License. 2N/A * See the License for the specific language governing permissions 2N/A * and limitations under the License. 2N/A * When distributing Covered Code, include this CDDL HEADER in each 2N/A * If applicable, add the following below this CDDL HEADER, with the 2N/A * fields enclosed by brackets "[]" replaced with your own identifying 2N/A * information: Portions Copyright [yyyy] [name of copyright owner] 2N/A * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. 2N/A * memcpy(s1, s2, len) 2N/A * Copy s2 to s1, always copy n bytes. 2N/A * Note: this C code does not work for overlapped copies. 2N/A * Memmove() and bcopy() do. 2N/A * Fast assembler language version of the following C-program for memcpy 2N/A * which represents the `standard' for the C-library. 2N/A * memcpy(void *s, const void *s0, size_t n) 2N/A * const char *s2 = s0; 2N/A * } while (--n != 0); 2N/A * Handle all cases where src and dest are aligned on word 2N/A * or long word boundaries. Use unrolled loops for better 2N/A * performance. This option wins over standard large data 2N/A * move when source and destination is in cache for medium 2N/A * to short data moves. 2N/A * Special case for handling when src and dest are both long word aligned 2N/A * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD 2N/A ! in SRC compared to in DST 2N/A ! Examples: Let # denote bytes that should not be accessed 2N/A ! Let x denote a byte already copied to align DST 2N/A ! Let . and - denote bytes not yet copied 2N/A ! Let | denote double alignment boundaries 2N/A ! DST: ######xx|........|--------|..###### o2 = 18 2N/A ! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8 2N/A ! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8 2N/A ! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8 2N/A alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1 2N/A movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0 2N/A andn %o5, 7, %o5 ! 8 byte aligned count 2N/A and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned 2N/A brgez,a %o3, .beginmedloop 2N/A add %o1, %o3, %o1 ! back up o1 2N/A ldda [%o1]ASI_FL8_P, %d2 2N/A bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's
in d2 2N/A bz,pt %ncc, .mediumexit 2N/A bz,pt %ncc, .mediumexit 2N/A wr %o4, %g0, %fprs ! fprs = o4 restore fprs 2N/A .align ICACHE_LINE_SIZE 2N/A ! %o0 DST, 8 byte aligned 2N/A ! %o1 SRC, 8 byte aligned 2N/A ! %o2 count (number of bytes to be moved) 2N/A ! %o3, %o4, %o5 available as temps 2N/A set BST_THRESHOLD, %o5 2N/A bgu,pn %ncc, .xlarge_long 2N/A prefetch [%o1 + (16 * BLOCK_SIZE)], #n_reads 2N/A subcc %o2, (16 * BLOCK_SIZE) + 63, %o2 ! adjust length to allow 2N/A ! cc test for end of loop 2N/A ble,pn %ncc, .largel_no ! skip big loop if no more prefetches 2N/A prefetch [%o0 + (16 * BLOCK_SIZE)], #n_writes 2N/A prefetch [%o1 + (20 * BLOCK_SIZE)], #n_reads 2N/A prefetch [%o0 + (20 * BLOCK_SIZE)], #n_writes 2N/A ldx [%o1], %o4 ! load 2N/A subcc %o2, 64, %o2 ! decrement length count 2N/A stx %o4, [%o0] ! and store 2N/A ldx [%o1+8], %o3 ! a block of 64 bytes 2N/A ldx [%o1+16], %o4 ! a block of 64 bytes 2N/A ldx [%o1+24], %o3 ! a block of 64 bytes 2N/A ldx [%o1+32], %o4 ! a block of 64 bytes 2N/A ldx [%o1+40], %o3 ! a block of 64 bytes 2N/A add %o1, 64, %o1 ! increase src ptr by 64 2N/A add %o0, 64, %o0 ! increase dst ptr by 64 2N/A bgu,pt %ncc, .largel64p ! repeat if at least 64 bytes left 2N/A add %o2, (16 * BLOCK_SIZE), %o2 2N/A.largel64: ! finish with no more prefetches 2N/A ldx [%o1], %o4 ! load 2N/A subcc %o2, 64, %o2 ! decrement length count 2N/A stx %o4, [%o0] ! and store 2N/A ldx [%o1+8], %o3 ! a block of 64 bytes 2N/A ldx [%o1+16], %o4 ! a block of 64 bytes 2N/A ldx [%o1+24], %o3 ! a block of 64 bytes 2N/A ldx [%o1+32], %o4 ! a block of 64 bytes 2N/A ldx [%o1+40], %o3 ! a block of 64 bytes 2N/A add %o1, 64, %o1 ! increase src ptr by 64 2N/A add %o0, 64, %o0 ! increase dst ptr by 64 2N/A bgu,pt %ncc, .largel64 ! repeat if at least 64 bytes left 2N/A addcc %o2, 32, %o2 ! adjust finish count 2N/A ble,pt %ncc, .largel31 2N/A ldx [%o1], %o4 ! load 2N/A sub %o2, 32, %o2 ! decrement length count 2N/A stx %o4, [%o0] ! and store 2N/A ldx [%o1+8], %o3 ! a block of 32 bytes 2N/A add %o1, 32, %o1 ! increase src ptr by 32 2N/A add %o0, 32, %o0 ! increase dst ptr by 32 2N/A addcc %o2, 16, %o2 ! adjust remaining count 2N/A ble,pt %ncc, .largel15 ! skip if 15 or fewer bytes left 2N/A ldx [%o1], %o4 ! load and store 16 bytes 2N/A add %o1, 16, %o1 ! increase src ptr by 16 2N/A sub %o2, 16, %o2 ! decrease count by 16 2N/A add %o0, 16, %o0 ! increase dst ptr by 16 2N/A addcc %o2, 15, %o2 ! restore count 2N/A bz,pt %ncc, .medwexit ! exit if finished 2N/A blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left 2N/A ldx [%o1], %o4 ! load 8 bytes 2N/A add %o1, 8, %o1 ! increase src ptr by 8 2N/A subcc %o2, 8, %o2 ! decrease count by 8 2N/A bz,pt %ncc, .medwexit ! exit if finished 2N/A stx %o4, [%o0] ! and store 8 bytes 2N/A add %o0, 8, %o0 ! increase dst ptr by 8 2N/A .align ICACHE_LINE_SIZE 2N/A ! %o0 I/O DST is 64-byte aligned 2N/A prefetch [%o1 + (20 * BLOCK_SIZE)], #n_reads 2N/A prefetch [%o0 + (20 * BLOCK_SIZE)], #n_writes 2N/A faligndata %f12, %f14, %f44 2N/A faligndata %f14, %f0, %f46 2N/A faligndata %f0, %f2, %f32 2N/A faligndata %f2, %f4, %f34 2N/A std %f36, [%o0 + 16] 2N/A faligndata %f4, %f6, %f36 2N/A std %f38, [%o0 + 24] 2N/A ldd [%o1 + 32], %f10 2N/A faligndata %f6, %f8, %f38 2N/A std %f40, [%o0 + 32] 2N/A ldd [%o1 + 40], %f12 2N/A faligndata %f8, %f10, %f40 2N/A std %f42, [%o0 + 40] 2N/A ldd [%o1 + 48], %f14 2N/A sub %o2, BLOCK_SIZE, %o2 ! update count 2N/A faligndata %f10, %f12, %f42 2N/A std %f44, [%o0 + 48] 2N/A add %o0, BLOCK_SIZE, %o0 ! update DST 2N/A cmp %o2, BLOCK_SIZE + 8 2N/A add %o1, BLOCK_SIZE, %o1 ! update SRC 2N/A faligndata %f12, %f14, %f44 2N/A faligndata %f14, %f0, %f46 2N/A std %f36, [%o0 + 16] 2N/A std %f38, [%o0 + 24] 2N/A std %f40, [%o0 + 32] 2N/A std %f42, [%o0 + 40] 2N/A std %f44, [%o0 + 48] 2N/A std %f46, [%o0 + 56] ! store 64 bytes 2N/A bne %ncc, 2f ! exactly 1 block remaining? 2N/A add %o0, BLOCK_SIZE, %o0 ! update DST 2N/A brz,a %o3, 3f ! is SRC double aligned? 2N/A add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 2N/A andn %o5, 7, %o5 ! 8 byte aligned count 2N/A ! This is when there is exactly 1 block remaining and SRC is aligned 2N/A ldd [%o1 + 0x8], %f4 2N/A ldd [%o1 + 0x10], %f6 2N/A ldd [%o1 + 0x18], %f8 2N/A ldd [%o1 + 0x20], %f10 2N/A ldd [%o1 + 0x28], %f12 2N/A ldd [%o1 + 0x30], %f14 2N/A std %f36, [%o0 + 16] 2N/A std %f38, [%o0 + 24] 2N/A std %f40, [%o0 + 32] 2N/A std %f42, [%o0 + 40] 2N/A std %f44, [%o0 + 48] 2N/A std %f46, [%o0 + 56] ! store 64 bytes 2N/A ! long word aligned, larger than Block store threshold 2N/A ! %o0 DST, 8 byte aligned 2N/A ! %o1 SRC, 8 byte aligned 2N/A ! %o2 count (number of bytes to be moved) 2N/A ! %o3, %o4, %o5 available as temps 2N/A ! prefetch through %o1 + (12* BLOCK_SIZE) has been done 2N/A ! Need to align DST to 64 byte boundary for block stores 2N/A bz,pt %ncc, .xlarge_aligned 2N/A bnz,pt %ncc, .xlarge_a 2N/A ! DST is now on 64 byte boundary 2N/A prefetch [%o1 + (20 * BLOCK_SIZE)], #one_read 2N/A rd %fprs, %o4 ! check for unused FPU 2N/A andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0 2N/A bz,a %ncc, .xlarge_loop 2N/A wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 2N/A prefetch [%o1 + (4 * BLOCK_SIZE)], #n_reads 2N/A ldd [%o1 + 40], %d10 2N/A ldd [%o1 + 48], %d12 2N/A stda %d0, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache 2N/A bgt,pt %ncc, .xlarge_loop 2N/A prefetch [%o1 + (20 * BLOCK_SIZE)], #one_read 2N/A membar #StoreLoad|#StoreStore ! needed after final blk store 2N/A bgt,pt %ncc, .largel64 2N/A ! %o0 I/O DST is 64-byte aligned 2N/A faligndata %f12, %f14, %f44 2N/A ldd [%o1 + 0x8], %f4 2N/A faligndata %f14, %f0, %f46 2N/A stda %f32, [%o0]ASI_BLK_P 2N/A sub %o2, BLOCK_SIZE, %o2 ! update count 2N/A ldd [%o1 + 0x10], %f6 2N/A faligndata %f0, %f2, %f32 2N/A ldd [%o1 + 0x18], %f8 2N/A faligndata %f2, %f4, %f34 2N/A ldd [%o1 + 0x20], %f10 2N/A faligndata %f4, %f6, %f36 2N/A ldd [%o1 + 0x28], %f12 2N/A faligndata %f6, %f8, %f38 2N/A ldd [%o1 + 0x30], %f14 2N/A prefetch [%o1 + (4 * BLOCK_SIZE)], #n_reads 2N/A faligndata %f8, %f10, %f40 2N/A ldd [%o1 + 0x38], %f0 2N/A faligndata %f10, %f12, %f42 2N/A prefetch [%o1 + (20 * BLOCK_SIZE)], #one_read 2N/A add %o0, BLOCK_SIZE, %o0 ! update DST 2N/A cmp %o2, BLOCK_SIZE + 8 2N/A add %o1, BLOCK_SIZE, %o1 ! update SRC 2N/A faligndata %f12, %f14, %f44 2N/A faligndata %f14, %f0, %f46 2N/A stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache 2N/A bne %ncc, 2f ! exactly 1 block remaining? 2N/A add %o0, BLOCK_SIZE, %o0 ! update DST 2N/A brz,a %o3, 3f ! is SRC double aligned? 2N/A add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 2N/A membar #StoreLoad|#StoreStore ! needed after final blk store 2N/A andn %o5, 7, %o5 ! 8 byte aligned count 2N/A ! This is when there is exactly 1 block remaining and SRC is aligned 2N/A ldd [%o1 + 0x8], %f4 2N/A ldd [%o1 + 0x10], %f6 2N/A ldd [%o1 + 0x18], %f8 2N/A ldd [%o1 + 0x20], %f10 2N/A ldd [%o1 + 0x28], %f12 2N/A ldd [%o1 + 0x30], %f14 2N/A stda %f32, [%o0]ASI_BLK_P 2N/A membar #StoreLoad|#StoreStore ! needed after final blk store