opl_olympus_copy.s revision c8a722abb8fd974fb16523acbd90ea75d5dcbeb2
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * CDDL HEADER START
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The contents of this file are subject to the terms of the
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Common Development and Distribution License (the "License").
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * You may not use this file except in compliance with the License.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * See the License for the specific language governing permissions
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * and limitations under the License.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * When distributing Covered Code, include this CDDL HEADER in each
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * If applicable, add the following below this CDDL HEADER, with the
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * fields enclosed by brackets "[]" replaced with your own identifying
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * information: Portions Copyright [yyyy] [name of copyright owner]
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * CDDL HEADER END
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Use is subject to license terms.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte#pragma ident "%Z%%M% %I% %E% SMI"
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte#if !defined(lint)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte#endif /* lint */
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Pseudo-code to aid in understanding the control flow of the copy routines:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! Determine whether to use the FP register version
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! or the leaf routine version depending on size
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! of copy and flags. Set up error handling accordingly.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! The transition point depends on whether the src and
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! dst addresses can be aligned to long word, word,
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! half word, or byte boundaries.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! WARNING: <Register usage convention>
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! For FP version, %l6 holds previous error handling and
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! a flag: TRAMP_FLAG (low bits)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! for leaf routine version, %o4 holds those values.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! So either %l6 or %o4 is reserved and not available for
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! any other use.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy; ! to speed short copies
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! src, dst long word alignable
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (hw_copy_limit_8 == 0) ! hw_copy disabled
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (length <= hw_copy_limit_8)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to FPBLK_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (src,dst not alignable) {
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (hw_copy_limit_1 == 0) ! hw_copy disabled
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (length <= hw_copy_limit_1)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to FPBLK_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (src,dst halfword alignable) {
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (hw_copy_limit_2 == 0) ! hw_copy disabled
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (length <= hw_copy_limit_2)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to FPBLK_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (src,dst word alignable) {
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (hw_copy_limit_4 == 0) ! hw_copy disabled
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (length <= hw_copy_limit_4)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to small_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to FPBLK_copy;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * small_copy:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Setup_leaf_rtn_error_handler; ! diffs for each entry point
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (count <= 3) ! fast path for tiny copies
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to sm_left; ! special finish up code
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (count > CHKSIZE) ! medium sized copies
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to sm_med ! tuned by alignment
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if(src&dst not both word aligned) {
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * sm_movebytes:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move byte by byte in 4-way unrolled loop
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * fall into sm_left;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move 0-3 bytes byte at a time as needed.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * restore error handler and exit.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * } else { ! src&dst are word aligned
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * check for at least 8 bytes left,
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move word at a time, unrolled by 2
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * when fewer than 8 bytes left,
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * sm_half: move half word at a time while 2 or more bytes left
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * sm_byte: move final byte if necessary
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * restore error handler and exit.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! Medium length cases with at least CHKSIZE bytes available
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! method: line up src and dst as best possible, then
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! move data in 4-way unrolled loops.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if(src&dst unalignable)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to sm_movebytes
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if(src&dst halfword alignable)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to sm_movehalf
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if(src&dst word alignable)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * go to sm_moveword
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! fall into long word movement
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move bytes until src is word aligned
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if not long word aligned, move a word
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move long words in 4-way unrolled loop until < 32 bytes left
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move long words in 1-way unrolled loop until < 8 bytes left
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if zero bytes left, goto sm_exit
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if one byte left, go to sm_byte
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * else go to sm_half
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * sm_moveword:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move bytes until src is word aligned
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move words in 4-way unrolled loop until < 16 bytes left
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move words in 1-way unrolled loop until < 4 bytes left
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if zero bytes left, goto sm_exit
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if one byte left, go to sm_byte
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * else go to sm_half
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * sm_movehalf:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move a byte if needed to align src on halfword
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * move halfwords in 4-way unrolled loop until < 8 bytes left
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if zero bytes left, goto sm_exit
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if one byte left, go to sm_byte
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * else go to sm_half
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * FPBLK_copy:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * %l6 = curthread->t_lofault;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (%l6 != NULL) {
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * membar #Sync
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * curthread->t_lofault = .copyerr;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * caller_error_handler = TRUE ! %l6 |= 2
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! for FPU testing we must not migrate cpus
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (curthread->t_lwp == NULL) {
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! Kernel threads do not have pcb's in which to store
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! the floating point state, so disallow preemption during
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! the copy. This also prevents cpu migration.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * kpreempt_disable(curthread);
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * thread_nomigrate();
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * old_fprs = %fprs;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * old_gsr = %gsr;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (%fprs.fef) {
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * %fprs.fef = 1;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * save current fpregs on stack using blockstore
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * %fprs.fef = 1;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * do_blockcopy_here;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * In lofault handler:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * curthread->t_lofault = .copyerr2;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Continue on with the normal exit handler
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * On normal exit:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * %gsr = old_gsr;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (old_fprs & FPRS_FEF)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * restore fpregs from stack using blockload
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * zero fpregs
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * %fprs = old_fprs;
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * membar #Sync
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * curthread->t_lofault = (%l6 & ~3);
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! following test omitted from copyin/copyout as they
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ! will always have a current thread
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * if (curthread->t_lwp == NULL)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * kpreempt_enable(curthread);
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * thread_allowmigrate();
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * return (0)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * In second lofault handler (.copyerr2):
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * We've tried to restore fp state from the stack and failed. To
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * avoid returning with a corrupted fp state, we will panic.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Comments about optimization choices
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The initial optimization decision in this code is to determine
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * whether to use the FP registers for a copy or not. If we don't
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * use the FP registers, we can execute the copy as a leaf routine,
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * saving a register save and restore. Also, less elaborate setup
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * is required, allowing short copies to be completed more quickly.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * For longer copies, especially unaligned ones (where the src and
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * dst do not align to allow simple ldx,stx operation), the FP
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * registers allow much faster copy operations.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The estimated extra cost of the FP path will vary depending on
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * src/dst alignment, dst offset from the next 64 byte FPblock store
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * boundary, remaining src data after the last full dst cache line is
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * moved, whether the FP registers need to be saved, and some other
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * minor issues. The average additional overhead is estimated to be
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 400 clocks. Since each non-repeated/predicted tst and branch costs
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * around 10 clocks, elaborate calculation would slow down all
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * longer copies and only benefit a small portion of medium sized
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * copies. Rather than incur such cost, we chose fixed transition
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * points for each of the alignment choices.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * For the inner loop, here is a comparison of the per cache line
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * costs for each alignment when src&dst are in cache:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * byte aligned: 108 clocks slower for non-FPBLK
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * half aligned: 44 clocks slower for non-FPBLK
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * word aligned: 12 clocks slower for non-FPBLK
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * long aligned: 4 clocks >>faster<< for non-FPBLK
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The long aligned loop runs faster because it does no prefetching.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * That wins if the data is in cache or there is too little
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * data to gain much benefit from prefetching. But when there
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * is more data and that data is not in cache, failing to prefetch
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * can run much slower. In addition, there is a 2 Kbyte store queue
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * which will cause the non-FPBLK inner loop to slow for larger copies.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The exact tradeoff is strongly load and application dependent, with
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * increasing risk of a customer visible performance regression if the
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * upper limit for the non-FPBLK code. To minimize performance regression
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * risk while still gaining the primary benefits of the improvements to
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_*. Later experimental studies using different values
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * of hw_copy_limit_* can be used to make further adjustments if
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * appropriate.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_8 = src and dst are longword aligned
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * To say that src and dst are word aligned means that after
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * some initial alignment activity of moving 0 to 3 bytes,
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * both the src and dst will be on word boundaries so that
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * word loads and stores may be used.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Default values as of May 2005 are:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_1 = 256
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_2 = 512
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_4 = 1024
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hw_copy_limit_8 = 1024 (or 1536 on some systems)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * disabled for that alignment choice.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * the value of VIS_COPY_THRESHOLD is used.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * It is not envisioned that hw_copy_limit_? will be changed in the field.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * It is provided to allow for disabling FPBLK copies and to allow
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * easy testing of alternate values on future HW implementations
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * that might have different cache sizes, clock rates or instruction
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * timing rules.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * threshold to speed up all shorter copies (less than 256). That
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * saves an alignment test, memory reference, and enabling test
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * for all short copies, or an estimated 24 clocks.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The order in which these limits are checked does matter since each
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * non-predicted tst and branch costs around 10 clocks.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * If src and dst are randomly selected addresses,
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 4 of 8 will not be alignable.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 2 of 8 will be half word alignable.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 1 of 8 will be word alignable.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 1 of 8 will be long word alignable.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * But, tests on running kernels show that src and dst to copy code
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * are typically not on random alignments. Structure copies and
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * copies of larger data sizes are often on long word boundaries.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * So we test the long word alignment case first, then
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * the byte alignment, then halfword, then word alignment.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Several times, tests for length are made to split the code
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * into subcases. These tests often allow later tests to be
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * avoided. For example, within the non-FPBLK copy, we first
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * check for tiny copies of 3 bytes or less. That allows us
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to use a 4-way unrolled loop for the general byte copy case
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * without a test on loop entry.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * vs longer cases. For the really short case, we don't attempt
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to align src and dst. We try to minimize special case tests in
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * the shortest loops as each test adds a significant percentage
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to the total time.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * For the medium sized cases, we allow ourselves to adjust the
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * src and dst alignment and provide special cases for each of
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * the four adjusted alignment cases. The CHKSIZE that was used
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to decide between short and medium size was chosen to be 39
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * as that allows for the worst case of 7 bytes of alignment
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * shift and 4 times 8 bytes for the first long word unrolling.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * That knowledge saves an initial test for length on entry into
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * the medium cases. If the general loop unrolling factor were
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to be increased, this number would also need to be adjusted.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * For all cases in the non-FPBLK code where it is known that at
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * least 4 chunks of data are available for movement, the
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * loop is unrolled by four. This 4-way loop runs in 8 clocks
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * or 2 clocks per data element.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Instruction alignment is forced by use of .align 16 directives
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * and nops which are not executed in the code. This
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * combination of operations shifts the alignment of following
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * loops to ensure that loops are aligned so that their instructions
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * fall within the minimum number of 4 instruction fetch groups.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * If instructions are inserted or removed between the .align
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * instruction and the unrolled loops, then the alignment needs
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to be readjusted. Misaligned loops can add a clock per loop
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * iteration to the loop timing.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * In a few cases, code is duplicated to avoid a branch. Since
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * a non-predicted tst and branch takes 10 clocks, this savings
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * is judged an appropriate time-space tradeoff.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Within the FPBLK-code, the prefetch method in the inner
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * loop needs to be explained as it is not standard. Two
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * prefetches are issued for each cache line instead of one.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The primary one is at the maximum reach of 8 cache lines.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Most of the time, that maximum prefetch reach gives the
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * cache line more time to reach the processor for systems with
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * higher processor clocks. But, sometimes memory interference
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * can cause that prefetch to be dropped. Putting a second
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * prefetch at a reach of 5 cache lines catches the drops
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * three iterations later and shows a measured improvement
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * in performance over any similar loop with a single prefetch.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The prefetches are placed in the loop so they overlap with
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * non-memory instructions, so that there is no extra cost
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * when the data is already in-cache.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Notes on preserving existing fp state and on membars.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * When a copyOP decides to use fp we may have to preserve existing
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * floating point state. It is not the caller's state that we need to
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * preserve - the rest of the kernel does not use fp and, anyway, fp
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * registers are volatile across a call. Some examples:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * - userland has fp state and is interrupted (device interrupt
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * or trap) and within the interrupt/trap handling we use bcopy
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * - another (higher level) interrupt or trap handler uses bcopy
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * while a bcopy from an earlier interrupt is still active
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * - an asynchronous error trap occurs while fp state exists (in
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * userland or in kernel copy) and the tl0 component of the handling
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * uses bcopy
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * - a user process with fp state incurs a copy-on-write fault and
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * hwblkpagecopy always uses fp
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * We therefore need a per-call place in which to preserve fp state -
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * using our stack is ideal (and since fp copy cannot be leaf optimized
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * because of calls it makes, this is no hardship).
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * When we have finished fp copy (with its repeated block stores)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * we must membar #Sync so that our block stores may complete before
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * we either restore the original fp state into the fp registers or
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * return to a caller which may initiate other fp operations that could
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * modify the fp regs we used before the block stores complete.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Synchronous faults (eg, unresolvable DMMU miss) that occur while
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * t_lofault is not NULL will not panic but will instead trampoline
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to the registered lofault handler. There is no need for any
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * membars for these - eg, our store to t_lofault will always be visible to
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * ourselves and it is our cpu which will take any trap.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * while t_lofault is not NULL will also not panic. Since we're copying
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to or from userland the extent of the damage is known - the destination
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * buffer is incomplete. So trap handlers will trampoline to the lofault
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * handler in this case which should take some form of error action to
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * avoid using the incomplete buffer. The trap handler also flags the
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * fault so that later return-from-trap handling (for the trap that brought
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * this thread into the kernel in the first place) can notify the process
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * and reboot the system (or restart the service with Greenline/Contracts).
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Asynchronous faults (eg, uncorrectable ECC error from memory) can
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * result in deferred error traps - the trap is taken sometime after
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * the event and the trap PC may not be the PC of the faulting access.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Delivery of such pending traps can be forced by a membar #Sync, acting
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * as an "error barrier" in this role. To accurately apply the user/kernel
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * separation described in the preceding paragraph we must force delivery
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * of deferred traps affecting kernel state before we install a lofault
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * handler (if we interpose a new lofault handler on an existing one there
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * is no need to repeat this), and we must force delivery of deferred
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * errors affecting the lofault-protected region before we clear t_lofault.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Failure to do so results in lost kernel state being interpreted as
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * affecting a copyin/copyout only, or of an error that really only
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * affects copy data being interpreted as losing kernel state.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Since the copy operations may preserve and later restore floating
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * point state that does not belong to the caller (see examples above),
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * we must be careful in how we do this in order to prevent corruption
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * of another program.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * To make sure that floating point state is always saved and restored
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * correctly, the following "big rules" must be followed when the floating
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * point registers will be used:
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 1. %l6 always holds the caller's lofault handler. Also in this register,
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * lofault handler was set coming in.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 2. The FPUSED flag indicates that all FP state has been successfully stored
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * on the stack. It should not be set until this save has been completed.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 3. The FPUSED flag should not be cleared on exit until all FP state has
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * been restored from the stack. If an error occurs while restoring
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * data from the stack, the error handler can check this flag to see if
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * a restore is necessary.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * 4. Code run under the new lofault handler must be kept to a minimum. In
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * particular, any calls to FP_ALLOWMIGRATE, which could result in a call
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to kpreempt(), should not be made until after the lofault handler has
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * been restored.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to "break even" using FP/VIS-accelerated memory operations.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * The FPBLK code assumes a minimum number of bytes are available
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * to be moved on entry. Check that code carefully before
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * reducing VIS_COPY_THRESHOLD below 256.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * This shadows sys/machsystm.h which can't be included due to the lack of
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * _ASM guards in include files it references. Change it here, change it there.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * TEST for very short copies
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Be aware that the maximum unroll for the short unaligned case
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * is SHORTCOPY+1
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Indicates that we're to trampoline to the error handler.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Number of outstanding prefetches.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * first prefetch moves data from L2 to L1 (n_reads)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * second prefetch moves data from memory to L2 (one_read)
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * Size of stack frame in order to accommodate a 64-byte aligned
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * floating-point register save area and 2 64-bit temp locations.
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * All copy functions use two quadrants of fp registers; to ensure a
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * block-aligned, two-block buffer in which to save we must reserve
fcf3ce441efd61da9bb2884968af01cb7c1452ccJohn Forte * three blocks on stack. Not all functions preserve %fprs on stack
#define FZEROQ1Q3 \
#define FZEROQ2Q4 \
#if !defined(lint)
nop ;\
nop ;\
label1: ;\
nop ;\
nop ;\
label1: ;\
nop ;\
#if defined(lint)
.kcopy_2:
.kcopy_4:
.kcopy_8:
.copyerr:
.copyerr2:
#if defined(lint)
.bcopy_2:
.bcopy_4:
.bcopy_8:
.bc_med:
! returning.
.do_copy:
.bcb_exit:
#if defined(lint)
retl ! return
.ov_fwd:
retl ! return
.ov_bkwd:
retl ! return
#ifdef lint
* Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
* Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
#if defined(lint)
#if defined(lint)
.co_med:
#ifdef lint
#ifdef lint
* Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
#if defined(lint)
.copyin_2:
.copyin_4:
.copyin_8:
.ci_med:
#ifdef lint
#ifdef lint
#if defined(lint)
#if defined(lint)
#ifdef lint
.pz_zinst:
#ifdef lint
#if defined(lint)
.word 0
.word 0
.word 0
.word 0