ip_ocsum.s revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/asm_linkage.h>
#include <sys/machthread.h>
#include <sys/machparam.h>
#if defined(lint)
#else /* lint */
#include "assym.h"
#endif /* lint */
/*
* Prefetch considerations
*
* We prefetch one cacheline ahead. This may not be enough on Serengeti
* systems - see default_copyout() etc which prefetch 5 lines ahead.
* On the other hand, we expect most of the source buffers to be
* recently used enough to be cached.
*
* On US-I the prefetches are inoperative. On US-II they preload the E$;
* the mainloop unrolling and load-buffer should cover loads from E$.
* The stores appear to be the slow point on US-II.
*
* On US-IIICu the prefetch preloads the L2$ too, but there is no load
* buffer so the loads will stall for D$ miss, L2$ hit. The hardware
* auto-prefetch is not activated by integer loads. No solution
* in sight for this, barring odd games with FP read, write, integer read.
*
* US-IV (Panther) appears similar to US-IIICu, except that a strong
* variant of prefetch is available which can take TLB traps. We don't
* use this. The h/w prefetch stride can be set to 64, 128 or 192,
* and they only reach to the L2$ (we don't use these either).
* L2$ load-to-use latency is 15 cycles (best).
*/
/*
* ip_ocsum(address, halfword_count, sum)
* Do a 16 bit one's complement sum of a given number of (16-bit)
* halfwords. The halfword pointer must not be odd.
* %o0 address; %o1 count; %o2 sum accumulator; %o4 temp
* %g2 and %g3 used in main loop
*
* (from @(#)ocsum.s 1.3 89/02/24 SMI)
*
*/
#if defined(lint)
/*
 * Lint-only C stub for ip_ocsum(); the real implementation is the
 * assembly under the #else branch. The stub exists only so that lint
 * sees a definition with the documented argument list
 * (address, halfword_count, sum) and an unsigned int return value.
 * It always returns 0 and never touches its arguments.
 *
 * NOTE(review): the parameter types are inferred from the header comment
 * (pointer to 16-bit halfwords, a count, a sum accumulator) - confirm
 * they match the prototype used by callers.
 */
/* ARGSUSED */
unsigned int
ip_ocsum(unsigned short *address, int halfword_count, unsigned int sum)
{ return (0); }
#else /* lint */
/*
* On ttcp transmits, called once per ocsum_copyin but with a small block.
* On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
* and tx acks?
*
* To do: telnet and nfs traffic
*
* On an NCA'd webserver about 10% of the calls are >64 bytes
* about 10% of those start on a 64byte boundary
* about 30% are >5*64 bytes.
* The NCA numbers & proportions don't change with h/w cksum on.
*
* Tx hdrs are likely to be already in cache.
* Rx hdrs depends if already inspected.
*/
!
! Entry point for checksum-only.
!
! In:
!	%o0	buffer address (must not be odd)
!	%o1	count of 16-bit halfwords
!	%o2	incoming sum
! Out:
!	%o0	folded sum
! Scratch:
!	%o3, %o4, %o5, %g2
! Masks, set up here and left in place for ip_ocsum_long:
!	%g1	32-bit mask
!	%g4	16-bit mask
!	%g5	64-bit mask (all 1s)
!
! Blocks of 32 or more halfwords branch to ip_ocsum_long with %o5
! holding the buffer address rounded down to a dword boundary.
! Shorter blocks are summed here as 32-bit halves into the 64-bit
! accumulator %o2: a masked leading dword, then whole dwords, then a
! masked trailing dword. Blocks under 4 halfwords are summed one
! halfword at a time at .tiny. All paths finish at .short_fold.
!
	ENTRY(ip_ocsum)
	not	%g0, %g5		! %g5 = all 1s
	prefetch [%o0], #n_reads	! first hword, dword, cacheline
	clruw	%g5, %g1		! %g1 = 32 1s at low end
	srl	%g5, 16, %g4		! %g4 = 16 1s at low end

	cmp	%o1, 32			! at least a cacheline (64 bytes)?
	bge,pn	%icc, ip_ocsum_long	! yes, do the whole works
	andn	%o0, 7, %o5		! delay: base src addr

	cmp	%o1, 4			! < 4 halfwords?
	bl,pn	%icc, .tiny		! < 4 halfwords, just do them
	inc	8, %o5			! delay: next addr (no matter for .tiny)

	/* leading dword with 1-4 hwords: 9 clocks */
	/* Assumes ok to read the entire dword with the leading hwords */

	ldx	[%o5-8], %o3		! NB base addr
	sub	%o5, %o0, %g2		! byte count: 2/4/6/8
	mov	%o5, %o0		! src now at the next dword boundary

	sll	%g2, 2, %g2		! 8/16/24/32 for mask

	! The mask needs a shift of 16/32/48/64 bits. A register shift
	! count only uses its low 6 bits, so 64 cannot be done in one
	! step; shift by half the amount twice instead.
	sllx	%g5, %g2, %o5
	sllx	%o5, %g2, %o5		! mask: 16/32/48/64 0s at low end
	srl	%g2, 3, %g2		! hw count: 1/2/3/4
	andn	%o3, %o5, %o3		! select hw in src (drop bytes before buf)

	srlx	%o3, 32, %o4		! hi32
	b	9f
	sub	%o1, %g2, %o1		! delay: decr count, 1-4 halfwords

.short_dw:				! max 7 iters of 4 clocks; 1 mispred of 4
	ldx	[%o0], %o3		! tmp64 = *src++ (groups with the branch)

	inc	8, %o0			! (D-cache load-use delay)
	dec	4, %o1			! decrement count, 4 halfwords

	srlx	%o3, 32, %o4		! hi32
9:	and	%o3, %g1, %o3		! lo32

	add	%o4, %o2, %o2		! accumulator
	andncc	%o1, 3, %g0		! more than 3 hwords left?

	bnz,pt	%icc, .short_dw
	add	%o3, %o2, %o2		! delay: accumulator

.short_hw:				! trailing dw: 0-3 hwords
	tst	%o1			! 0 seems fairly common...
	bz,a	.short_fold
	srlx	%o2, 32, %o4		! delay: hi32
					! mispredict 4 + 7 clocks for 1-3
	! Assumes ok to read the entire dword holding the trailing hwords.
	ldx	[%o0], %o3
	sll	%o1, 4, %o1		! bitcount: 16/32/48

	srlx	%g5, %o1, %o5		! mask: 16/32/48 0s at high end
	andn	%o3, %o5, %o3		! select hw in src (drop bytes after buf)

	srlx	%o3, 32, %o4		! hi32
	and	%o3, %g1, %o3		! lo32

	add	%o4, %o2, %o2		! accumulator
	add	%o3, %o2, %o2		! accumulator

	! at this point the 64-bit accumulator
	! has the result that needs to be returned in 16-bits
	srlx	%o2, 32, %o4		! hi32
.short_fold:				! expects %o4 = hi32 of %o2 on entry
	and	%o2, %g1, %o2		! lo32
	add	%o4, %o2, %o2		! 33b

	srlx	%o2, 16, %o3		! hi17
	and	%o2, %g4, %o2		! lo16

	add	%o3, %o2, %o2		! 18b
	srlx	%o2, 16, %o3		! hi2

	and	%o2, %g4, %o2		! lo16
	retl				! return
	add	%o3, %o2, %o0		! delay: 16b result in %o0

.tiny:					! almost never: less than 4 halfwords total.
	tst	%o1
	bz,a	.short_fold		! nothing to add, just fold the sum
	srlx	%o2, 32, %o4		! delay: hi32

	lduh	[%o0], %o3		! tmp16 = *src++
1:
	inc	2, %o0
					! stall for D-cache
	add	%o3, %o2, %o2		! accumulator

	deccc	%o1			! decrement count
	bnz,a,pt %icc, 1b
	lduh	[%o0], %o3		! delay (annulled on exit): tmp16 = *src++

	! at this point the 64-bit accumulator
	! has the result that needs to be returned in 16-bits
	b	.short_fold
	srlx	%o2, 32, %o4		! delay: hi32

	SET_SIZE(ip_ocsum)		! 64-bit version
!
! ip_ocsum_long: large-block continuation of ip_ocsum.
!
! Reached by the branch in ip_ocsum when the halfword count is 32 or
! more. It takes a new register window, so the arguments that arrived
! in %o0-%o2/%o5 are addressed here as %i0-%i2/%i5.
!
! The mask registers %g1/%g4/%g5 are read but not initialised in this
! routine; it relies on them already holding the values listed below.
!
! NOTE(review): this copy of the routine looks incomplete. The target
! .mainsection, the numeric label 5 referenced by "ba 5b", the main
! loop, and any code after the labels 8:, 1: and .fold: are not present
! in the visible text, and there is no SET_SIZE(ip_ocsum_long). Confirm
! against the upstream file before relying on it.
!
ENTRY(ip_ocsum_long) ! 64-bit, large blocks
save %sp, -SA(MINFRAME), %sp ! get another window
!
! %i0 contains buffer address
! %i1 contains count of 16bit words
! %i2 contains sum
! %i4 contains the mainloop count
! %i5 comes in with the buffer address rounded down to the first dword
!
! %g1 32bit mask
! %g4 16bit mask
! %g5 64bit mask (all 1s)
! %g6 fetch-ahead offset for Ecache
!
! %l0-7,%o0-5,%g2-3 mainloop temporaries
!
!
! 1 clock overhead
btst 63, %i0 ! src 64-byte aligned?
bz,a,pt %icc, .mainsection ! aligned blocks are fairly common
andncc %i1, 31, %i4 ! at least 64 bytes for main loop?
! (the andncc above is an annulled delay slot: it only executes when
! the branch to .mainsection is taken)
! Leading dword, with 1-4 hwords: 9 clocks
! Assumes ok to read the entire dword with the leading bytes
ldx [%i5], %l0 ! NB base addr
inc 8, %i5 ! next addr
sub %i5, %i0, %l2 ! byte count: 2/4/6/8
mov %i5, %i0
sll %l2, 2, %l2 ! 8/16/24/32 for mask
sllx %g5, %l2, %l4
! NOTE(review): the mask built in %l4 is never applied to %l0 in the
! visible code, and %l2 still holds 8/16/24/32 when it is subtracted
! from the halfword count below, although that line is commented as
! "1-4 halfwords". Instructions appear to be missing here - confirm.
srlx %l0, 32, %o0 ! hi32
b 9f
sub %i1, %l2, %i1 ! decr count, 1-4 halfwords
! Do dwords until source is 64-byte aligned, 0-6 iterations
! 4 clocks per + 4 for 1 mispred = 16 clocks avg
.dw: ldx [%i0], %l0 ! tmp64 = *src++ (groups with the branch below)
inc 8, %i0 ! (Dcache load-use delay)
dec 4, %i1 ! decrement count, 4 halfwords
srlx %l0, 32, %o0 ! hi32
! 9: is the entry point for the leading dword, which arrives with
! %l0 = data and %o0 = its high 32 bits
9: and %l0, %g1, %l0 ! lo32
add %o0, %i2, %i2 ! accumulator
btst 63, %i0 ! src 64-byte aligned?
bnz,pt %icc, .dw
add %l0, %i2, %i2 ! accumulator
! At this point source address is 64 byte aligned
! NOTE(review): "5b" needs a label 5 defined earlier in the file; none
! is visible, and no delay-slot instruction follows this branch.
ba 5b
8:
1:
.fold:
! NOTE(review): this routine did a "save" on entry, but no "restore" is
! visible in the delay slot of this return - confirm against upstream.
ret ! return
#endif /* lint */