/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memset.s"
/*
 * This version of memset is tuned for SPARC-T4 and other processors
 * which share the S3 SPARC architecture core.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memset(void *sp1, int c, size_t n)
 *	{
 *	    if (n != 0) {
 *		char *sp = sp1;
 *		do {
 *		    *sp++ = (char)c;
 *		} while (--n != 0);
 *	    }
 *	    return (sp1);
 *	}
 *
 * Flow :
 *
 *	For small stores of 6 or fewer bytes, the bytes are stored one at a time.
 *
 *	For less than 32 bytes stores, align the address on 4 byte boundary.
 *	Then store as many 4-byte chunks, followed by trailing bytes.
 *
 *	For sizes greater than 32 bytes, align the address on 8 byte boundary.
 *	if (count > 64) {
 *	    store 8-bytes chunks to align the address on 64 byte boundary
 *	    if (count > 1664 ) {
 *		if(value to be set is zero) {
 *		    Using BIS stores, set the first long word of each 32-byte
 *		    cache line to zero which will also clear the other
 *		    three long words of the cache line.
 *		} else {
 *		    Using BIS stores, set the first long word of each of
 *		    44 cache lines (32 bytes each) before the main loop
 *		    is entered.
 *		    In the main loop, continue pre-setting the first long
 *		    word of each cache line 44 lines in advance while setting
 *		    the other three long words (24 bytes)of each cache line
 *		    until fewer than 1536 bytes remain. Then set the remaining
 *		    three long words of each cache line that has already had
 *		    its first long word set.
 *		}
 *	     store remaining data in 64-byte chunks until less than
 *	     64 bytes remain.
 *	}
 *	Store as many 8-byte chunks, followed by trailing bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline. That avoids various pipeline delays,
 *   such as filling the miss buffer. The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memset of greater than 1664 bytes because a sequence
 * of BIS stores must be followed by a membar #StoreStore. The benefit of
 * the BIS store must be balanced against the cost of the membar operation.
 */

#include <sys/asm_linkage.h>
#include <sys/niagaraasi.h>
#include <sys/asi.h>

#define	ASI_PNF		0x82    /* primary no fault */
#define	ASI_BLK_P	0xF0    /* block primary */
#define	ASI_BLK_INIT_ST_QUAD_LDD_P	0xE2

/* documented name for primary block initializing store */
#define	ASI_STBI_P	ASI_BLK_INIT_ST_QUAD_LDD_P
#define	ASI_STBIMRU_P	0xF2
/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last cache line
 */

	ANSI_PRAGMA_WEAK(memset,function)

	/*
	 * memset(sp1, c, n)
	 *
	 * On entry:	%o0 = sp1, %o1 = c, %o2 = n
	 * On return:	%o0 = sp1 (%o0 is never written by this routine)
	 *
	 * Register roles inside the routine:
	 *	%o1	fill pattern; (char)c, replicated to 4 and then to
	 *		8 bytes as the wider store paths are selected
	 *	%o2	bytes still to be set
	 *	%o3	scratch: a negative counter that counts up to zero in
	 *		the alignment loops, or the residual byte count left
	 *		after the 64-byte block stores
	 *	%o4	number of bytes to be set as whole 64-byte blocks
	 *	%o5	working destination pointer (copy of sp1)
	 *
	 * No branch in this routine is annulled, so the instruction that
	 * follows each branch (its delay slot) executes whether or not the
	 * branch is taken.  Several delay slots below do useful setup work
	 * for the branch target and/or the fall-through path.
	 */
	ENTRY(memset)

	mov	%o0, %o5		! copy sp1 before using it
	cmp	%o2, 7			! if small counts, just write bytes
	blu,pn	%ncc, .wrchar		! n < 7: byte-at-a-time path
	and	%o1, 0xff, %o1		! o1 is (char)c

	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		! now o1 has 2 bytes of c
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%ncc, .wdalign		! n < 32: 4-byte word path
	or	%o1, %o3, %o1		! now o1 has 4 bytes of c

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		! now o1 has 8 bytes of c

	! n >= 32: store single bytes until %o5 is 8-byte aligned
.dbalign:
	andcc	%o5, 7, %o3		! is sp1 aligned on a 8 byte bound
	bz,pt	%ncc, .blkalign		! already long word aligned
	sub	%o3, 8, %o3		! -(bytes till long word aligned)

	add	%o2, %o3, %o2		! update o2 with new count
	! Set -(%o3) bytes till sp1 long word aligned
1:	stb	%o1, [%o5]		! there is at least 1 byte to set
	inccc	%o3			! count up toward zero, set cond codes
	bl,pt	%ncc, 1b		! loop while the counter is negative
	inc	%o5 			! (delay slot) advance to next byte

	! Now sp1 is long word aligned (sp1 is found in %o5)
.blkalign:
	cmp	%o2, 64			! check if there are 64 bytes to set
	blu,pn	%ncc, .wrshort		! fewer than 64: long words only
	mov	%o2, %o3		! (delay slot) .wrshort wants count in o3

	andcc	%o5, 63, %o3		! is sp1 block aligned?
	bz,pt	%ncc, .blkwr		! now block aligned
	sub	%o3, 64, %o3		! o3 is -(bytes till block aligned)
	add	%o2, %o3, %o2		! o2 is the remainder

	! Store -(%o3) bytes till dst is block (64 byte) aligned.
	! Use long word stores.
	! Recall that dst is already long word aligned
1:
	addcc	%o3, 8, %o3		! count up toward zero, set cond codes
	stx	%o1, [%o5]
	bl,pt	%ncc, 1b		! loop while the counter is negative
	add	%o5, 8, %o5		! (delay slot) advance to next long word

	! Now sp1 is block aligned
.blkwr:
	andn	%o2, 63, %o4		! calculate size of blocks in bytes
	cmp	%o4, 1664		! check there are enough bytes to set
	blu,pn	%ncc, 3f		! to justify cost of membar
					! must be > pre-cleared lines
	and	%o2, 63, %o3		! %o3 = bytes left after blk stores.
	brz,pn	%o1, .wrzero		! special case if fill pattern == 0
	mov	ASI_STBIMRU_P, %asi	! (delay slot) %asi = MRU block init
					! store; .wrzero overrides this


	! initial cache-clearing stores
	! get store pipeline moving
	!
	! One 8-byte store to the first long word of each of 44 consecutive
	! 32-byte lines: 11 groups of 128 bytes, offsets 0x000 through 0x560.
	! 44 * 32 = 1408 = 0x580 bytes are pre-set before the main loop.

	stxa	%o1, [%o5+0x00+0x000]%asi
	stxa	%o1, [%o5+0x40+0x000]%asi
	stxa	%o1, [%o5+0x20+0x000]%asi
	stxa	%o1, [%o5+0x60+0x000]%asi
	stxa	%o1, [%o5+0x00+0x080]%asi
	stxa	%o1, [%o5+0x40+0x080]%asi
	stxa	%o1, [%o5+0x20+0x080]%asi
	stxa	%o1, [%o5+0x60+0x080]%asi

	stxa	%o1, [%o5+0x00+0x100]%asi
	stxa	%o1, [%o5+0x40+0x100]%asi
	stxa	%o1, [%o5+0x20+0x100]%asi
	stxa	%o1, [%o5+0x60+0x100]%asi
	stxa	%o1, [%o5+0x00+0x180]%asi
	stxa	%o1, [%o5+0x40+0x180]%asi
	stxa	%o1, [%o5+0x20+0x180]%asi
	stxa	%o1, [%o5+0x60+0x180]%asi

	stxa	%o1, [%o5+0x00+0x200]%asi
	stxa	%o1, [%o5+0x40+0x200]%asi
	stxa	%o1, [%o5+0x20+0x200]%asi
	stxa	%o1, [%o5+0x60+0x200]%asi
	stxa	%o1, [%o5+0x00+0x280]%asi
	stxa	%o1, [%o5+0x40+0x280]%asi
	stxa	%o1, [%o5+0x20+0x280]%asi
	stxa	%o1, [%o5+0x60+0x280]%asi

	stxa	%o1, [%o5+0x00+0x300]%asi
	stxa	%o1, [%o5+0x40+0x300]%asi
	stxa	%o1, [%o5+0x20+0x300]%asi
	stxa	%o1, [%o5+0x60+0x300]%asi
	stxa	%o1, [%o5+0x00+0x380]%asi
	stxa	%o1, [%o5+0x40+0x380]%asi
	stxa	%o1, [%o5+0x20+0x380]%asi
	stxa	%o1, [%o5+0x60+0x380]%asi

	stxa	%o1, [%o5+0x00+0x400]%asi
	stxa	%o1, [%o5+0x40+0x400]%asi
	stxa	%o1, [%o5+0x20+0x400]%asi
	stxa	%o1, [%o5+0x60+0x400]%asi
	stxa	%o1, [%o5+0x00+0x480]%asi
	stxa	%o1, [%o5+0x40+0x480]%asi
	stxa	%o1, [%o5+0x20+0x480]%asi
	stxa	%o1, [%o5+0x60+0x480]%asi

	stxa	%o1, [%o5+0x00+0x500]%asi
	stxa	%o1, [%o5+0x40+0x500]%asi
	stxa	%o1, [%o5+0x20+0x500]%asi
	stxa	%o1, [%o5+0x60+0x500]%asi

!	Primary memset loop for large memsets
!	Continue with advance stores to keep store pipeline moving
!
!	Each pass handles 128 bytes (four 32-byte lines):
!	  - pre-sets the first long word of the four lines 0x580 bytes ahead
!	  - fills the remaining three long words of the four current lines,
!	    whose first long words were pre-set earlier
!	%o5 is biased by -8 inside the loop, so an operand of [%o5+K]
!	addresses byte K-8 of the current 128-byte group.  The bias lets
!	the two stores that use an immediate ASI (which take no offset)
!	land on bytes 0x38 and 0x78 after the two 64-byte pointer bumps.
	sub	%o5, 8, %o5		! adjust %o5 for immediate %asi value
2:
	stxa	%o1, [%o5+0x08+0x580]%asi	! advance stores, 0x580 ahead
	stxa	%o1, [%o5+0x48+0x580]%asi
	stxa	%o1, [%o5+0x28+0x580]%asi
	stxa	%o1, [%o5+0x68+0x580]%asi
	sub	%o4, 128, %o4		! 128 bytes completed by this pass

	stxa	%o1, [%o5+0x10]%asi	! second long word of each line
	stxa	%o1, [%o5+0x50]%asi
	stxa	%o1, [%o5+0x30]%asi
	stxa	%o1, [%o5+0x70]%asi

	! Condition codes set here are consumed by the bgu at the loop end;
	! none of the intervening stxa/add instructions modify them.
	cmp	%o4, 1536		! 1536 = 128+1408 (1408=0x580)
	stxa	%o1, [%o5+0x18]%asi	! third long word of each line
	stxa	%o1, [%o5+0x58]%asi
	stxa	%o1, [%o5+0x38]%asi
	stxa	%o1, [%o5+0x78]%asi

	stxa	%o1, [%o5+0x20]%asi	! fourth long word, lines 0 and 2
	stxa	%o1, [%o5+0x60]%asi
	! Adjust %o5 to allow zero offset, allowing immediate %asi value
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBI_P	! last long word of line 1 (byte 0x38)
	add	%o5, 64, %o5
	bgu,pt	%ncc, 2b		! loop while more than 1536 remain
	stxa	%o1, [%o5]ASI_STBI_P	! (delay slot) last long word of
					! line 3 (byte 0x78)

!	Clean up loop
!	All cache lines have had initial 8 byte store
!	Now just fill in remaining 24 bytes
!
!	Same body as the primary loop minus the advance stores.  %o5 still
!	carries the -8 bias.  The loop exits once 128 or fewer block bytes
!	remain; those are written by the plain 64-byte loop further below.
.cleanup:
	stxa	%o1, [%o5+0x10]%asi
	sub	%o4, 128, %o4		! 128 bytes completed by this pass
	stxa	%o1, [%o5+0x50]%asi
	stxa	%o1, [%o5+0x30]%asi
	stxa	%o1, [%o5+0x70]%asi
	cmp	%o4, 128		! cond codes consumed by bgu below
	stxa	%o1, [%o5+0x18]%asi
	stxa	%o1, [%o5+0x58]%asi
	stxa	%o1, [%o5+0x38]%asi
	stxa	%o1, [%o5+0x78]%asi

	stxa	%o1, [%o5+0x20]%asi
	stxa	%o1, [%o5+0x60]%asi
	! Adjust %o5 to allow zero offset, allowing immediate %asi value
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBI_P
	add	%o5, 64, %o5
	bgu,pt	%ncc, .cleanup		! loop while more than 128 remain
	stxa	%o1, [%o5]ASI_STBI_P	! (delay slot)

	ba	.bsi_done
	add	%o5, 8, %o5		! restore %o5 offset


!	Special case loop for zero fill memsets
!	blk store to first 8 bytes of 32 byte cache line clears all 32 bytes.
!	For each 64 byte cache line, first 32 bytes marked MRU
!	second 32 bytes marked LRU to allow L3 cache to replace it
!
!	Each pass covers 512 bytes: eight stores with the immediate MRU ASI
!	at 64-byte steps (bumping %o5 by 512 in total), then eight stores
!	through %asi at the +32 byte offset of each of those 64-byte blocks,
!	addressed backwards from the advanced %o5.
!	The loop exits once 512 or fewer block bytes remain; those are
!	written by the plain 64-byte loop further below.
.wrzero:
	mov	ASI_STBI_P, %asi	! %asi form is used for the +32 stores
.wrzero_loop:
	stxa	%o1, [%o5]ASI_STBIMRU_P
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBIMRU_P
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBIMRU_P
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBIMRU_P
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBIMRU_P
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBIMRU_P
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBIMRU_P
	add	%o5, 64, %o5
	stxa	%o1, [%o5]ASI_STBIMRU_P
	add	%o5, 64, %o5

	stxa	%o1, [%o5-0x1e0]%asi	! -0x1e0 = byte 32 of this 512 chunk
	stxa	%o1, [%o5-0x1a0]%asi
	sub	%o4, 512, %o4		! 512 bytes completed by this pass
	stxa	%o1, [%o5-0x160]%asi
	stxa	%o1, [%o5-0x120]%asi
	cmp	%o4, 512		! cond codes consumed by bgu below
	stxa	%o1, [%o5-0xe0]%asi
	stxa	%o1, [%o5-0xa0]%asi
	stxa	%o1, [%o5-0x60]%asi
	bgu,pt	%ncc, .wrzero_loop	! loop while more than 512 remain
	stxa	%o1, [%o5-0x20]%asi	! (delay slot) byte 480 of the chunk

	! Both block init store paths (non-zero and zero fill) end here.
.bsi_done:
	membar	#StoreStore		! required by use of Block Store Init
	! NOTE(review): %asi is set to a fixed value here rather than
	! restored from a copy saved on entry; this assumes callers run
	! with %asi == ASI_PNF.  Confirm against the platform ABI.
	mov	ASI_PNF, %asi		! restore %asi to default
					! ASI_PRIMARY_NOFAULT value
	! Plain 64-byte block loop.  Reached directly (label 3) when there
	! were too few block bytes for the block init store paths, or by
	! fall-through to finish what those loops left over.
	! %o4 is a multiple of 64 here and may be zero.
3:
	cmp	%o4, 64			! check if 64 bytes to set
	blu	%ncc, 5f
	nop
4:					! set final blocks of 64 bytes
	stx	%o1, [%o5+0x00]
	stx	%o1, [%o5+0x20]
	stx	%o1, [%o5+0x08]
	stx	%o1, [%o5+0x10]
	stx	%o1, [%o5+0x18]
	subcc	%o4, 64, %o4		! cond codes consumed by bgu below
	stx	%o1, [%o5+0x28]
	add	%o5, 64, %o5		! bump early; last two stores use
	stx	%o1, [%o5-0x10]		! negative offsets (bytes 0x30, 0x38)
	bgu,pt	%ncc, 4b		! loop while block bytes remain
	stx	%o1, [%o5-0x08]		! (delay slot)
5:
	! Set the remaining long words
	! On entry %o3 = bytes available for long word stores (less than 64)
	! and %o2 = total bytes remaining; %o2 is reduced to the final
	! byte tail (0-7) in the delay slot below.
.wrshort:
	subcc	%o3, 8, %o3		! Can we store any long words?
	blu,pn	%ncc, .wrchars		! fewer than 8 bytes: none
	and	%o2, 7, %o2		! calc bytes left after long words
6:
	subcc	%o3, 8, %o3		! is there another long word after this
	stx	%o1, [%o5]		! store the long words
	bgeu,pt	%ncc, 6b		! loop until the subtract borrows
	add	%o5, 8, %o5		! (delay slot) advance to next long word

.wrchars:				! check for extra chars
	cmp	%o2, 0
	bne	%ncc, 7f		! tail bytes remain: shared byte loop
	nop
	retl				! nothing left; %o0 still holds sp1
	nop

	! 7 <= n < 32: store single bytes until %o5 is 4-byte aligned.
	! %o1 holds 4 copies of c on this path.
.wdalign:			
	andcc	%o5, 3, %o3		! is sp1 aligned on a word boundary
	bz,pn	%ncc, .wrword
	andn	%o2, 3, %o3		! create word sized count in %o3

	dec	%o2			! decrement count
	stb	%o1, [%o5]		! clear a byte
	b	.wdalign
	inc	%o5			! next byte

	! %o3 = byte count rounded down to a multiple of 4.  It is at least
	! 4 here because n >= 7 and at most 3 bytes were used for alignment.
.wrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		! 4-byte writing loop
	bnz,pt	%ncc, .wrword		! loop until the word count hits zero
	add	%o5, 4, %o5		! (delay slot) advance to next word

	and	%o2, 3, %o2		! leftover count, if any

.wrchar:
	! Set the remaining bytes, if any
	cmp	%o2, 0
	be	%ncc, .exit
	nop
	! Byte tail loop, shared with .wrchars above; %o2 is non-zero here
7:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%ncc, 7b		! loop until the count reaches zero
	inc	%o5			! (delay slot) advance to next byte
.exit:
	retl				! %o0 was preserved
	nop

	SET_SIZE(memset)
