common/gen/strlen.s

	strlen.s revision b1593d50e783f7d66722dde093752b74ffa95176
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"strlen.s"

/*
 * strlen(s)
 *
 * Given string s, return length (not including the terminating null).
 *
 * Fast assembler language version of the following C-program strlen
 * which represents the `standard' for the C-library.
 *
 *	size_t
 *	strlen(s)
 *	register const char *s;
 *	{
 *		register const char *s0 = s + 1;
 *
 *		while (*s++ != '\0')
 *			;
 *		return (s - s0);
 *	}
 */

#include <sys/asm_linkage.h>

	/*
	 * There are two key optimizations in the routine below.
	 * First, all memory accesses are 8 bytes wide.  The time
	 * for long strings is dominated by the latency of load
	 * instructions in the inner loop, and going 8 bytes at
	 * a time means 1/8th as much latency.
	 *
	 * Scanning an 8 byte word for a '\0' is made fast by
	 * this formula (due to Alan Mycroft):
	 *     ~x & 0x8080808080808080 & (x - 0x0101010101010101)
	 * The result of this formula is non-zero iff there's
	 * a '\0' somewhere in x.
	 *
	 * Second, the cost of short strings is dominated by the
	 * cost of figuring out which byte out of the last 8
	 * contained the '\0' that terminated the string.  We use
	 * properties of the formula above to convert scanning the
	 * word for '\0' into a single LZD instruction.
	 */
	/*
	 * size_t strlen(const char *s)
	 *
	 * Leaf routine: returns with retl and never executes a save.
	 *   In:       %o0 = s
	 *   Out:      %o0 = number of bytes before the terminating '\0'
	 *   Modifies: %o1-%o5, %g1 and the integer condition codes
	 *
	 * Every load is 8 bytes wide and uses an 8-byte aligned address
	 * (s rounded down to a multiple of 8, then advanced in steps of 8).
	 * The routine may therefore read bytes located before s or after
	 * the '\0', but only inside the aligned xwords that hold part of
	 * the string.  The first character of an xword is taken to be its
	 * most significant byte (this is what the final LZD step relies on).
	 */
	.align	64
	.skip	4*4	! force .strlen_findnull to align to 64 bytes:
			! 16 bytes skipped here + 12 setup instructions (48 bytes)
			! (assumes ENTRY_NP adds no padding of its own - TODO confirm)
	ENTRY_NP(strlen)
	and	%o0, 7, %o3			! off = addr & 7
	sethi	%hi(0x01010101), %o4		! 0x01010000

	sub	%g0, %o3, %o2			! count = -off
	or	%o4, %lo(0x01010101), %o4	! 0x01010101

	ldx	[%o0 + %o2], %o1		! val = *(addr + count), the aligned xword holding s[0]
	sllx	%o4, 32, %o5			! 0x01010101 << 32

	mov	-1, %g1				! mask = -1 (all ones)
	sllx	%o3, 3, %o3			! shift = off * 8

	or	%o4, %o5, %o4			! 0x0101010101010101
	srlx	%g1, %o3, %g1			! mask = -1 >> shift: zero in the off bytes before s

	sllx	%o4, 7, %o5			! 0x8080808080808080
	orn	%o1, %g1, %o1			! val |= ~mask: bytes before s become 0xff, never '\0'
.strlen_findnull:
	!! Loop: test one xword per iteration for a '\0' byte.
	!! %o0 - base address
	!! %o1 - xword from memory
	!! %o2 - index (byte offset from %o0 of the xword in %o1)
	!! %o3 - result of test for '\0'
	!! %o4 - constant 0x0101.0101.0101.0101
	!! %o5 - constant 0x8080.8080.8080.8080
	!! %g1 - scratch
	andn	%o5, %o1, %o3		! ~val & 0x80
	sub	%o1, %o4, %g1		! val - 0x01
	andcc	%o3, %g1, %o3		! ~val & 0x80 & (val - 0x01)
	inc	8, %o2			! index of next xword; leaves condition codes alone
	bz,a,pt	%xcc, .strlen_findnull	! zero result: no '\0' in this xword, keep going
	  ldx	[%o0 + %o2], %o1	! delay slot, annulled on fall-through: val = next xword

	/*
	 * The result of Mycroft's formula is a pattern of 0x80 and
	 * 0x00 bytes.  There's a 0x80 at every byte position where
	 * there was a '\0' character, but a string of 0x01 bytes
	 * immediately preceding a '\0' becomes a corresponding
	 * string of 0x80 bytes.  (e.g. 0x0101010101010100 becomes
	 * 0x8080808080808080).  We need one final step to discount
	 * any leading 0x01 bytes, and then LZD can tell us how many
	 * characters there were before the terminating '\0'.
	 */
	!! %o1 - last data word
	!! %o2 - offset from s of the last data word, plus 8
	!!       (equals length + 8 - position of '\0' within that word)
	!! %o3 - xword with 0x80 for each 0x00 byte and leading 0x01
	sub	%o2, 8, %o2		! back up to the offset of the last data word
	srlx	%o3, 7, %o3		! shift 0x80 -> 0x01
	andn	%o3, %o1, %o3		! mask off leading 0x01 bytes (bit 0 is set in the data)
	lzd	%o3, %o3		! 7, 15, ... 63
	srlx	%o3, 3, %o3		! 0 ... 7

	retl
	add	%o2, %o3, %o0		! delay slot: add back bytes before '\0'

	SET_SIZE(strlen)