/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009, Intel Corporation
* All rights reserved.
*/
/*
* strlen - calculate the length of string
*/
#include "SYS.h"
#include "proc64_id.h"
/*
* LABEL(s) builds a label name private to this routine by prefixing s
* with ".strlen".
* NOTE(review): the empty comment between the prefix and s only joins the
* two pieces if the preprocessor removes it without inserting whitespace
* (traditional cpp behavior); confirm against the build flags.
*/
#define LABEL(s) .strlen/**/s
/*
* This implementation uses SSE instructions to compare up to 16 bytes
* at a time looking for the end of string (null char).
*/
/*
* strlen(s): return the number of bytes in s before the first null byte.
*
* In:       %rdi = s
* Out:      %rax = length
* Modifies: %rcx, %rdx, %rsi, %rdi, %xmm0 and the flags
*
* Every memory read is a 16-byte compare at a 16-byte aligned address, so
* bytes before s (unaligned case) and bytes after the terminating null
* inside the last 16-byte chunk are read as well.
*/
ENTRY(strlen) /* (const char *s) */
mov %rdi, %rsi /* %rsi = scan pointer; %rdi keeps the original s */
mov %rsi, %rcx
pxor %xmm0, %xmm0 /* 16 null chars */
and $15, %rcx /* %rcx = s mod 16, the misalignment of s */
jz LABEL(align16_loop) /* string is 16 byte aligned */
/*
* Unaligned case. Round down to 16-byte boundary before comparing
* 16 bytes for a null char. The code then compensates for any extra chars
* preceding the start of the string.
*/
LABEL(unalign16):
and $0xfffffffffffffff0, %rsi /* %rsi = s rounded down to 16 bytes */
pcmpeqb (%rsi), %xmm0 /* byte i of %xmm0 = 0xff where chunk byte i is null */
lea 16(%rdi), %rsi /* %rsi = s + 16, so the exit code computes offset 0 */
pmovmskb %xmm0, %edx /* bit i of %edx set when chunk byte i is null */
shr %cl, %edx /* Compensate for bytes preceding the string */
test %edx, %edx /* after the shift, bit 0 corresponds to s[0] */
jnz LABEL(exit)
sub %rcx, %rsi /* no null, adjust to next 16-byte boundary */
pxor %xmm0, %xmm0 /* clear xmm0, may have been changed... */
/*
* Main loop, unrolled four times. Each step compares one aligned 16-byte
* chunk against %xmm0 and advances %rsi by 16 before testing the mask, so
* on the way to exit %rsi points 16 bytes past the chunk holding the null.
* %xmm0 is all zero at every compare: a compare that finds no null byte
* leaves every byte of %xmm0 at 0x00, and that is the only way to fall
* through to the next step.
*/
.p2align 4
LABEL(align16_loop): /* 16 byte aligned */
pcmpeqb (%rsi), %xmm0 /* look for null bytes */
pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */
add $16, %rsi /* prepare to search next 16 bytes */
test %edx, %edx /* if no null byte, %edx must be 0 */
jnz LABEL(exit) /* found a null */
pcmpeqb (%rsi), %xmm0 /* second chunk of this iteration */
pmovmskb %xmm0, %edx
add $16, %rsi
test %edx, %edx
jnz LABEL(exit)
pcmpeqb (%rsi), %xmm0 /* third chunk of this iteration */
pmovmskb %xmm0, %edx
add $16, %rsi
test %edx, %edx
jnz LABEL(exit)
pcmpeqb (%rsi), %xmm0 /* fourth chunk of this iteration */
pmovmskb %xmm0, %edx
add $16, %rsi
test %edx, %edx
jz LABEL(align16_loop) /* no null in these 64 bytes, keep scanning */
/*
* A null byte was found. Here %edx is non-zero, its lowest set bit is the
* index of the first null within the chunk (relative to s for the first
* unaligned chunk), and %rsi is 16 past the base that index is relative to.
*/
.p2align 4
LABEL(exit):
neg %rdi /* %rdi = -s, so the lea below yields %rsi - 16 - s */
/*
* Check to see if BSF is fast on this processor. If not, use a different
* exit tail to find first bit set indicating null byte match.
* NOTE(review): USE_BSF and .memops_method are not defined in this file;
* presumably they come from proc64_id.h - confirm.
*/
testl $USE_BSF, .memops_method(%rip)
jz LABEL(AMD_exit) /* USE_BSF bit clear: take the tail without bsf */
lea -16(%rdi, %rsi), %rax /* calculate exact offset */
bsf %edx, %ecx /* Least significant 1 bit is index of null */
lea (%rax, %rcx),%rax /* length = chunk offset + index of null */
ret
/*
* This exit tail does not use the bsf instruction.
* It tests the mask bits one at a time, lowest first: %dl covers chunk
* bytes 0-7 and %dh covers bytes 8-15.
*/
.p2align 4
LABEL(AMD_exit):
lea -16(%rdi, %rsi), %rax /* %rax = offset of the chunk base from s */
test %dl, %dl
jz LABEL(exit_high) /* no null in bytes 0-7, look at bytes 8-15 */
test $0x01, %dl
jnz LABEL(exit_tail0)
test $0x02, %dl
jnz LABEL(exit_tail1)
/*
* NOTE(review): this alignment directive sits in the middle of the
* fall-through test chain, so any padding it emits is executed; confirm
* that this placement is intended.
*/
.p2align 4
test $0x04, %dl
jnz LABEL(exit_tail2)
test $0x08, %dl
jnz LABEL(exit_tail3)
test $0x10, %dl
jnz LABEL(exit_tail4)
test $0x20, %dl
jnz LABEL(exit_tail5)
test $0x40, %dl
jnz LABEL(exit_tail6)
add $7, %rax /* %dl is non-zero and bits 0-6 are clear, so bit 7 */
ret
/*
* Null is in bytes 8-15 of the chunk: bias %rax by 8, then reuse the same
* exit_tail labels with the bits of %dh.
*/
.p2align 4
LABEL(exit_high):
add $8, %rax
test $0x01, %dh
jnz LABEL(exit_tail0)
test $0x02, %dh
jnz LABEL(exit_tail1)
test $0x04, %dh
jnz LABEL(exit_tail2)
test $0x08, %dh
jnz LABEL(exit_tail3)
test $0x10, %dh
jnz LABEL(exit_tail4)
test $0x20, %dh
jnz LABEL(exit_tail5)
test $0x40, %dh
jnz LABEL(exit_tail6)
add $7, %rax /* bits 0-6 of %dh are clear, so the null is at byte 15 */
ret
/*
* exit_tailN: the null is N bytes past the offset already in %rax.
*/
.p2align 4
LABEL(exit_tail0):
/*
* NOTE(review): %rax already holds the result here; clearing %ecx does not
* change the value returned in %rax. Confirm whether this instruction is
* still needed.
*/
xor %ecx, %ecx
ret
.p2align 4
LABEL(exit_tail1):
add $1, %rax
ret
.p2align 4
LABEL(exit_tail2):
add $2, %rax
ret
.p2align 4
LABEL(exit_tail3):
add $3, %rax
ret
.p2align 4
LABEL(exit_tail4):
add $4, %rax
ret
.p2align 4
LABEL(exit_tail5):
add $5, %rax
ret
.p2align 4
LABEL(exit_tail6):
add $6, %rax
ret
SET_SIZE(strlen)