sparcv9/gen/ascii_strcasecmp.s

2N/A/*
2N/A * CDDL HEADER START
2N/A *
2N/A * The contents of this file are subject to the terms of the
2N/A * Common Development and Distribution License (the "License").
2N/A * You may not use this file except in compliance with the License.
2N/A *
2N/A * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
2N/A * or http://www.opensolaris.org/os/licensing.
2N/A * See the License for the specific language governing permissions
2N/A * and limitations under the License.
2N/A *
2N/A * When distributing Covered Code, include this CDDL HEADER in each
2N/A * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
2N/A * If applicable, add the following below this CDDL HEADER, with the
2N/A * fields enclosed by brackets "[]" replaced with your own identifying
2N/A * information: Portions Copyright [yyyy] [name of copyright owner]
2N/A *
2N/A * CDDL HEADER END
2N/A */
2N/A
2N/A/*
2N/A * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
2N/A */
2N/A
2N/A/*
2N/A * The ascii_strcasecmp() function is a case-insensitive version of strcmp().
2N/A * It assumes the ASCII character set and ignores differences in case
2N/A * when comparing lower and upper case characters. In other words, it
2N/A * behaves as if both strings had been converted to lower case using
2N/A * tolower() in the "C" locale on each byte, and the results had then
2N/A * been compared using strcmp().
2N/A *
2N/A * The assembly code below is an optimized version of the following C
2N/A * reference:
2N/A *
2N/A * static const char charmap[] = {
2N/A *  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
2N/A *  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
2N/A *  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
2N/A *  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
2N/A *  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
2N/A *  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
2N/A *  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
2N/A *  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
2N/A *  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
2N/A *  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
2N/A *  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
2N/A *  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
2N/A *  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
2N/A *  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
2N/A *  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
2N/A *  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
2N/A *  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
2N/A *  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
2N/A *  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
2N/A *  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
2N/A *  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
2N/A *  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
2N/A *  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
2N/A *  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
2N/A *  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
2N/A *  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
2N/A *  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
2N/A *  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
2N/A *  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
2N/A *  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
2N/A *  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
2N/A *  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
2N/A * };
2N/A *
2N/A * int
2N/A * ascii_strcasecmp(const char *s1, const char *s2)
2N/A * {
2N/A *  const unsigned char *cm = (const unsigned char *)charmap;
2N/A *  const unsigned char *us1 = (const unsigned char *)s1;
2N/A *  const unsigned char *us2 = (const unsigned char *)s2;
2N/A *
2N/A *  while (cm[*us1] == cm[*us2++])
2N/A *      if (*us1++ == '\0')
2N/A *          return (0);
2N/A *  return (cm[*us1] - cm[*(us2 - 1)]);
2N/A * }
2N/A *
2N/A * The following algorithm, from a 1987 news posting by Alan Mycroft, is
2N/A * used for finding null bytes in a word:
2N/A *
2N/A * #define has_null(word) ((word - 0x01010101) & (~word & 0x80808080))
2N/A *
2N/A * The following algorithm is used for a wordwise tolower() operation:
2N/A *
2N/A * unsigned int
2N/A * parallel_tolower (unsigned int x)
2N/A * {
2N/A *  unsigned int p;
2N/A *  unsigned int q;
2N/A *
2N/A *  unsigned int m1 = 0x80808080;
2N/A *  unsigned int m2 = 0x3f3f3f3f;
2N/A *  unsigned int m3 = 0x25252525;
2N/A *
2N/A *  q = x & ~m1;// newb = byte & 0x7F
2N/A *  p = q + m2; // newb > 0x40 --> MSB set
2N/A *  q = q + m3; // newb < 0x5B --> MSB clear
2N/A *  p = p & ~q; // newb > 0x40 && newb < 0x5B --> MSB set
2N/A *  q = m1 & ~x;//  byte < 0x80 --> 0x80
2N/A *  q = p & q;  // newb > 0x40 && newb < 0x5B && byte < 0x80 -> 0x80,else 0
2N/A *  q = q >> 2; // newb > 0x40 && newb < 0x5B && byte < 0x80 -> 0x20,else 0
2N/A *  return (x + q); // translate uppercase characters to lowercase
2N/A * }
2N/A *
2N/A * Both algorithms have been tested exhaustively for all possible 2^32 inputs.
2N/A */
2N/A
2N/A#include <sys/asm_linkage.h>
2N/A
2N/A    ! The first part of this algorithm walks through the beginning of
2N/A    ! both strings a byte at a time until s1 is aligned to a word
2N/A    ! boundary. During these steps, upper-case bytes are translated to
2N/A    ! lower case and each byte of s1 is compared against the
2N/A    ! corresponding byte of s2.
2N/A
2N/A    ENTRY(ascii_strcasecmp)
2N/A
2N/A    .align 32
2N/A
2N/A    ! int ascii_strcasecmp(const char *s1, const char *s2)
2N/A    !
2N/A    ! Register usage after the save:
2N/A    !   %i0         s1 on entry; afterwards scratch and the return value
2N/A    !   %i1         s2 pointer (rounded down to a word boundary on the
2N/A    !               unaligned path)
2N/A    !   %i2         offset s1 - s2, so that [%i1 + %i2] addresses s1
2N/A    !   %i3         s1 & 3, then s2 & 3, then the current word of s1
2N/A    !   %i4         0x80808080 (Mycroft's magic1, also m1 for tolower)
2N/A    !   %i5         0x01010101 (Mycroft's magic2)
2N/A    !   %g1         current byte / word of s2
2N/A    !   %l2, %l3    m2 = 0x3f3f3f3f, m3 = 0x25252525
2N/A    !   %l6, %l7    left / right shift counts used to realign s2
2N/A    !   %l0, %l1, %l4, %l5  temporaries
2N/A    !
2N/A    ! In the byte-wise prologue each "bleu,a" targets the instruction
2N/A    ! right after its own delay slot, so the delay-slot add is executed
2N/A    ! only when the branch is taken (byte is upper case) and annulled
2N/A    ! otherwise. The magic constants are loaded piecemeal in otherwise
2N/A    ! unused delay slots; the .s1aligned1/2/3 entry points finish
2N/A    ! whatever part has not been loaded yet.
2N/A
2N/A    save    %sp, -SA(WINDOWSIZE), %sp
2N/A    subcc   %i0, %i1, %i2       ! %i2 = s1 - s2; s1 == s2 ?
2N/A    bz,pn   %ncc, .stringsequal ! yup, done, strings equal
2N/A    andcc   %i0, 3, %i3     ! %i3 = s1 & 3; s1 word-aligned ?
2N/A    bz,pn   %ncc, .s1aligned1   ! yup
2N/A    sethi   %hi(0x80808080), %i4    ! start loading Mycroft's magic1
2N/A
2N/A    ldub    [%i1 + %i2], %i0    ! s1[0]
2N/A    ldub    [%i1], %g1      ! s2[0]
2N/A    sub %i0, 'A', %l0       ! transform for faster uppercase check
2N/A    sub %g1, 'A', %l1       ! transform for faster uppercase check
2N/A    cmp %l0, ('Z' - 'A')    ! s1[0] uppercase?
2N/A    bleu,a  .noxlate11      ! yes: execute the add, else annul it
2N/A    add %i0, ('a' - 'A'), %i0   ! s1[0] = tolower(s1[0])
2N/A.noxlate11:
2N/A    cmp %l1, ('Z' - 'A')    ! s2[0] uppercase?
2N/A    bleu,a  .noxlate12      ! yes: execute the add, else annul it
2N/A    add %g1, ('a' - 'A'), %g1   ! s2[0] = tolower(s2[0])
2N/A.noxlate12:
2N/A    subcc   %i0, %g1, %i0       ! tolower(s1[0]) != tolower(s2[0]) ?
2N/A    bne,pn  %ncc, .done     ! yup, done, return the difference
2N/A    inc %i1         ! s1++, s2++ (s1 is addressed as %i1 + %i2)
2N/A    addcc   %i0, %g1, %i0       ! %i0 was 0, so this tests s1[0] == 0 ?
2N/A    bz,pn   %ncc, .done     ! yup, done, strings equal
2N/A    cmp %i3, 3          ! s1 aligned now?
2N/A    bz  %ncc, .s1aligned2   ! yup
2N/A    sethi   %hi(0x01010101), %i5    ! start loading Mycroft's magic2
2N/A
2N/A    ldub    [%i1 + %i2], %i0    ! s1[1]
2N/A    ldub    [%i1], %g1      ! s2[1]
2N/A    sub %i0, 'A', %l0       ! transform for faster uppercase check
2N/A    sub %g1, 'A', %l1       ! transform for faster uppercase check
2N/A    cmp %l0, ('Z' - 'A')    ! s1[1] uppercase?
2N/A    bleu,a  .noxlate21      ! yes: execute the add, else annul it
2N/A    add %i0, ('a' - 'A'), %i0   ! s1[1] = tolower(s1[1])
2N/A.noxlate21:
2N/A    cmp %l1, ('Z' - 'A')    ! s2[1] uppercase?
2N/A    bleu,a  .noxlate22      ! yes: execute the add, else annul it
2N/A    add %g1, ('a' - 'A'), %g1   ! s2[1] = tolower(s2[1])
2N/A.noxlate22:
2N/A    subcc   %i0, %g1, %i0       ! tolower(s1[1]) != tolower(s2[1]) ?
2N/A    bne,pn  %ncc, .done     ! yup, done, return the difference
2N/A    inc %i1         ! s1++, s2++ (s1 is addressed as %i1 + %i2)
2N/A    addcc   %i0, %g1, %i0       ! %i0 was 0, so this tests s1[1] == 0 ?
2N/A    bz,pn   %ncc, .done     ! yup, done, strings equal
2N/A    cmp %i3, 2          ! s1 aligned now?
2N/A    bz  %ncc, .s1aligned3   ! yup
2N/A    or  %i4, %lo(0x80808080),%i4! finish loading Mycroft's magic1
2N/A
2N/A    ldub    [%i1 + %i2], %i0    ! s1[2]
2N/A    ldub    [%i1], %g1      ! s2[2]
2N/A    sub %i0, 'A', %l0       ! transform for faster uppercase check
2N/A    sub %g1, 'A', %l1       ! transform for faster uppercase check
2N/A    cmp %l0, ('Z' - 'A')    ! s1[2] uppercase?
2N/A    bleu,a  .noxlate31      ! yes: execute the add, else annul it
2N/A    add %i0, ('a' - 'A'), %i0   ! s1[2] = tolower(s1[2])
2N/A.noxlate31:
2N/A    cmp %l1, ('Z' - 'A')    ! s2[2] uppercase?
2N/A    bleu,a  .noxlate32      ! yes: execute the add, else annul it
2N/A    add %g1, ('a' - 'A'), %g1   ! s2[2] = tolower(s2[2])
2N/A.noxlate32:
2N/A    subcc   %i0, %g1, %i0       ! tolower(s1[2]) != tolower(s2[2]) ?
2N/A    bne,pn  %ncc, .done     ! yup, done, return the difference
2N/A    inc %i1         ! s1++, s2++ (s1 is addressed as %i1 + %i2)
2N/A    addcc   %i0, %g1, %i0       ! %i0 was 0, so this tests s1[2] == 0 ?
2N/A    bz,pn   %ncc, .done     ! yup, done, strings equal
2N/A    or  %i5, %lo(0x01010101),%i5! finish loading Mycroft's magic2
2N/A    ba  .s1aligned4     ! s1 aligned now
2N/A    andcc   %i1, 3, %i3     ! %i3 = s2 & 3; s2 word-aligned ?
2N/A
2N/A    ! Here we finish loading the constants used by the zero-byte check
2N/A    ! and decide whether we can take the faster path, which applies
2N/A    ! when s2 turns out to be word-aligned as well.
2N/A
2N/A.s1aligned1:
2N/A    sethi   %hi(0x01010101), %i5    ! start loading Mycroft's magic2
2N/A.s1aligned2:
2N/A    or  %i4, %lo(0x80808080),%i4! finish loading Mycroft's magic1
2N/A.s1aligned3:
2N/A    or  %i5, %lo(0x01010101),%i5! finish loading Mycroft's magic2
2N/A    andcc   %i1, 3, %i3     ! %i3 = s2 & 3; s2 word-aligned ?
2N/A.s1aligned4:
2N/A    sethi   %hi(0x3f3f3f3f), %l2    ! load m2 for parallel tolower()
2N/A    sethi   %hi(0x25252525), %l3    ! load m3 for parallel tolower()
2N/A    or  %l2, %lo(0x3f3f3f3f),%l2! finish loading m2
2N/A    bz  .word4          ! yup, s2 word-aligned (flags from the andcc)
2N/A    or  %l3, %lo(0x25252525),%l3! finish loading m3
2N/A
2N/A    ! s2 is not word-aligned: read s2 with aligned word loads and
2N/A    ! rebuild each unaligned word from two neighbouring aligned words.
2N/A    add %i2, %i3, %i2       ! adjust offset s1-s2 for rounding s2 down
2N/A    sll     %i3, 3, %l6         ! shift factor for left shifts (bits)
2N/A    andn    %i1, 3, %i1     ! round s2 pointer down to a word boundary
2N/A    sub %g0, %l6, %l7       ! shift factor for right shifts (-%l6)
2N/A    orn %i3, %g0, %i3       ! generate all ones
2N/A    lduw    [%i1], %i0      ! aligned word containing the start of s2
2N/A    srl %i3, %l6, %i3       ! mask covering the bytes that belong to s2
2N/A    sll %i0, %l6, %g1       ! partial unaligned word from s2
2N/A    orn %i0, %i3, %i0       ! force bytes preceding s2 to non-zero
2N/A    nop             ! pad to align loop to 16-byte boundary
2N/A    nop             ! pad to align loop to 16-byte boundary
2N/A
2N/A    ! This is the comparison procedure used if s2 is not word aligned;
2N/A    ! if it is, we use word4 & cmp4. The next aligned word of s2 is
2N/A    ! loaded only when the current one contains no null byte (the load
2N/A    ! sits in an annulled delay slot), so no word beyond the one that
2N/A    ! holds the terminator of s2 is read.
2N/A
2N/A.cmp:
2N/A    andn    %i4, %i0, %l4       ! ~word & 0x80808080
2N/A    sub %i0, %i5, %l5       ! word - 0x01010101
2N/A    andcc   %l5, %l4, %g0       ! (word - 0x01010101) & ~word & 0x80808080
2N/A    bz,a,pt %ncc, .doload       ! no null byte in current aligned s2 word ?
2N/A    lduw    [%i1 + 4], %i0      ! yup, load next aligned word from s2
2N/A.doload:
2N/A    srl %i0, %l7, %i3       ! leading bytes of the following s2 word
2N/A    or  %g1, %i3, %g1       ! merge to get unaligned word from s2
2N/A    lduw    [%i1 + %i2], %i3    ! x1 = word from s1
2N/A    andn    %i3, %i4, %l0       ! q1 = x1 & ~m1
2N/A    andn    %g1, %i4, %l4       ! q2 = x2 & ~m1
2N/A    add %l0, %l2, %l1       ! p1 = q1 + m2
2N/A    add %l4, %l2, %l5       ! p2 = q2 + m2
2N/A    add %l0, %l3, %l0       ! q1 = q1 + m3
2N/A    add %l4, %l3, %l4       ! q2 = q2 + m3
2N/A    andn    %l1, %l0, %l1       ! p1 = p1 & ~q1
2N/A    andn    %l5, %l4, %l5       ! p2 = p2 & ~q2
2N/A    andn    %i4, %i3, %l0       ! q1 = m1 & ~x1
2N/A    andn    %i4, %g1, %l4       ! q2 = m1 & ~x2
2N/A    and %l0, %l1, %l0       ! q1 = p1 & q1
2N/A    and %l4, %l5, %l4       ! q2 = p2 & q2
2N/A    srl %l0, 2, %l0     ! q1 = q1 >> 2
2N/A    srl %l4, 2, %l4     ! q2 = q2 >> 2
2N/A    add %l0, %i3, %i3       ! lowercase word from s1
2N/A    add %l4, %g1, %g1       ! lowercase word from s2
2N/A    cmp %i3, %g1        ! tolower(*s1) != tolower(*s2) ?
2N/A    bne %icc, .wordsdiffer  ! yup, now find byte that is different
2N/A    add %i1, 4, %i1     ! s1+=4, s2+=4
2N/A    andn    %i4, %i3, %l4       ! ~word & 0x80808080
2N/A    sub %i3, %i5, %l5       ! word - 0x01010101
2N/A    andcc   %l5, %l4, %g0       ! (word - 0x01010101) & ~word & 0x80808080
2N/A    bz,pt   %ncc, .cmp      ! no null-byte in s1 yet
2N/A    sll %i0, %l6, %g1       ! partial unaligned word from s2
2N/A
2N/A    ! words are equal but the end of s1 has been reached
2N/A    ! this means the strings must be equal
2N/A.stringsequal:
2N/A    ret             ! return
2N/A    restore %g0, %g0, %o0       ! return 0, i.e. strings are equal
2N/A    nop             ! pad
2N/A
2N/A
2N/A    ! Both s1 and s2 are word aligned here, so each iteration compares
2N/A    ! one aligned word from each string directly.
2N/A
2N/A.word4:
2N/A    lduw    [%i1 + %i2], %i3    ! x1 = word from s1
2N/A
2N/A.cmp4:
2N/A    andn    %i3, %i4, %l0       ! q1 = x1 & ~m1
2N/A    lduw    [%i1], %g1      ! x2 = word from s2
2N/A    andn    %g1, %i4, %l4       ! q2 = x2 & ~m1
2N/A    add %l0, %l2, %l1       ! p1 = q1 + m2
2N/A    add %l4, %l2, %l5       ! p2 = q2 + m2
2N/A    add %l0, %l3, %l0       ! q1 = q1 + m3
2N/A    add %l4, %l3, %l4       ! q2 = q2 + m3
2N/A    andn    %l1, %l0, %l1       ! p1 = p1 & ~q1
2N/A    andn    %l5, %l4, %l5       ! p2 = p2 & ~q2
2N/A    andn    %i4, %i3, %l0       ! q1 = m1 & ~x1
2N/A    andn    %i4, %g1, %l4       ! q2 = m1 & ~x2
2N/A    and %l0, %l1, %l0       ! q1 = p1 & q1
2N/A    and %l4, %l5, %l4       ! q2 = p2 & q2
2N/A    srl %l0, 2, %l0     ! q1 = q1 >> 2
2N/A    srl %l4, 2, %l4     ! q2 = q2 >> 2
2N/A    add %l0, %i3, %i3       ! lowercase word from s1
2N/A    add %l4, %g1, %g1       ! lowercase word from s2
2N/A    cmp %i3, %g1        ! tolower(*s1) != tolower(*s2) ?
2N/A    bne,pn  %icc, .wordsdiffer  ! yup, now find mismatching character
2N/A    add %i1, 4, %i1     ! s1+=4, s2+=4
2N/A    andn    %i4, %i3, %l4       ! ~word & 0x80808080
2N/A    sub %i3, %i5, %l5       ! word - 0x01010101
2N/A    andcc   %l5, %l4, %g0       ! (word - 0x01010101) & ~word & 0x80808080
2N/A    bz,a,pt %icc, .cmp4     ! no null-byte in s1 yet
2N/A    lduw    [%i1 + %i2], %i3    ! load word from s1 (annulled at end of s1)
2N/A
2N/A    ! words are equal but the end of s1 has been reached
2N/A    ! this means the strings must be equal
2N/A.stringsequal4:
2N/A    ret             ! return
2N/A    restore %g0, %g0, %o0       ! return 0, i.e. strings are equal
2N/A
2N/A    ! The lowercased words in %i3 (s1) and %g1 (s2) differ. Walk them
2N/A    ! from the most significant byte down, returning the difference at
2N/A    ! the first mismatching byte, or 0 if a null byte in s1 comes first.
2N/A    ! Note that the delay slot of each "bz" below is the instruction
2N/A    ! that follows the intervening comment.
2N/A.wordsdiffer:
2N/A    srl %g1, 24, %i2        ! first byte of mismatching word in s2
2N/A    srl %i3, 24, %i1        ! first byte of mismatching word in s1
2N/A    subcc   %i1, %i2, %i0       ! *s1-*s2
2N/A    bnz,pn  %ncc, .done     ! bytes differ, return difference
2N/A    srl %g1, 16, %i2        ! first two bytes of mismatching word in s2
2N/A    andcc   %i1, 0xff, %i0      ! *s1 == 0 ? (also sets return value)
2N/A    bz,pn   %ncc, .done     ! yup, return 0
2N/A
2N/A    ! we know byte 1 is equal, so can compare bytes 1,2 as a group
2N/A
2N/A    srl %i3, 16, %i1        ! first two bytes of mismatching word in s1
2N/A    subcc   %i1, %i2, %i0       ! *s1-*s2
2N/A    bnz,pn  %ncc, .done     ! bytes differ, return difference
2N/A    srl %g1, 8, %i2     ! first three bytes of mismatching word in s2
2N/A    andcc   %i1, 0xff, %i0      ! *s1 == 0 ? (also sets return value)
2N/A    bz,pn   %ncc, .done     ! yup, return 0
2N/A
2N/A    ! we know bytes 1, 2 are equal, so can compare bytes 1,2,3 as a group
2N/A
2N/A    srl %i3, 8, %i1     ! first three bytes of mismatching word in s1
2N/A    subcc   %i1, %i2, %i0       ! *s1-*s2
2N/A    bnz,pn  %ncc, .done     ! bytes differ, return difference
2N/A    andcc   %i1, 0xff, %g0      ! *s1 == 0 ?
2N/A    bz,pn   %ncc, .stringsequal ! yup, return 0
2N/A
2N/A    ! we know bytes 1,2,3 are equal, so can compare bytes 1,2,3,4 as group
2N/A
2N/A    subcc   %i3, %g1, %i0       ! *s1-*s2
2N/A    bz,a    .done           ! taken only if words are equal; otherwise
2N/A                    ! falls through to .done with the difference.
2N/A                    ! NOTE(review): not expected to be taken, since
2N/A                    ! .wordsdiffer is entered only on a mismatch
2N/A    andcc   %i3, 0xff, %i0      ! *s1 == 0 ? (annulled unless branch taken)
2N/A
2N/A.done:
2N/A    ret             ! return
2N/A    restore %i0, %g0, %o0       ! return tolower(*s1) - tolower(*s2)
2N/A
2N/A    SET_SIZE(ascii_strcasecmp)