sort/common/fields.c

	fields.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I% %E% SMI"

#include "fields.h"

/*
 * fields
 *
 * Overview
 *   By a field, we mean the various delimited character sequences within each
 *   line of the input files.  The sort key consists of an ordered sequence of
 *   fields, which need not include all possible fields for the given line.
 *   (Furthermore, not every line need contain sufficient fields for the fields
 *   given within the sort key.  In fact, none of the lines in the input stream
 *   need contain sufficient fields.)
 *
 *   There are two methods for specifying fields for sort(1); these are
 *   discussed in options.c.  Here we discuss only the internal representation
 *   of fields, as used for constructing the collation vector for each line as
 *   defined by the sort key.
 *
 * Representation
 *   The sort key is a singly-linked list of field specifiers.  At present,
 *   fields may belong to one of three species:  alphabetical, numerical, or
 *   monthly; the species (f_species) then indicates the conversion function
 *   (f_convert) used to transform the raw characters of the character sequence
 *   to a collatable form.  (In principle, this allows us to consider future
 *   field species such as hexadecimal.)
 *
 *   Fields and offsets are numbered such that zero refers to the first field or
 *   character, respectively.  Thus, the interpretation of a key specifier, m.n,
 *   is that the field begins at the nth character beyond the mth occurrence of
 *   the key separator.  If the blanks flag has been specified, then the field
 *   begins at the nth non-blank character past the mth key separator.  If the
 *   key separator is unspecified, then the key separator is defined as one or
 *   more blank characters.
 *
 *   In general, the various options afforded by sort may be broken into two
 *   categories:  field species and field modifiers.  For each field species,
 *   there is one or more conversion routines that take a delimited character
 *   sequence and convert it to a character sequence collatable by strcmp() or
 *   memcmp().  For field species that may be further modified, such as the
 *   fold-to-uppercase option for alphabetic fields, the conversion routine may
 *   be aware of how the modifier affects collation.  Finally, the no-modifiers
 *   case may present an opportunity for a simplified, faster version.
 *
 * Code Structure
 *   The code paths for single-byte and multi-byte locales diverge significantly
 *   in fields.c.  Most routines have an *_wide() version, which produces an
 *   equivalent effect for line records whose data field is composed of wide
 *   characters (wchar_t).  However, the l_collated field of a line record is
 *   always composed of characters, so that the radix sorts provided in
 *   internal.c can work in both single- and multi-byte locales.  Thus, in the
 *   various convert_*_wide() routines, the output is placed in l_collated, with
 *   a length multiplier of 4.
 */

/*
 * State flags for the numeric conversion routines:  IN_NUMBER is set in the
 * state word once a digit other than a leading zero has been consumed.
 */
#define BEFORE_NUMBER   0x0
#define IN_NUMBER   0x1

/*
 * Grouping and radix characters taken from localeconv() by
 * field_initialize_separator().  The separators and the monetary decimal are
 * set to '\0' when the locale leaves them empty.
 */
static char numerical_separator;
static char numerical_decimal;
static char monetary_separator;
static char monetary_decimal;

/*
 * Wide-character forms of the above, produced with mbtowc().
 */
static wchar_t  w_numerical_separator;
static wchar_t  w_numerical_decimal;
static wchar_t  w_monetary_separator;
static wchar_t  w_monetary_decimal;

/*
 * MAX_MON_LEN bounds the number of characters of a month abbreviation that
 * are examined or stored.
 */
#define MONTHS_IN_YEAR  12
#define MAX_MON_LEN 20

/*
 * Month collation tokens:  MO_NONE is emitted for keys that match no month
 * name; a matching key is emitted as its zero-based month index plus
 * MO_OFFSET.
 */
enum { MO_NONE = 1, MO_OFFSET = 2 };

/*
 * Upper-cased month abbreviations and their lengths, filled in by
 * field_initialize_month().
 */
static char *months[MONTHS_IN_YEAR];
static size_t   month_lengths[MONTHS_IN_YEAR];
static wchar_t  *w_months[MONTHS_IN_YEAR];
static size_t   w_month_lengths[MONTHS_IN_YEAR];

/*
 * Character classification helpers.  A newline is never treated as a blank.
 * The W_* forms take wide characters; note that W_IS_SEPARATOR and
 * W_IS_DECIMAL still test the single-byte variables to decide whether the
 * locale defined the corresponding character.
 */
#define DECIMAL_CHAR        (numerical_decimal)
#define IS_BLANK(x)     (isspace((uchar_t)(x)) && (x) != '\n')
#define IS_SEPARATOR(x)     \
    ((numerical_separator != '\0' && (x) == numerical_separator) || \
    (monetary_separator != '\0' && (x) == monetary_separator))
#define IS_DECIMAL(x)       \
    ((x) == numerical_decimal || \
    (monetary_decimal != '\0' && (x) == monetary_decimal))
#define W_DECIMAL_CHAR      (w_numerical_decimal)
#define W_IS_BLANK(x)       (iswspace(x) && (x) != L'\n')
#define W_IS_SEPARATOR(x)   \
    ((numerical_separator != '\0' && (x) == w_numerical_separator) || \
    (monetary_separator != '\0' && (x) == w_monetary_separator))
#define W_IS_DECIMAL(x)     \
    (((x) == w_numerical_decimal) || \
    (monetary_decimal != '\0' && (x) == w_monetary_decimal))

/*
 * Value written between consecutive converted fields in the collation vector.
 */
#define INTERFIELD_SEPARATOR '\0'
#define W_INTERFIELD_SEPARATOR L'\0'

/*
 * NOTE(review): these masks are not referenced in the visible portion of this
 * file; confirm whether they are still needed.
 */
#define INT_SIGN_FLIP_MASK 0x80000000
#define INT_SIGN_PASS_MASK 0x00000000

/*
 * strx_ops_t, xfrm_len, and C_ncpy:  In the case where we are sorting in the
 * C locale, we want to avoid the expense of transforming strings to collatable
 * forms since, by definition, an arbitrary string in the C locale is already in
 * its collatable form.  Therefore, we construct a small ops vector (the
 * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into
 * strlen()-like behaviour, and C_ncpy() to make strncpy() appear
 * strxfrm()-like.
 */
/*
 * xfrm_len() reports the number of bytes strxfrm() would need to hold the
 * transformed form of s2, including the terminating null.  The len argument
 * exists only so that the signature matches C_len(); it is ignored.
 */
/*ARGSUSED*/
static size_t
xfrm_len(const char *s2, size_t len)
{
    size_t transformed = strxfrm(NULL, s2, 0);

    return (transformed + 1);
}

/*
 * C_ncpy() gives strncpy() a strxfrm()-shaped interface for the C locale.
 * The caller passes n as the length of s2 plus one for the terminating null,
 * so the number of characters "transformed" is n - 1.  This helper exists for
 * the alpha conversion routines only; it is not a general-purpose copy, since
 * it relies on that relationship between n and s2.
 */
static size_t
C_ncpy(char *s1, const char *s2, size_t n)
{
    size_t copied = n - 1;

    (void) strncpy(s1, s2, n);

    return (copied);
}

/*
 * C_len() is the length operation for the C locale ops vector:  the string is
 * already in collatable form, so the transformed length is simply the length
 * the caller supplied.  s is only checked for being non-NULL.
 */
/*ARGSUSED*/
static size_t
C_len(const char *s, size_t len)
{
    ASSERT(s != NULL);
    return (len);
}

/*
 * Operations vector for string transformation:  sx_len yields the space
 * required for the transformed string, sx_xfrm performs the transformation.
 */
typedef struct _strx_ops {
    size_t  (*sx_len)(const char *, size_t);
    size_t  (*sx_xfrm)(char *, const char *, size_t);
} strx_ops_t;

/* C locale:  plain copy.  Other single-byte locales:  strxfrm(). */
static const strx_ops_t C_ops = { C_len, C_ncpy };
static const strx_ops_t SB_ops = { xfrm_len, strxfrm };

/* Selected once by field_initialize(). */
static const strx_ops_t *xfrm_ops;

/*
 * field_initialize_separator() captures the numeric and monetary grouping and
 * radix characters of the current locale, in both single-byte and wide form.
 */
static void
field_initialize_separator(void)
{
    /*
     * Of the values consulted here, only decimal_point is guaranteed to be
     * defined by a locale.  By tradition sort(1) pays no attention to
     * positive_sign, negative_sign, grouping, or the currency symbols (or
     * their numeric counterparts, if any).
     */
    struct lconv *lc = localeconv();
    char *num_sep = lc->thousands_sep;
    char *mon_sep = lc->mon_thousands_sep;
    char *mon_dec = lc->mon_decimal_point;
    char *num_dec = lc->decimal_point;

    if (xstreql(num_sep, "")) {
        numerical_separator = '\0';
    } else {
        numerical_separator = *num_sep;
        (void) mbtowc(&w_numerical_separator, num_sep, MB_CUR_MAX);
    }

    if (xstreql(mon_sep, "")) {
        monetary_separator = '\0';
    } else {
        monetary_separator = *mon_sep;
        (void) mbtowc(&w_monetary_separator, mon_sep, MB_CUR_MAX);
    }

    if (xstreql(mon_dec, "")) {
        monetary_decimal = '\0';
    } else {
        monetary_decimal = *mon_dec;
        (void) mbtowc(&w_monetary_decimal, mon_dec, MB_CUR_MAX);
    }

    /* The decimal point is always present, so no emptiness test is made. */
    numerical_decimal = *num_dec;
    (void) mbtowc(&w_numerical_decimal, num_dec, MB_CUR_MAX);
}

static void
field_initialize_month(int is_c_locale)
{
    int i;
    int j;
    struct tm this_month;
    const char *c_months[MONTHS_IN_YEAR] = {
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
        "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
    };

    char month_name[MAX_MON_LEN * MB_LEN_MAX];
    wchar_t w_month_name[MAX_MON_LEN];

    if (is_c_locale) {
        for (i = 0; i < MONTHS_IN_YEAR; i++) {
            months[i] = (char *)c_months[i];
            month_lengths[i] = strlen(c_months[i]);
        }
        /*
         * We don't need to initialize the wide version of the month
         * names.
         */
        return;
    }

    (void) memset(&this_month, 0, sizeof (this_month));

    for (i = 0; i < MONTHS_IN_YEAR; i++) {
        this_month.tm_mon = i;

        (void) strftime(month_name, sizeof (month_name),
            "%b", &this_month);

        for (j = 0; j < strlen(month_name); j++)
            month_name[j] = toupper(month_name[j]);
        (void) mbstowcs(w_month_name, month_name, MAX_MON_LEN);

        months[i] = strdup(month_name);
        month_lengths[i] = strlen(month_name);
        w_months[i] = wsdup(w_month_name);
        w_month_lengths[i] = wslen(w_month_name);
    }
}

/*
 * field_initialize() performs the one-time, locale-dependent setup for this
 * module:  month name tables, numeric separators, and the choice of string
 * transformation ops (plain copy in the C locale, strxfrm() otherwise).
 */
void
field_initialize(sort_t *S)
{
    field_initialize_month(S->m_c_locale);
    field_initialize_separator();

    xfrm_ops = S->m_c_locale ? &C_ops : &SB_ops;
}

/*
 * field_new() allocates a field specifier with all positions unset (-1) and
 * no successor.  When a sort context S is supplied, the species and options
 * default to those of S; with S == NULL the field is a plain ALPHA field with
 * no options.
 */
field_t *
field_new(sort_t *S)
{
    field_t *fresh = safe_realloc(NULL, sizeof (field_t));

    fresh->f_next = NULL;
    fresh->f_start_field = -1;
    fresh->f_start_offset = -1;
    fresh->f_end_field = -1;
    fresh->f_end_offset = -1;

    if (S != NULL) {
        fresh->f_species = S->m_default_species;
        fresh->f_options = S->m_field_options;
        return (fresh);
    }

    fresh->f_species = ALPHA;
    fresh->f_options = 0;

    return (fresh);
}

/*
 * field_delete() releases a single field specifier.  Only F itself is freed;
 * f_next is not followed.
 */
void
field_delete(field_t *F)
{
    free(F);
}

/*
 * field_add_to_chain() appends A to the end of the list whose head pointer is
 * *F.  The list is walked link by link until the terminating NULL link is
 * found, which is then pointed at A; an empty list simply becomes A.
 */
void
field_add_to_chain(field_t **F, field_t *A)
{
    field_t **link = F;

    while (*link != NULL)
        link = &((*link)->f_next);

    *link = A;
}

#ifdef DEBUG
#ifndef _LP64
#define FIELD_FMT \
"\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
#else /* !_LP64 */
#define FIELD_FMT \
"\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
#endif /* !_LP64 */

/*
 * field_print is used only for debugging purposes.  It writes the species,
 * the set of option flags (or NO_MODIFIERS if none are set), and the start
 * and end positions of F to stderr.  Each option name is followed by a space
 * so that multiple options remain legible.
 */
void
field_print(field_t *F)
{
    const char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"};
    int status = 0;

    (void) fprintf(stderr, "Type: %s", field_names[F->f_species]);
    (void) fprintf(stderr, "\tOptions: ");

    if (F->f_options & FIELD_REVERSE_COMPARISONS) {
        (void) fprintf(stderr, "REVERSE ");
        status++;
    }
    if (F->f_options & FIELD_DICTIONARY_ORDER) {
        (void) fprintf(stderr, "DICTIONARY ");
        status++;
    }
    if (F->f_options & FIELD_FOLD_UPPERCASE) {
        (void) fprintf(stderr, "UPPERCASE ");
        status++;
    }
    if (F->f_options & FIELD_IGNORE_NONPRINTABLES) {
        (void) fprintf(stderr, "PRINTABLES ");
        status++;
    }
    if (F->f_options & FIELD_IGNORE_BLANKS_START) {
        (void) fprintf(stderr, "BLANKS_START ");
        status++;
    }
    if (F->f_options & FIELD_IGNORE_BLANKS_END) {
        (void) fprintf(stderr, "BLANKS_END ");
        status++;
    }

    if (status == 0)
        (void) fprintf(stderr, "NO_MODIFIERS");

    (void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset,
        F->f_end_field, F->f_end_offset);
}
#endif /* DEBUG */

/*
 * field_boundary() locates one edge of field F within the single-byte line
 * record L, for the case where fields are separated by runs of blanks.
 *
 *   is_end     zero to locate the start of the field, non-zero for its end
 *   is_blanks  non-zero if blanks at the located position are to be skipped
 *              before the character offset is applied
 *
 * Returns an offset into l_data, never greater than l_data_length.  An
 * unspecified end field (-1) means the field runs to the end of the line.
 */
static ssize_t
field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks)
{
    char *S = L->l_data.sp;
    char *T = S;
    char *eol = S + L->l_data_length;
    ssize_t field = is_end ? F->f_end_field : F->f_start_field;
    ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
    ssize_t ret;

    ASSERT(is_end || field > -1);

    if (is_end && field == -1)
        return (L->l_data_length);

    /*
     * Step over the requested number of fields, each being an optional
     * run of blanks followed by a run of non-blanks.
     */
    while (field-- > 0) {
        while (T < eol && IS_BLANK(*T))
            T++;

        while (T < eol && !IS_BLANK(*T))
            T++;
    }

    /*
     * Skip blanks, but never examine characters beyond the end of the
     * line data, which is not guaranteed to be terminated.
     */
    if ((!is_end || offset > 0) && is_blanks) {
        while (T < eol && IS_BLANK(*T))
            T++;
    }

    if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
        return (L->l_data_length);

    return (ret);
}

/*
 * field_delimit() computes the [*start, *end) offsets of field F within the
 * single-byte line record L when no field separator has been specified.  The
 * start and end blank-skipping options of F are honoured independently.
 */
static void
field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
{
    int skip_leading;
    int skip_trailing;

    ASSERT(F->f_start_field > -1);

    skip_leading = F->f_options & FIELD_IGNORE_BLANKS_START;
    *start = field_boundary(F, L, 0, skip_leading);

    skip_trailing = F->f_options & FIELD_IGNORE_BLANKS_END;
    *end = field_boundary(F, L, 1, skip_trailing);
}

/*
 * field_boundary_wide() is the wide-character counterpart of
 * field_boundary():  it locates one edge of field F within the wide line
 * record L, for the case where fields are separated by runs of blanks.
 *
 *   is_end     zero to locate the start of the field, non-zero for its end
 *   is_blanks  non-zero if blanks at the located position are to be skipped
 *              before the character offset is applied
 *
 * Returns an offset (in wide characters) into l_data, never greater than
 * l_data_length.  An unspecified end field (-1) means end of line.
 */
static ssize_t
field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks)
{
    wchar_t *S = L->l_data.wp;
    wchar_t *T = S;
    wchar_t *eol = S + L->l_data_length;
    ssize_t field = is_end ? F->f_end_field : F->f_start_field;
    ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
    ssize_t ret;

    ASSERT(is_end || field > -1);

    if (is_end && field == -1)
        return (L->l_data_length);

    /*
     * Step over the requested number of fields, each being an optional
     * run of blanks followed by a run of non-blanks.
     */
    while (field-- > 0) {
        while (T < eol && W_IS_BLANK(*T))
            T++;

        while (T < eol && !W_IS_BLANK(*T))
            T++;
    }

    /*
     * Skip blanks, but never examine characters beyond the end of the
     * line data, which is not guaranteed to be terminated.
     */
    if ((!is_end || offset > 0) && is_blanks) {
        while (T < eol && W_IS_BLANK(*T))
            T++;
    }

    if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
        return (L->l_data_length);

    return (ret);
}

/*
 * field_delimit_wide() computes the [*start, *end) offsets of field F within
 * the wide-character line record L when no field separator has been
 * specified.
 */
static void
field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
{
    int skip_leading;
    int skip_trailing;

    ASSERT(F->f_start_field > -1);

    skip_leading = F->f_options & FIELD_IGNORE_BLANKS_START;
    *start = field_boundary_wide(F, L, 0, skip_leading);

    skip_trailing = F->f_options & FIELD_IGNORE_BLANKS_END;
    *end = field_boundary_wide(F, L, 1, skip_trailing);
}

/*
 * field_boundary_tabbed() locates one edge of field F within the single-byte
 * line record L when an explicit field delimiter is in effect.
 *
 *   is_end     zero to locate the start of the field, non-zero for its end
 *   is_blanks  non-zero if blanks at the located position are to be skipped
 *              before the character offset is applied
 *   delimiter  the field delimiter; its single-byte member is used
 *
 * Returns an offset into l_data, never greater than l_data_length.  If the
 * line has fewer delimiters than requested, l_data_length is returned.  For
 * an end boundary with a zero offset, the result is backed up by one so that
 * the delimiter itself is excluded from the field.
 */
static ssize_t
field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks,
    vchar_t delimiter)
{
    char *S = L->l_data.sp;
    char *T = S;
    char *eol = S + L->l_data_length;
    ssize_t field = is_end ? F->f_end_field : F->f_start_field;
    ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
    ssize_t ret;

    ASSERT(is_end || field > -1);

    if (is_end && field == -1)
        return (L->l_data_length);

    while (field-- > 0) {
        T = xstrnchr(T, delimiter.sc, eol - T);
        if (T == NULL || T > eol)
            return (L->l_data_length);

        T++;
    }

    /*
     * Skip blanks, but never examine characters beyond the end of the
     * line data, which is not guaranteed to be terminated.
     */
    if ((!is_end || offset != 0) && is_blanks) {
        while (T < eol && IS_BLANK(*T))
            T++;
    }

    if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
        return (L->l_data_length);

    if (is_end && offset == 0)
        ret--;

    return (ret);
}

/*
 * field_delimit_tabbed() is used when the -t option has defined a field
 * separator.  On return, *start refers either to a position one or more
 * characters beyond the delimiter that opens the field, or to the end of the
 * line; *end is the matching end boundary.
 */
static void
field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end,
    vchar_t delimiter)
{
    int skip_leading;
    int skip_trailing;

    ASSERT(F->f_start_field > -1);

    skip_leading = F->f_options & FIELD_IGNORE_BLANKS_START;
    *start = field_boundary_tabbed(F, L, 0, skip_leading, delimiter);

    skip_trailing = F->f_options & FIELD_IGNORE_BLANKS_END;
    *end = field_boundary_tabbed(F, L, 1, skip_trailing, delimiter);
}

/*
 * field_boundary_tabbed_wide() is the wide-character counterpart of
 * field_boundary_tabbed():  it locates one edge of field F within the wide
 * line record L when an explicit field delimiter is in effect.
 *
 *   is_end     zero to locate the start of the field, non-zero for its end
 *   is_blanks  non-zero if blanks at the located position are to be skipped
 *              before the character offset is applied
 *   delimiter  the field delimiter; its wide member is used
 *
 * Returns an offset (in wide characters) into l_data, never greater than
 * l_data_length.  If the line has fewer delimiters than requested,
 * l_data_length is returned.  For an end boundary with a zero offset, the
 * result is backed up by one so that the delimiter itself is excluded.
 */
static ssize_t
field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks,
    vchar_t delimiter)
{
    wchar_t *S = L->l_data.wp;
    wchar_t *T = S;
    wchar_t *eol = S + L->l_data_length;
    ssize_t field = is_end ? F->f_end_field : F->f_start_field;
    ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
    ssize_t ret;

    ASSERT(is_end || field > -1);

    if (is_end && field == -1)
        return (L->l_data_length);

    while (field-- > 0) {
        T = xwsnchr(T, delimiter.wc, eol - T);
        if (T == NULL || T > eol)
            return (L->l_data_length);

        T++;
    }

    /*
     * Skip blanks, but never examine characters beyond the end of the
     * line data, which is not guaranteed to be terminated.
     */
    if ((!is_end || offset != 0) && is_blanks) {
        while (T < eol && W_IS_BLANK(*T))
            T++;
    }

    if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
        return (L->l_data_length);

    if (is_end && offset == 0)
        ret--;

    return (ret);
}

/*
 * field_delimit_tabbed_wide() computes the [*start, *end) offsets of field F
 * within the wide-character line record L when a field separator has been
 * defined.
 */
static void
field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start,
    ssize_t *end, vchar_t delimiter)
{
    int skip_leading;
    int skip_trailing;

    ASSERT(F->f_start_field > -1);

    skip_leading = F->f_options & FIELD_IGNORE_BLANKS_START;
    *start = field_boundary_tabbed_wide(F, L, 0, skip_leading, delimiter);

    skip_trailing = F->f_options & FIELD_IGNORE_BLANKS_END;
    *end = field_boundary_tabbed_wide(F, L, 1, skip_trailing, delimiter);
}

/*
 * field_convert_month() converts the single-byte field at
 * [data_offset, data_offset + data_length) of L into a month collation token
 * written at l_collate + coll_offset.
 *
 * If the field, after leading blanks and case folding, begins with one of the
 * month abbreviations, a single byte (month index + MO_OFFSET) is emitted and
 * 1 is returned.  Otherwise the MO_NONE token is emitted, followed by the
 * plain alpha conversion of the whole field, and the combined length is
 * returned.  Returns -1 if the collation buffer is too small.
 */
/*ARGSUSED*/
ssize_t
field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
    int j;
    ssize_t val;
    char month_candidate[MAX_MON_LEN * MB_LEN_MAX];
    ssize_t month_length = data_length;
    ssize_t month_offset = data_offset;

    if (sizeof (char) > L->l_collate_bufsize - coll_offset)
        return (-1);

    (void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX);


    /*
     * The month field formally begins with the first non-blank character.
     * Stop at the end of the field so that neighbouring data is never
     * examined.
     */
    while (month_length > 0 &&
        IS_BLANK(*(L->l_data.sp + month_offset))) {
        month_offset++;
        month_length--;
    }

    /*
     * toupper() requires a value representable as unsigned char.
     */
    for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
        month_candidate[j] =
            toupper((uchar_t)(L->l_data.sp + month_offset)[j]);

    for (j = 0; j < MONTHS_IN_YEAR; j++) {
        if (xstrneql(month_candidate, months[j], month_lengths[j])) {
            *(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET;
            return (1);
        }
    }

    /*
     * no matching month; copy string into field.  required behaviour is
     * that "month-free" keys sort before month-sortable keys, so insert
     * a "will sort first" token.
     */
    *(L->l_collate.sp + coll_offset) = '\0' + MO_NONE;

    val = field_convert_alpha_simple(F, L, delimiter, data_offset,
        data_length, coll_offset + 1);

    if (val < 0)
        return (-1);
    else
        return (val + 1);
}

/*
 * field_convert_month_wide() is the wide-character counterpart of
 * field_convert_month().  Here data_offset, data_length and coll_offset are
 * all counted in wide characters.
 *
 * If the field, after leading blanks and case folding, begins with one of the
 * month abbreviations, a single wide character (month index + MO_OFFSET) is
 * emitted and 1 is returned.  Otherwise the MO_NONE token is emitted,
 * immediately followed by the alpha conversion of the whole field, and the
 * combined length is returned.  Returns -1 if the collation buffer is too
 * small.
 */
/*ARGSUSED*/
ssize_t
field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
    ssize_t j;
    ssize_t val;
    wchar_t month_candidate[MAX_MON_LEN];
    wchar_t *month;
    wchar_t *buffer = L->l_collate.wp + coll_offset;
    ssize_t month_length = data_length;
    ssize_t month_offset = data_offset;

    if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) <
        sizeof (wchar_t))
        return (-1);

    (void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t));


    /*
     * The month field formally begins with the first non-blank character.
     * Stop at the end of the field so that neighbouring data is never
     * examined.
     */
    while (month_length > 0 &&
        W_IS_BLANK(*(L->l_data.wp + month_offset))) {
        month_offset++;
        month_length--;
    }

    month = L->l_data.wp + month_offset;

    for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
        month_candidate[j] = towupper(month[j]);

    for (j = 0; j < MONTHS_IN_YEAR; j++)
        if (xwcsneql(month_candidate, w_months[j],
            w_month_lengths[j])) {
            *buffer = L'\0' + j + MO_OFFSET;
            return (1);
        }

    *buffer = L'\0' + MO_NONE;

    /*
     * coll_offset is an element index into l_collate.wp, so the string
     * goes exactly one element past the token just written.  (Advancing
     * by sizeof (wchar_t) would leave a gap of uninitialized elements
     * that the returned length does not account for.)
     */
    val = field_convert_alpha_wide(F, L, delimiter, data_offset,
        data_length, coll_offset + 1);

    if (val < 0)
        return (-1);
    else
        return (val + 1);
}

/*
 * field_convert_alpha() converts the single-byte field at
 * [data_offset, data_offset + data_length) of L to collatable form at
 * l_collate + coll_offset, applying the non-printable, dictionary-order and
 * fold-to-uppercase modifiers of F while composing the string.
 *
 * Returns the length of the converted string, or -1 if the converted string
 * would cause l_collate_length to exceed l_collate_bufsize.  The composition
 * buffer is static and grows as required; it is never released.
 */
/*ARGSUSED*/
ssize_t
field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
    static char *compose;
    static ssize_t compose_length;

    ssize_t clength = 0;
    ssize_t dlength;
    ssize_t i;

    if (compose_length < (data_length + 1)) {
        compose_length = data_length + 1;
        compose = safe_realloc(compose, compose_length * sizeof (char));
    }

    for (i = data_offset; i < data_offset + data_length; i++) {
        char t = (L->l_data.sp)[i];

        if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
            !isprint((uchar_t)t))
            continue;

        if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
            !isalnum((uchar_t)t) && !isspace((uchar_t)t))
            continue;

        /*
         * As with the other <ctype.h> calls above, the argument must
         * be representable as unsigned char.
         */
        if (F->f_options & FIELD_FOLD_UPPERCASE)
            t = toupper((uchar_t)t);

        compose[clength++] = t;
    }
    compose[clength] = '\0';

    if ((dlength = xfrm_ops->sx_len(compose, clength)) <
        L->l_collate_bufsize - coll_offset)
        return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
                compose, dlength + 1));
    else
        return ((ssize_t)-1);
}

/*
 * field_convert_alpha_simple() is the no-modifiers form of the alpha
 * conversion:  the field at [data_offset, data_offset + data_length) of L is
 * copied verbatim, terminated, and transformed into l_collate + coll_offset.
 *
 * Returns the length of the converted string, or -1 if it will not fit in
 * the collation buffer.  The composition buffer is static and grows as
 * required.
 */
/*ARGSUSED*/
ssize_t
field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
    static char *compose;
    static ssize_t compose_length;

    ssize_t xfrm_need;

    /* Grow the scratch buffer to hold the field plus its terminator. */
    if (data_length + 1 > compose_length) {
        compose_length = data_length + 1;
        compose = safe_realloc(compose, compose_length * sizeof (char));
    }

    (void) memcpy(compose, L->l_data.sp + data_offset, data_length);
    compose[data_length] = '\0';

    xfrm_need = xfrm_ops->sx_len(compose, data_length);
    if (xfrm_need >= L->l_collate_bufsize - coll_offset)
        return ((ssize_t)-1);

    return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset, compose,
        xfrm_need + 1));
}

/*
 * field_convert_alpha_wide() converts the wide-character field at
 * [data_offset, data_offset + data_length) of L to collatable form at
 * l_collate.wp + coll_offset, applying the non-printable, dictionary-order
 * and fold-to-uppercase modifiers of F.  All offsets and lengths are counted
 * in wide characters.
 *
 * Returns the value of wcsxfrm() for the converted string, or -1 if it will
 * not fit in the collation buffer.  A temporary composition buffer is
 * allocated and released on each call.
 */
/*ARGSUSED*/
ssize_t
field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
    wchar_t *compose = safe_realloc(NULL, (data_length + 1) *
        sizeof (wchar_t));
    ssize_t out = 0;
    ssize_t k;
    ssize_t dlength;
    ssize_t ret = (ssize_t)-1;

    for (k = 0; k < data_length; k++) {
        wchar_t wc = (L->l_data.wp)[data_offset + k];

        /* Drop characters excluded by the active modifiers. */
        if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
            !iswprint(wc))
            continue;

        if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
            !(iswalnum(wc) || iswspace(wc)))
            continue;

        if (F->f_options & FIELD_FOLD_UPPERCASE)
            wc = towupper(wc);

        compose[out] = wc;
        out++;
    }
    compose[out] = L'\0';

    /* Only transform if the result, in bytes, fits the space left. */
    dlength = wcsxfrm(NULL, compose, (size_t)0);
    if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize -
        coll_offset * sizeof (wchar_t))
        ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose,
            (size_t)dlength + 1);

    safe_free(compose);

    return (ret);
}

/*
 * field_convert_numeric() converts the given field into a collatable numerical
 * sequence.  The sequence is ordered as { log, integer, separator, fraction },
 * with an optional sentinel component at the sequence end.
 *
 * The output at l_collate + coll_offset consists of a sign character ('0' for
 * negative, '1' for zero, '2' for positive), the logarithm as sizeof (int)
 * big-endian bytes, and then the significant digits; digits of negative
 * numbers are complemented so that larger magnitudes collate first.
 *
 * Returns the number of bytes produced, or -1 if the collation buffer is too
 * small.
 */
/*ARGSUSED*/
ssize_t
field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
    char *number;
    char *buffer = L->l_collate.sp + coll_offset;
    ssize_t length;

    char sign = '2';
    int log_ten;
    char *digits = buffer + 1 + sizeof (int) / sizeof (char);
    size_t j = 0;
    size_t i;

    int state = BEFORE_NUMBER;

    number = L->l_data.sp + data_offset;
    length = data_length;

    /*
     * Eat leading blanks, if any.
     */
    for (i = 0; i < length; i++)
        if (!IS_BLANK(number[i]))
            break;

    /*
     * Test that there is sufficient size in the collation buffer for our
     * number.  In addition to the possible remaining characters in the
     * field, we also require space for the sign (char), logarithm (int),
     * separator (char), and as many as two string terminators (for reverse
     * sorts).
     */
    if (((length - i) + 4 * sizeof (char) + sizeof (int)) >
        (L->l_collate_bufsize - coll_offset))
        return ((ssize_t)-1);

    /*
     * If negative, set sign.  The bounds test keeps us from examining the
     * character that follows an empty or all-blank field.
     */
    if (i < length && number[i] == '-') {
        i++;
        sign = '0';
    }

    /*
     * Scan integer part; eat leading zeros.
     */
    for (; i < length; i++) {
        if (IS_SEPARATOR(number[i]))
            continue;

        if (number[i] == '0' && !(state & IN_NUMBER))
            continue;

        if (!isdigit((uchar_t)number[i]))
            break;

        state |= IN_NUMBER;
        if (sign == '0')
            digits[j++] = '0' + '9' - number[i];
        else
            digits[j++] = number[i];
    }

    if (i < length && IS_DECIMAL(number[i])) {
        /*
         * Integer part terminated by decimal.
         */
        digits[j] = DECIMAL_CHAR;
        log_ten = j++;

        /*
         * Scan fractional part.
         */
        for (++i; i < length; i++) {
            if (IS_SEPARATOR(number[i]))
                continue;

            if (!isdigit((uchar_t)number[i]))
                break;

            if (number[i] != '0')
                state |= IN_NUMBER;

            if (sign == '0')
                digits[j++] = '0' + '9' - number[i];
            else
                digits[j++] = number[i];
        }

        if (sign == '0')
            digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
    } else {
        /*
         * Nondigit or end of string seen.
         */
        log_ten = (int)j;
        if (sign == '0')
            digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
        else
            digits[j] = INTERFIELD_SEPARATOR;
    }

    if ((state & IN_NUMBER) == 0) {
        /*
         * A non-zero number was not detected; treat as defined zero.
         */
        sign = '1';
        log_ten = 0;
        digits[0] = '0';
        j = 1;
    }

    /*
     * We subtract a constant from the log of negative values so that
     * they will correctly precede positive values with a zero logarithm.
     */
    if (sign == '0') {
        if (j != 0)
            log_ten = -log_ten - 2;
        else
            /*
             * Special case for -0.
             */
            log_ten = -1;
    }

    buffer[0] = sign;

    /*
     * Place logarithm in big-endian form.  The bytes are extracted from
     * the unsigned representation, since shifting a negative int left is
     * undefined.
     */
    for (i = 0; i < sizeof (int); i++)
        buffer[i + 1] = (char)(((unsigned int)log_ten >>
            ((sizeof (int) - 1 - i) * NBBY)) & 0xff);

    if (j + sizeof (char) + sizeof (int) <
        L->l_collate_bufsize - coll_offset)
        return (j + 1 + sizeof (int));
    else
        return ((ssize_t)-1);
}

/*
 * field_convert_numeric_wide() is the wide-character counterpart of
 * field_convert_numeric().  data_offset, data_length and coll_offset are
 * counted in wide characters, while l_collate_bufsize is in bytes.
 *
 * The output at l_collate.wp + coll_offset consists of a sign character
 * (L'0' negative, L'1' zero, L'2' positive), the logarithm stored as
 * sizeof (int) big-endian bytes in the following element(s), and then the
 * significant digits; digits of negative numbers are complemented.
 *
 * Returns the number of wide characters produced, or -1 if the collation
 * buffer is too small.
 */
/*ARGSUSED*/
ssize_t
field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
    wchar_t *number;
    wchar_t *buffer = L->l_collate.wp + coll_offset;
    char *lbuffer;
    ssize_t length;

    wchar_t sign = L'2';
    int log_ten;
    wchar_t *digits = buffer + 1 + sizeof (int)/sizeof (wchar_t);
    size_t j = 0;
    size_t i;

    int state = BEFORE_NUMBER;

    number = L->l_data.wp + data_offset;
    length = data_length;

    /*
     * Eat leading blanks, if any.
     */
    for (i = 0; i < length; i++)
        if (!W_IS_BLANK(number[i]))
            break;

    /*
     * Test for sufficient space before any digit is written.  Both sides
     * are in bytes:  coll_offset is an element index and must be scaled,
     * as it is in the final test at the end of this function.
     */
    if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
        sizeof (int)) >
        (L->l_collate_bufsize - coll_offset * sizeof (wchar_t)))
        return ((ssize_t)-1);

    /*
     * If negative, set sign.  The bounds test keeps us from examining the
     * character that follows an empty or all-blank field.
     */
    if (i < length && number[i] == L'-') {
        i++;
        sign = L'0';
    }

    /*
     * Scan integer part; eat leading zeros.
     */
    for (; i < length; i++) {
        if (W_IS_SEPARATOR(number[i]))
            continue;

        if (number[i] == L'0' && !(state & IN_NUMBER))
            continue;

        if (!iswdigit(number[i]))
            break;

        state |= IN_NUMBER;
        if (sign == L'0')
            digits[j++] = L'0' + L'9' - number[i];
        else
            digits[j++] = number[i];
    }

    if (i < length && W_IS_DECIMAL(number[i])) {
        /*
         * Integer part terminated by decimal; scan fractional part.
         */
        digits[j] = W_DECIMAL_CHAR;
        log_ten = j++;

        for (++i; i < length; i++) {
            if (W_IS_SEPARATOR(number[i]))
                continue;

            if (!iswdigit(number[i]))
                break;

            if (number[i] != L'0')
                state |= IN_NUMBER;

            if (sign == L'0')
                digits[j++] = L'0' + L'9' - number[i];
            else
                digits[j++] = number[i];
        }

        if (sign == L'0')
            digits[j++] = (wchar_t)(WCHAR_MAX -
                W_INTERFIELD_SEPARATOR);
    } else {
        /*
         * Nondigit or end of string seen.
         */
        log_ten = (int)j;
        if (sign == L'0')
            digits[j++] = (wchar_t)(WCHAR_MAX -
                W_INTERFIELD_SEPARATOR);
        else
            digits[j] = W_INTERFIELD_SEPARATOR;
    }

    if ((state & IN_NUMBER) == 0) {
        /*
         * A non-zero number was not detected; treat as defined zero.
         */
        sign = L'1';
        log_ten = 0;
        digits[0] = L'0';
        j = 1;
    }

    /*
     * Bias the log of negative values so that they precede positive
     * values with a zero logarithm; -1 is the special case for -0.
     */
    if (sign == L'0') {
        if (j != 0)
            log_ten = -log_ten - 2;
        else
            log_ten = -1;
    }

    buffer[0] = sign;
    /*
     * Place logarithm in big-endian form.  The bytes are extracted from
     * the unsigned representation, since shifting a negative int left is
     * undefined.
     */
    lbuffer = (char *)(buffer + 1);
    for (i = 0; i < sizeof (int); i++)
        lbuffer[i] = (char)(((unsigned int)log_ten >>
            ((sizeof (int) - 1 - i) * NBBY)) & 0xff);

    if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
        L->l_collate_bufsize - coll_offset * sizeof (wchar_t))
        return (j + 1 + sizeof (int) / sizeof (wchar_t));
    else
        return ((ssize_t)-1);
}

/*
 * field_convert() builds the collation vector of the single-byte line record
 * L by delimiting and converting each field of the key chain F in turn.
 * Converted fields are separated by INTERFIELD_SEPARATOR; fields marked for
 * reverse comparison are inverted and given an inverted terminator.
 *
 * flags contains one of FCV_REALLOC, FCV_FAIL, specifying the preferred
 * behaviour when coll_offset exceeds l_collate_bufsize:  with FCV_REALLOC the
 * buffer is doubled and the conversion retried; otherwise -1 is returned.
 *
 * field_separator is the -t delimiter, or zero if fields are blank-separated.
 *
 * Returns the resulting l_collate_length, or -1 on failure.
 */
ssize_t
field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator)
{
    ssize_t coll_offset = 0;
    ssize_t start, end, distance;
    field_t *cur_fieldp = F;

    while (cur_fieldp != NULL) {
        /*
         * delimit field
         */
        if (!field_separator.sc)
            field_delimit(cur_fieldp, L, &start, &end);
        else
            field_delimit_tabbed(cur_fieldp, L, &start, &end,
                field_separator);

        /*
         * An empty field is converted only if it is numeric, in which
         * case it collates as zero.  The test must consult the field
         * being converted, not the head of the chain.
         */
        distance = 0;
        if (end - start > 0 ||
            (end - start == 0 && cur_fieldp->f_species == NUMERIC)) {
            /*
             * Convert field, appending to collated field of line
             * record.
             */
            distance = cur_fieldp->f_convert(cur_fieldp, L,
                field_separator, start, end - start, coll_offset);

            /*
             * branch should execute comparatively rarely
             */
            if (distance == -1) {
                if (flags & FCV_REALLOC) {
                    ASSERT(L->l_collate_bufsize > 0);
                    L->l_collate_bufsize *= 2;
                    L->l_collate.sp =
                        safe_realloc(L->l_collate.sp,
                        L->l_collate_bufsize);

                    __S(stats_incr_convert_reallocs());
                    continue;
                } else {
                    /*
                     * FCV_FAIL has been set.
                     */
                    return (-1);
                }
            }
        }

        if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
            /*
             * The inverted terminator is stored at coll_offset +
             * distance.  After an unconverted empty field or a
             * one-byte month token that position may lie at the
             * very end of the buffer, so make room before writing.
             */
            if (coll_offset + distance >= L->l_collate_bufsize) {
                if (!(flags & FCV_REALLOC))
                    return (-1);

                ASSERT(L->l_collate_bufsize > 0);
                L->l_collate_bufsize *= 2;
                L->l_collate.sp = safe_realloc(L->l_collate.sp,
                    L->l_collate_bufsize);

                __S(stats_incr_convert_reallocs());
            }

            xstrninv(L->l_collate.sp, coll_offset, distance);
            *(L->l_collate.sp + coll_offset + distance) =
                (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
            distance++;
        }

        ASSERT(distance >= 0);
        coll_offset += distance;
        if (coll_offset >= L->l_collate_bufsize) {
            if (flags & FCV_REALLOC) {
                ASSERT(L->l_collate_bufsize > 0);
                L->l_collate_bufsize *= 2;
                L->l_collate.sp = safe_realloc(L->l_collate.sp,
                    L->l_collate_bufsize);

                __S(stats_incr_convert_reallocs());
            } else {
                return (-1);
            }
        }
        *(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR;
        coll_offset++;

        cur_fieldp = cur_fieldp->f_next;
    }

    L->l_collate_length = coll_offset;

    return (L->l_collate_length);
}

/*
 * field_convert_wide() is the wide-character counterpart of field_convert():
 * it builds the collation vector of the wide line record L from the key
 * chain F.  coll_offset is counted in wide characters, while
 * l_collate_bufsize and the resulting l_collate_length are in bytes.
 *
 * flags contains one of FCV_REALLOC, FCV_FAIL:  with FCV_REALLOC the buffer
 * is doubled and the conversion retried when space runs out; otherwise -1 is
 * returned.  On little-endian systems the finished vector is passed through
 * xwcsntomsb() before returning.
 *
 * Returns the resulting l_collate_length, or -1 on failure.
 */
ssize_t
field_convert_wide(field_t *F, line_rec_t *L, int flags,
    vchar_t field_separator)
{
    ssize_t coll_offset = 0;
    ssize_t start, end, distance;
    field_t *cur_fieldp = F;

    while (cur_fieldp != NULL) {
        if (!field_separator.wc)
            field_delimit_wide(cur_fieldp, L, &start, &end);
        else
            field_delimit_tabbed_wide(cur_fieldp, L, &start, &end,
                field_separator);

        /*
         * An empty field is converted only if it is numeric, in which
         * case it collates as zero.  The test must consult the field
         * being converted, not the head of the chain.
         */
        distance = 0;
        if (end - start > 0 ||
            (end - start == 0 && cur_fieldp->f_species == NUMERIC)) {
            distance = cur_fieldp->f_convert(cur_fieldp, L,
                field_separator, start, end - start, coll_offset);

            if (distance == -1) {
                if (flags & FCV_REALLOC) {
                    ASSERT(L->l_collate_bufsize > 0);
                    L->l_collate_bufsize *= 2;
                    L->l_collate.wp = safe_realloc(
                        L->l_collate.wp,
                        L->l_collate_bufsize);

                    __S(stats_incr_convert_reallocs());
                    continue;
                } else {
                    return (-1);
                }
            }
        }

        if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
            /*
             * The inverted terminator is stored at element
             * coll_offset + distance.  After an unconverted empty
             * field or a one-element month token that element may
             * lie just past the buffer, so make room before
             * writing.
             */
            if ((coll_offset + distance) * sizeof (wchar_t) >=
                L->l_collate_bufsize) {
                if (!(flags & FCV_REALLOC))
                    return (-1);

                ASSERT(L->l_collate_bufsize > 0);
                L->l_collate_bufsize *= 2;
                L->l_collate.wp = safe_realloc(L->l_collate.wp,
                    L->l_collate_bufsize);

                __S(stats_incr_convert_reallocs());
            }

            xwcsninv(L->l_collate.wp, coll_offset, distance);
            *(L->l_collate.wp + coll_offset + distance) =
                WCHAR_MAX - W_INTERFIELD_SEPARATOR;
            distance++;
        }

        ASSERT(distance >= 0);
        coll_offset += distance;
        if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) {
            if (flags & FCV_REALLOC) {
                ASSERT(L->l_collate_bufsize > 0);
                L->l_collate_bufsize *= 2;
                L->l_collate.wp = safe_realloc(L->l_collate.wp,
                    L->l_collate_bufsize);

                __S(stats_incr_convert_reallocs());
            } else {
                return (-1);
            }
        }
        *(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR;
        coll_offset++;

        cur_fieldp = cur_fieldp->f_next;
    }

    L->l_collate_length = coll_offset * sizeof (wchar_t);
#ifdef _LITTLE_ENDIAN
    xwcsntomsb(L->l_collate.wp, coll_offset);
#endif /* _LITTLE_ENDIAN */

    return (L->l_collate_length);
}

/*
 * line_convert() and line_convert_wide() are called when the collation vector
 * of a given line has been exhausted, and we are performing the final,
 * full-line comparison required by the sort specification.  Because we do not
 * have a guarantee that l_data is null-terminated, we create an explicitly
 * null-terminated copy suitable for transformation to a collatable form for the
 * current locale.
 *
 * line_convert() is the single-byte variant.  It stores the transformed form
 * of L->l_data.sp in L->l_raw_collate.sp, and returns immediately if
 * l_raw_collate.sp is already set.  The null-terminated copy is kept in a
 * static scratch buffer which is grown on demand and reused by later calls.
 */
static void
line_convert(line_rec_t *L)
{
    static ssize_t bufsize;
    static char *buffer;

    size_t xfrm_size;

    /* A previous comparison has already transformed this line. */
    if (L->l_raw_collate.sp != NULL)
        return;

    /* Make room for the line's data plus the terminating null. */
    if (L->l_data_length + 1 > bufsize) {
        buffer = safe_realloc(buffer, L->l_data_length + 1);
        bufsize = L->l_data_length + 1;
    }

    (void) strncpy(buffer, L->l_data.sp, L->l_data_length);
    buffer[L->l_data_length] = '\0';

    /*
     * Ask for the transformed length once; the same size is used both to
     * allocate the destination and to bound the transformation.
     */
    xfrm_size = xfrm_ops->sx_len(buffer, L->l_data_length) + 1;

    L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp, xfrm_size);
    xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer, xfrm_size);

    __S(stats_incr_line_conversions());
}

/*
 * Wide-character counterpart of line_convert():  store the wcsxfrm()
 * transformation of the line's data in L->l_raw_collate.wp.  A line whose
 * l_raw_collate.wp is already set is left untouched.
 */
static void
line_convert_wide(line_rec_t *L)
{
    /* Scratch copy of the line; its capacity is counted in wide characters. */
    static ssize_t scratch_len;
    static wchar_t *scratch;

    ssize_t xfrm_len;

    if (L->l_raw_collate.wp == NULL) {
        /* Grow the scratch buffer to fit the data plus a terminator. */
        if (scratch_len < L->l_data_length + 1) {
            scratch = safe_realloc(scratch,
                (L->l_data_length + 1) * sizeof (wchar_t));
            scratch_len = L->l_data_length + 1;
        }

        /* l_data may lack a terminator, so add one to the copy. */
        (void) wcsncpy(scratch, L->l_data.wp, L->l_data_length);
        scratch[L->l_data_length] = L'\0';

        /*
         * First call only measures the transformed string; the second
         * call writes it into the freshly sized destination.
         */
        xfrm_len = wcsxfrm(NULL, scratch, 0) + 1;
        L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp,
            xfrm_len * sizeof (wchar_t));
        (void) wcsxfrm(L->l_raw_collate.wp, scratch, xfrm_len);

        __S(stats_incr_line_conversions());
    }
}

/*
 * Our convention for collation is
 *
 *  A > B  => r > 0,
 *  A == B => r = 0,
 *  A < B  => r < 0
 *
 * This convention is consistent with the definition of memcmp(), strcmp(), and
 * strncmp() in the C locale.  collated() and collated_wide() have three
 * optional behaviours, which can be activated by setting the appropriate
 * values in coll_flag:  COLL_UNIQUE, which returns 0 if the l_collate fields
 * of the line records being compared are identical; COLL_DATA_ONLY, which
 * ignores the l_collate field for the current comparison; and COLL_REVERSE,
 * which flips the result for comparisons that fall through to an actual data
 * comparison (since the collated vector should already reflect reverse
 * ordering from field conversion).
 */
int
collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
{
    /* XORed into results of the raw-data comparison only. */
    const int flip = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
        INT_SIGN_PASS_MASK;
    ssize_t raw_a, raw_b;
    int cmp;

    /*
     * First pass:  the collation vectors, starting depth bytes in, unless
     * the caller asked for a data-only comparison.
     */
    if ((coll_flag & COLL_DATA_ONLY) == 0) {
        const ssize_t vec_a = A->l_collate_length;
        const ssize_t vec_b = B->l_collate_length;
        const ssize_t span = (vec_a < vec_b ? vec_a : vec_b) - depth;

        if (span > 0) {
            cmp = memcmp(A->l_collate.sp + depth,
                B->l_collate.sp + depth, span);
            if (cmp != 0)
                return (cmp);
        }

        /* Shared portion is equal, so the shorter vector sorts first. */
        if (vec_a != vec_b)
            return (vec_a < vec_b ? -1 : 1);
    }

    /*
     * With COLL_UNIQUE set, lines that reach this point are reported as
     * equal without examining the raw data.
     */
    if (coll_flag & COLL_UNIQUE)
        return (0);

    /* Second pass:  compare the transformed form of the whole line. */
    line_convert(A);
    line_convert(B);

    raw_a = strlen(A->l_raw_collate.sp);
    raw_b = strlen(B->l_raw_collate.sp);

    cmp = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp,
        raw_a < raw_b ? raw_a : raw_b);

    if (cmp == 0) {
        if (raw_a == raw_b)
            return (0);

        /* One string is a prefix of the other; the prefix sorts first. */
        cmp = (raw_a < raw_b) ? -1 : 1;
    }

    return (cmp ^ flip);
}

/*
 * Wide-character variant of collated().  The collation vectors are still
 * compared bytewise with memcmp(); only the fall-through comparison of the
 * transformed line data operates on wide characters.
 */
int
collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
{
    /* XORed into results of the raw-data comparison only. */
    const int flip = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
        INT_SIGN_PASS_MASK;
    ssize_t raw_a, raw_b;
    int cmp;

    /* Collation vectors first, unless a data-only comparison was asked for. */
    if ((coll_flag & COLL_DATA_ONLY) == 0) {
        const ssize_t vec_a = A->l_collate_length;
        const ssize_t vec_b = B->l_collate_length;
        const ssize_t span = (vec_a < vec_b ? vec_a : vec_b) - depth;

        if (span > 0) {
            cmp = memcmp(A->l_collate.sp + depth,
                B->l_collate.sp + depth, span);
            if (cmp != 0)
                return (cmp);
        }

        /* Shared portion is equal, so the shorter vector sorts first. */
        if (vec_a != vec_b)
            return (vec_a < vec_b ? -1 : 1);
    }

    /* COLL_UNIQUE stops here and reports the lines as equal. */
    if (coll_flag & COLL_UNIQUE)
        return (0);

    /* Otherwise compare the transformed form of the whole line. */
    line_convert_wide(A);
    line_convert_wide(B);

    raw_a = wcslen(A->l_raw_collate.wp);
    raw_b = wcslen(B->l_raw_collate.wp);

    cmp = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp,
        (size_t)(raw_a < raw_b ? raw_a : raw_b));

    if (cmp == 0) {
        if (raw_a == raw_b)
            return (0);

        /* One string is a prefix of the other; the prefix sorts first. */
        cmp = (raw_a < raw_b) ? -1 : 1;
    }

    return (cmp ^ flip);
}