/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <_string_table.h>
#include <strings.h>
#include <sgs.h>
#include <stdio.h>
/*
* This file provides the interfaces to build a Str_tbl suitable for use by
* either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
* as created by ld(1).
*
* There are two modes which can be used when constructing a string table:
*
* st_new(0)
* standard string table - no compression. This is the
* traditional, fast method.
*
* st_new(FLG_STTAB_COMPRESS)
* builds a compressed string table which both eliminates
* duplicate strings, and permits strings with common suffixes
* (atexit vs. exit) to overlap in the table. This provides space
* savings for many string tables. Although more work than the
* traditional method, the algorithms used are designed to scale
* and keep any overhead at a minimum.
*
* These string tables are built with a common interface in a two-pass manner.
* The first pass finds all of the strings required for the string-table and
* calculates the size required for the final string table.
*
* The second pass allocates the string table, populates the strings into the
* table and returns the offsets the strings have been assigned.
*
* The calling sequence to build and populate a string table is:
*
* st_new(); // initialize strtab
*
* st_insert(st1); // first pass of strings ...
* // calculates size required for
* // string table
*
* st_delstring(st?); // remove string previously
* // inserted
* st_insert(stN);
*
* st_getstrtab_sz(); // freezes strtab and computes
* // size of table.
*
* st_setstrbuf(); // associates a final destination
* // for the string table
*
* st_setstring(st1); // populate the string table
* ... // offsets are based off of second
* // pass through the string table
* st_setstring(stN);
*
* st_destroy(); // tear down string table
* // structures.
*
* String Suffix Compression Algorithm:
*
* Here's a quick high level overview of the Suffix String
* compression algorithm used. First - the heart of the algorithm
* is a Hash table list which represents a dictionary of all unique
* strings inserted into the string table. The hash function for
* this table is a standard string hash except that the hash starts
* at the last character in the string (&str[n - 1]) and works towards
* the first character in the function (&str[0]). As we compute the
* HASH value for a given string, we also compute the hash values
* for all of the possible suffix strings for that string.
*
* As we compute the hash - at each character see if the current
* suffix string for that hash is already present in the table. If
* it is, and the string is a master string. Then change that
* string to a suffix string of the new string being inserted.
*
* When the final hash value is found (hash for str[0...n]), check
* to see if it is in the hash table - if so increment the reference
* count for the string. If it is not yet in the table, insert a
* new hash table entry for a master string.
*
* The above method will find all suffixes of a given string given
* that the strings are inserted from shortest to longest. That is
* why this is a two phase method, we first collect all of the
* strings and store them based off of their length in an AVL tree.
* Once all of the strings have been submitted we then start the
* hash table build by traversing the AVL tree in order and
* inserting the strings from shortest to longest as described
* above.
*/
/* LINTLIBRARY */
static int
avl_len_compare(const void *n1, const void *n2)
{
size_t len1, len2;
len1 = ((LenNode *)n1)->ln_strlen;
len2 = ((LenNode *)n2)->ln_strlen;
if (len1 == len2)
return (0);
if (len2 < len1)
return (1);
return (-1);
}
static int
avl_str_compare(const void *n1, const void *n2)
{
const char *str1, *str2;
int rc;
str1 = ((StrNode *)n1)->sn_str;
str2 = ((StrNode *)n2)->sn_str;
rc = strcmp(str1, str2);
if (rc > 0)
return (1);
if (rc < 0)
return (-1);
return (0);
}
/*
* Return an initialized Str_tbl - returns NULL on failure.
*
* flags:
* FLG_STTAB_COMPRESS - build a compressed string table
*/
Str_tbl *
st_new(uint_t flags)
{
Str_tbl *stp;
if ((stp = calloc(sizeof (*stp), 1)) == NULL)
return (NULL);
/*
* Start with a leading '\0' - it's tradition.
*/
stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
/*
* Do we compress this string table?
*/
stp->st_flags = flags;
if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
return (stp);
if ((stp->st_lentree = calloc(sizeof (*stp->st_lentree), 1)) == NULL)
return (NULL);
avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
SGSOFFSETOF(LenNode, ln_avlnode));
return (stp);
}
/*
* Insert a new string into the Str_tbl. There are two AVL trees used.
*
* - The first LenNode AVL tree maintains a tree of nodes based on string
* sizes.
* - Each LenNode maintains a StrNode AVL tree for each string. Large
* applications have been known to contribute thousands of strings of
* the same size. Should strings need to be removed (-z ignore), then
* the string AVL tree makes this removal efficient and scalable.
*/
int
st_insert(Str_tbl *stp, const char *str)
{
size_t len;
StrNode *snp, sn = { 0 };
LenNode *lnp, ln = { 0 };
avl_index_t where;
/*
* String table can't have been cooked
*/
assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
/*
* Null strings always point to the head of the string
* table - no reason to keep searching.
*/
if ((len = strlen(str)) == 0)
return (0);
stp->st_fullstrsize += len + 1;
stp->st_strcnt++;
if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
return (0);
/*
* From the controlling string table, determine which LenNode AVL node
* provides for this string length. If the node doesn't exist, insert
* a new node to represent this string length.
*/
ln.ln_strlen = len;
if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
if ((lnp = calloc(sizeof (*lnp), 1)) == NULL)
return (-1);
lnp->ln_strlen = len;
avl_insert(stp->st_lentree, lnp, where);
if ((lnp->ln_strtree = calloc(sizeof (*lnp->ln_strtree), 1)) ==
NULL)
return (0);
avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
SGSOFFSETOF(StrNode, sn_avlnode));
}
/*
* From the string length AVL node determine whether a StrNode AVL node
* provides this string. If the node doesn't exist, insert a new node
* to represent this string.
*/
sn.sn_str = str;
if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
if ((snp = calloc(sizeof (*snp), 1)) == NULL)
return (-1);
snp->sn_str = str;
avl_insert(lnp->ln_strtree, snp, where);
}
snp->sn_refcnt++;
return (0);
}
/*
* Remove a previously inserted string from the Str_tbl.
*/
int
st_delstring(Str_tbl *stp, const char *str)
{
size_t len;
LenNode *lnp, ln = { 0 };
StrNode *snp, sn = { 0 };
/*
* String table can't have been cooked
*/
assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
len = strlen(str);
stp->st_fullstrsize -= len + 1;
if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
return (0);
/*
* Determine which LenNode AVL node provides for this string length.
*/
ln.ln_strlen = len;
if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
sn.sn_str = str;
if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
/*
* Reduce the reference count, and if zero remove the
* node.
*/
if (--snp->sn_refcnt == 0)
avl_remove(lnp->ln_strtree, snp);
return (0);
}
}
/*
* No strings of this length, or no string itself - someone goofed.
*/
return (-1);
}
/*
* Tear down a String_Table structure.
*/
void
st_destroy(Str_tbl *stp)
{
Str_hash *sthash, *psthash;
Str_master *mstr, *pmstr;
uint_t i;
/*
* cleanup the master strings
*/
for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
mstr = mstr->sm_next) {
if (pmstr)
free(pmstr);
pmstr = mstr;
}
if (pmstr)
free(pmstr);
if (stp->st_hashbcks) {
for (i = 0; i < stp->st_hbckcnt; i++) {
for (sthash = stp->st_hashbcks[i], psthash = 0;
sthash; sthash = sthash->hi_next) {
if (psthash)
free(psthash);
psthash = sthash;
}
if (psthash)
free(psthash);
}
free(stp->st_hashbcks);
}
free(stp);
}
/*
* For a given string - copy it into the buffer associated with
* the string table - and return the offset it has been assigned.
*
* If a value of '-1' is returned - the string was not found in
* the Str_tbl.
*/
int
st_setstring(Str_tbl *stp, const char *str, size_t *stoff)
{
size_t stlen;
uint_t hashval;
Str_hash *sthash;
Str_master *mstr;
int i;
/*
* String table *must* have been previously cooked
*/
assert(stp->st_strbuf);
assert(stp->st_flags & FLG_STTAB_COOKED);
stlen = strlen(str);
/*
* Null string always points to head of string table
*/
if (stlen == 0) {
*stoff = 0;
return (0);
}
if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
size_t _stoff;
stlen++; /* count for trailing '\0' */
_stoff = stp->st_nextoff;
/*
* Have we overflowed our assigned buffer?
*/
if ((_stoff + stlen) > stp->st_fullstrsize)
return (-1);
memcpy(stp->st_strbuf + _stoff, str, stlen);
*stoff = _stoff;
stp->st_nextoff += stlen;
return (0);
}
/*
* Calculate reverse hash for string.
*/
hashval = HASHSEED;
for (i = stlen; i >= 0; i--) {
hashval = ((hashval << 5) + hashval) +
str[i]; /* h = ((h * 33) + c) */
}
for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
sthash = sthash->hi_next) {
const char *hstr;
if (sthash->hi_hashval != hashval)
continue;
hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
sthash->hi_strlen];
if (strcmp(str, hstr) == 0)
break;
}
/*
* Did we find the string?
*/
if (sthash == 0)
return (-1);
/*
* Has this string been copied into the string table?
*/
mstr = sthash->hi_mstr;
if (mstr->sm_stroff == 0) {
size_t mstrlen = mstr->sm_strlen + 1;
mstr->sm_stroff = stp->st_nextoff;
/*
* Have we overflowed our assigned buffer?
*/
if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
return (-1);
(void) memcpy(stp->st_strbuf + mstr->sm_stroff,
mstr->sm_str, mstrlen);
stp->st_nextoff += mstrlen;
}
/*
* Calculate offset of (sub)string.
*/
*stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
return (0);
}
static int
st_hash_insert(Str_tbl *stp, const char *str, size_t len)
{
int i;
uint_t hashval = HASHSEED;
uint_t bckcnt = stp->st_hbckcnt;
Str_hash **hashbcks = stp->st_hashbcks;
Str_hash *sthash;
Str_master *mstr = 0;
/*
* We use a classic 'Bernstein k=33' hash function. But
* instead of hashing from the start of the string to the
* end, we do it in reverse.
*
* This way - we are essentially building all of the
* suffix hashvalues as we go. We can check to see if
* any suffixes already exist in the tree as we generate
* the hash.
*/
for (i = len; i >= 0; i--) {
hashval = ((hashval << 5) + hashval) +
str[i]; /* h = ((h * 33) + c) */
for (sthash = hashbcks[hashval % bckcnt];
sthash; sthash = sthash->hi_next) {
const char *hstr;
Str_master *_mstr;
if (sthash->hi_hashval != hashval)
continue;
_mstr = sthash->hi_mstr;
hstr = &_mstr->sm_str[_mstr->sm_strlen -
sthash->hi_strlen];
if (strcmp(&str[i], hstr))
continue;
if (i == 0) {
/*
* Entry already in table, increment refcnt and
* get out.
*/
sthash->hi_refcnt++;
return (0);
} else {
/*
* If this 'suffix' is presently a 'master
* string, then take over it's record.
*/
if (sthash->hi_strlen == _mstr->sm_strlen) {
/*
* we should only do this once.
*/
assert(mstr == 0);
mstr = _mstr;
}
}
}
}
/*
* Do we need a new master string, or can we take over
* one we already found in the table?
*/
if (mstr == 0) {
/*
* allocate a new master string
*/
if ((mstr = calloc(sizeof (*mstr), 1)) == 0)
return (-1);
mstr->sm_next = stp->st_mstrlist;
stp->st_mstrlist = mstr;
stp->st_strsize += len + 1;
} else {
/*
* We are taking over a existing master string, the string size
* only increments by the difference between the current string
* and the previous master.
*/
assert(len > mstr->sm_strlen);
stp->st_strsize += len - mstr->sm_strlen;
}
if ((sthash = calloc(sizeof (*sthash), 1)) == 0)
return (-1);
mstr->sm_hashval = sthash->hi_hashval = hashval;
mstr->sm_strlen = sthash->hi_strlen = len;
mstr->sm_str = str;
sthash->hi_refcnt = 1;
sthash->hi_mstr = mstr;
/*
* Insert string element into head of hash list
*/
hashval = hashval % bckcnt;
sthash->hi_next = hashbcks[hashval];
hashbcks[hashval] = sthash;
return (0);
}
/*
* Return amount of space required for the string table.
*/
size_t
st_getstrtab_sz(Str_tbl *stp)
{
assert(stp->st_fullstrsize > 0);
if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
stp->st_flags |= FLG_STTAB_COOKED;
return (stp->st_fullstrsize);
}
if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
LenNode *lnp;
void *cookie;
stp->st_flags |= FLG_STTAB_COOKED;
/*
* allocate a hash table about the size of # of
* strings input.
*/
stp->st_hbckcnt = findprime(stp->st_strcnt);
if ((stp->st_hashbcks = calloc(sizeof (*stp->st_hashbcks),
stp->st_hbckcnt)) == NULL)
return (0);
/*
* We now walk all of the strings in the list, from shortest to
* longest, and insert them into the hashtable.
*/
if ((lnp = avl_first(stp->st_lentree)) == NULL) {
/*
* Is it possible we have an empty string table, if so,
* the table still contains '\0', so return the size.
*/
if (avl_numnodes(stp->st_lentree) == 0) {
assert(stp->st_strsize == 1);
return (stp->st_strsize);
}
return (0);
}
while (lnp) {
StrNode *snp;
/*
* Walk the string lists and insert them into the hash
* list. Once a string is inserted we no longer need
* it's entry, so the string can be freed.
*/
for (snp = avl_first(lnp->ln_strtree); snp;
snp = AVL_NEXT(lnp->ln_strtree, snp)) {
if (st_hash_insert(stp, snp->sn_str,
lnp->ln_strlen) == -1)
return (0);
}
/*
* Now that the strings have been copied, walk the
* StrNode tree and free all the AVL nodes. Note,
* avl_destroy_nodes() beats avl_remove() as the
* latter balances the nodes as they are removed.
* We just want to tear the whole thing down fast.
*/
cookie = NULL;
while ((snp = avl_destroy_nodes(lnp->ln_strtree,
&cookie)) != NULL)
free(snp);
avl_destroy(lnp->ln_strtree);
free(lnp->ln_strtree);
lnp->ln_strtree = NULL;
/*
* Move on to the next LenNode.
*/
lnp = AVL_NEXT(stp->st_lentree, lnp);
}
/*
* Now that all of the strings have been freed, walk the
* LenNode tree and free all of the AVL nodes. Note,
* avl_destroy_nodes() beats avl_remove() as the latter
* balances the nodes as they are removed. We just want to
* tear the whole thing down fast.
*/
cookie = NULL;
while ((lnp = avl_destroy_nodes(stp->st_lentree,
&cookie)) != NULL)
free(lnp);
avl_destroy(stp->st_lentree);
free(stp->st_lentree);
stp->st_lentree = 0;
}
assert(stp->st_strsize > 0);
assert(stp->st_fullstrsize >= stp->st_strsize);
return (stp->st_strsize);
}
/*
* Associate a buffer with a string table.
*/
const char *
st_getstrbuf(Str_tbl *stp)
{
return (stp->st_strbuf);
}
int
st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize)
{
assert(stp->st_flags & FLG_STTAB_COOKED);
if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
if (bufsize < stp->st_fullstrsize)
return (-1);
} else {
if (bufsize < stp->st_strsize)
return (-1);
}
stp->st_strbuf = stbuf;
#ifdef DEBUG
/*
* for debug builds - start with a stringtable filled in
* with '0xff'. This makes it very easy to spot unfilled
* holes in the strtab.
*/
memset(stbuf, 0xff, bufsize);
stbuf[0] = '\0';
#else
memset(stbuf, 0x0, bufsize);
#endif
return (0);
}