modules/metadata/mod_unique_id.c

/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * mod_unique_id.c: generate a unique identifier for each request
 *
 * Original author: Dean Gaudet <dgaudet@arctic.org>
 * UUencoding modified by: Alvaro Martinez Echevarria <alvaro@lander.es>
 */

#define APR_WANT_BYTEFUNC   /* for htons() et al */
#include "apr_want.h"
#include "apr_general.h"    /* for APR_OFFSETOF                */
#include "apr_network_io.h"

#include "httpd.h"
#include "http_config.h"
#include "http_log.h"
#include "http_protocol.h"  /* for ap_hook_post_read_request */

#define ROOT_SIZE 10

typedef struct {
    unsigned int stamp;
    char root[ROOT_SIZE];
    unsigned short counter;
    unsigned int thread_index;
} unique_id_rec;

/* We are using thread_index (the index into the scoreboard), because we
 * cannot guarantee the thread_id will be an integer.
 *
 * This code looks like it won't give a unique ID with the new thread logic.
 * It will.  The reason is, we don't increment the counter in a thread_safe
 * manner.  Because the thread_index is also in the unique ID now, this does
 * not matter.  In order for the id to not be unique, the same thread would
 * have to get the same counter twice in the same second.
 */

/* Comments:
 *
 * We want an identifier which is unique across all hits, everywhere.
 * "everywhere" includes multiple httpd instances on the same machine, or on
 * multiple machines.  Essentially "everywhere" should include all possible
 * httpds across all servers at a particular "site".  We make some assumptions
 * that if the site has a cluster of machines then their time is relatively
 * synchronized.  We also assume that the first address returned by a
 * gethostbyname (gethostname()) is unique across all the machines at the
 * "site".
 *
 * The root is assumed to absolutely uniquely identify this one child
 * from all other currently running children on all servers (including
 * this physical server if it is running multiple httpds) from each
 * other.
 *
 * The stamp and counter are used to distinguish all hits for a
 * particular root.  The stamp is updated using r->request_time,
 * saving cpu cycles.  The counter is never reset, and is used to
 * permit up to 64k requests in a single second by a single child.
 *
 * The 144-bits of unique_id_rec are encoded using the alphabet
 * [A-Za-z0-9@-], resulting in 24 bytes of printable characters.  That is then
 * stuffed into the environment variable UNIQUE_ID so that it is available to
 * other modules.  The alphabet choice differs from normal base64 encoding
 * [A-Za-z0-9+/] because + and / are special characters in URLs and we want to
 * make it easy to use UNIQUE_ID in URLs.
 *
 * Note that UNIQUE_ID should be considered an opaque token by other
 * applications.  No attempt should be made to dissect its internal components.
 * It is an abstraction that may change in the future as the needs of this
 * module change.
 *
 * It is highly desirable that identifiers exist for "eternity".  But future
 * needs (such as much faster webservers, or moving to a
 * multithreaded server) may dictate a need to change the contents of
 * unique_id_rec.  Such a future implementation should ensure that the first
 * field is still a time_t stamp.  By doing that, it is possible for a site to
 * have a "flag second" in which they stop all of their old-format servers,
 * wait one entire second, and then start all of their new-servers.  This
 * procedure will ensure that the new space of identifiers is completely unique
 * from the old space.  (Since the first four unencoded bytes always differ.)
 *
 * Note: previous implementations used 32-bits of IP address plus pid
 * in place of the PRNG output in the "root" field.  This was
 * insufficient for IPv6-only hosts, required working DNS to determine
 * a unique IP address (fragile), and needed a [0, 1) second sleep
 * call at startup to avoid pid reuse.  Use of the PRNG avoids all
 * these issues.
 */

/*
 * Sun Jun  7 05:43:49 CEST 1998 -- Alvaro
 * More comments:
 * 1) The UUencoding prodecure is now done in a general way, avoiding the problems
 * with sizes and paddings that can arise depending on the architecture. Now the
 * offsets and sizes of the elements of the unique_id_rec structure are calculated
 * in unique_id_global_init; and then used to duplicate the structure without the
 * paddings that might exist. The multithreaded server fix should be now very easy:
 * just add a new "tid" field to the unique_id_rec structure, and increase by one
 * UNIQUE_ID_REC_MAX.
 * 2) unique_id_rec.stamp has been changed from "time_t" to "unsigned int", because
 * its size is 64bits on some platforms (linux/alpha), and this caused problems with
 * htonl/ntohl. Well, this shouldn't be a problem till year 2106.
 */

/*
 * XXX: We should have a per-thread counter and not use cur_unique_id.counter
 * XXX: in all threads, because this is bad for performance on multi-processor
 * XXX: systems: Writing to the same address from several CPUs causes cache
 * XXX: thrashing.
 */
static unique_id_rec cur_unique_id;

/*
 * Number of elements in the structure unique_id_rec.
 */
#define UNIQUE_ID_REC_MAX 4

static unsigned short unique_id_rec_offset[UNIQUE_ID_REC_MAX],
                      unique_id_rec_size[UNIQUE_ID_REC_MAX],
                      unique_id_rec_total_size,
                      unique_id_rec_size_uu;

static int unique_id_global_init(apr_pool_t *p, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *main_server)
{
    /*
     * Calculate the sizes and offsets in cur_unique_id.
     */
    unique_id_rec_offset[0] = APR_OFFSETOF(unique_id_rec, stamp);
    unique_id_rec_size[0] = sizeof(cur_unique_id.stamp);
    unique_id_rec_offset[1] = APR_OFFSETOF(unique_id_rec, root);
    unique_id_rec_size[1] = sizeof(cur_unique_id.root);
    unique_id_rec_offset[2] = APR_OFFSETOF(unique_id_rec, counter);
    unique_id_rec_size[2] = sizeof(cur_unique_id.counter);
    unique_id_rec_offset[3] = APR_OFFSETOF(unique_id_rec, thread_index);
    unique_id_rec_size[3] = sizeof(cur_unique_id.thread_index);
    unique_id_rec_total_size = unique_id_rec_size[0] + unique_id_rec_size[1] +
                               unique_id_rec_size[2] + unique_id_rec_size[3];

    /*
     * Calculate the size of the structure when encoded.
     */
    unique_id_rec_size_uu = (unique_id_rec_total_size*8+5)/6;

    return OK;
}

static void unique_id_child_init(apr_pool_t *p, server_rec *s)
{
    ap_random_insecure_bytes(&cur_unique_id.root,
                             sizeof(cur_unique_id.root));

    /*
     * If we use 0 as the initial counter we have a little less protection
     * against restart problems, and a little less protection against a clock
     * going backwards in time.
     */
    ap_random_insecure_bytes(&cur_unique_id.counter,
                             sizeof(cur_unique_id.counter));
}

/* NOTE: This is *NOT* the same encoding used by base64encode ... the last two
 * characters should be + and /.  But those two characters have very special
 * meanings in URLs, and we want to make it easy to use identifiers in
 * URLs.  So we replace them with @ and -.
 */
static const char uuencoder[64] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '@', '-',
};

static const char *gen_unique_id(const request_rec *r)
{
    char *str;
    /*
     * Buffer padded with two final bytes, used to copy the unique_id_red
     * structure without the internal paddings that it could have.
     */
    unique_id_rec new_unique_id;
    struct {
        unique_id_rec foo;
        unsigned char pad[2];
    } paddedbuf;
    unsigned char *x,*y;
    unsigned short counter;
    int i,j,k;

    memcpy(&new_unique_id.root, &cur_unique_id.root, ROOT_SIZE);
    new_unique_id.counter = cur_unique_id.counter;
    new_unique_id.stamp = htonl((unsigned int)apr_time_sec(r->request_time));
    new_unique_id.thread_index = htonl((unsigned int)r->connection->id);

    /* we'll use a temporal buffer to avoid uuencoding the possible internal
     * paddings of the original structure */
    x = (unsigned char *) &paddedbuf;
    k = 0;
    for (i = 0; i < UNIQUE_ID_REC_MAX; i++) {
        y = ((unsigned char *) &new_unique_id) + unique_id_rec_offset[i];
        for (j = 0; j < unique_id_rec_size[i]; j++, k++) {
            x[k] = y[j];
        }
    }
    /*
     * We reset two more bytes just in case padding is needed for the uuencoding.
     */
    x[k++] = '\0';
    x[k++] = '\0';

    /* alloc str and do the uuencoding */
    str = (char *)apr_palloc(r->pool, unique_id_rec_size_uu + 1);
    k = 0;
    for (i = 0; i < unique_id_rec_total_size; i += 3) {
        y = x + i;
        str[k++] = uuencoder[y[0] >> 2];
        str[k++] = uuencoder[((y[0] & 0x03) << 4) | ((y[1] & 0xf0) >> 4)];
        if (k == unique_id_rec_size_uu) break;
        str[k++] = uuencoder[((y[1] & 0x0f) << 2) | ((y[2] & 0xc0) >> 6)];
        if (k == unique_id_rec_size_uu) break;
        str[k++] = uuencoder[y[2] & 0x3f];
    }
    str[k++] = '\0';

    /* and increment the identifier for the next call */

    counter = ntohs(new_unique_id.counter) + 1;
    cur_unique_id.counter = htons(counter);

    return str;
}

/*
 * There are two ways the generation of a unique id can be triggered:
 *
 * - from the post_read_request hook which calls set_unique_id()
 * - from error logging via the generate_log_id hook which calls
 *   generate_log_id(). This may happen before or after set_unique_id()
 *   has been called, or not at all.
 */

static int generate_log_id(const conn_rec *c, const request_rec *r,
                           const char **id)
{
    /* we do not care about connection ids */
    if (r == NULL)
        return DECLINED;

    /* XXX: do we need special handling for internal redirects? */

    /* if set_unique_id() has been called for this request, use it */
    *id = apr_table_get(r->subprocess_env, "UNIQUE_ID");

    if (!*id)
        *id = gen_unique_id(r);
    return OK;
}

static int set_unique_id(request_rec *r)
{
    const char *id = NULL;
    /* copy the unique_id if this is an internal redirect (we're never
     * actually called for sub requests, so we don't need to test for
     * them) */
    if (r->prev) {
       id = apr_table_get(r->subprocess_env, "REDIRECT_UNIQUE_ID");
    }

    if (!id) {
        /* if we have a log id, it was set by our generate_log_id() function
         * and we should reuse the same id
         */
        id = r->log_id;
    }

    if (!id) {
        id = gen_unique_id(r);
    }

    /* set the environment variable */
    apr_table_setn(r->subprocess_env, "UNIQUE_ID", id);

    return DECLINED;
}

static void register_hooks(apr_pool_t *p)
{
    ap_hook_post_config(unique_id_global_init, NULL, NULL, APR_HOOK_MIDDLE);
    ap_hook_child_init(unique_id_child_init, NULL, NULL, APR_HOOK_MIDDLE);
    ap_hook_post_read_request(set_unique_id, NULL, NULL, APR_HOOK_MIDDLE);
    ap_hook_generate_log_id(generate_log_id, NULL, NULL, APR_HOOK_MIDDLE);
}

AP_DECLARE_MODULE(unique_id) = {
    STANDARD20_MODULE_STUFF,
    NULL,                       /* dir config creater */
    NULL,                       /* dir merger --- default is to override */
    NULL,                       /* server config */
    NULL,                       /* merge server configs */
    NULL,                       /* command apr_table_t */
    register_hooks              /* register hooks */
};