mod_charset_lite.c revision d68fa89456ac940c29b795e5fe13ecf76e05cbfb
/* Copyright 2000-2006 The Apache Software Foundation or its licensors, as
* applicable.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* simple hokey charset recoding configuration module
*
* See mod_ebcdic and mod_charset for more thought-out examples. This
* one is just so Jeff can learn how a module works and experiment with
* basic character set recoding configuration.
*
* !!!This is an extremely cheap ripoff of mod_charset.c from Russian Apache!!!
*/
#include "httpd.h"
#include "http_config.h"
#define CORE_PRIVATE
#include "http_core.h"
#include "http_log.h"
#include "http_main.h"
#include "http_protocol.h"
#include "http_request.h"
#include "util_charset.h"
#include "apr_buckets.h"
#include "util_filter.h"
#include "apr_strings.h"
#include "apr_lib.h"
#include "apr_xlate.h"
#define APR_WANT_STRFUNC
#include "apr_want.h"
* space left in the translation buffer
*/
* two buckets
*/
/* Extended error status codes; these are used in addition to an apr_status_t
 * to track errors in the translation filter.
 *
 * EES_INIT must stay 0 so that zero-initialized storage starts out with
 * "no error info yet".
 *
 * EES_BUCKET_READ is referenced by the error-reporting switch in this file
 * ("xlate filter - bucket read routine failed"); it is appended at the end
 * so that the numeric values of the pre-existing codes do not change.
 */
typedef enum {
    EES_INIT = 0,         /* no error info yet; value must be 0 for easy init */
    EES_LIMIT,            /* built-in restriction encountered */
    EES_INCOMPLETE_CHAR,  /* incomplete multi-byte char at end of content */
    EES_DOWNSTREAM,       /* something bad happened in a filter below xlate */
    EES_BAD_INPUT,        /* input data invalid */
    EES_BUCKET_READ       /* bucket read routine failed */
} ees_t;
/* Name under which the output translation filter is registered. */
#define XLATEOUT_FILTER_NAME "XLATEOUT"
/* Name under which the input translation filter is registered. */
#define XLATEIN_FILTER_NAME "XLATEIN"
typedef struct charset_dir_t {
/** debug level; -1 means uninitialized, 0 means no debug */
int debug;
const char *charset_source; /* source encoding */
const char *charset_default; /* how to ship on wire */
/** module does ap_add_*_filter()? */
/* charset_filter_ctx_t is created for each filter instance; because the same
* filter code is used for translating in both directions, we need this context
* data to tell the filter which translation handle to use; it also can hold a
* character which was split between buckets
*/
typedef struct charset_filter_ctx_t {
int is_sb; /* single-byte translation? */
int ran; /* has filter instance run before? */
int noop; /* should we pass brigades through unchanged? */
char *tmp; /* buffer for input filtering */
/* charset_req_t is available via r->request_config if any translation is
* being performed
*/
typedef struct charset_req_t {
/* debug level definitions */
* each request */
{
return dc;
}
{
/* If it is defined in the current container, use it. Otherwise, use the one
* from the enclosing container.
*/
a->debug =
a->charset_default =
a->charset_source =
a->implicit_add =
return a;
}
/* CharsetSourceEnc charset
*/
const char *name)
{
return NULL;
}
/* CharsetDefault charset
*/
const char *name)
{
return NULL;
}
/* CharsetOptions optionflag...
*/
const char *flag)
{
}
}
}
else {
"Invalid CharsetOptions option: ",
flag,
NULL);
}
return NULL;
}
/* find_code_page() is a fixup hook that decides if translation should be
* enabled; if so, it sets up request data for use by the filter registration
* hook so that it knows what to do
*
* r: the request being processed
*
* Returns DECLINED on the ordinary paths visible here;
* HTTP_INTERNAL_SERVER_ERROR is returned when the input translation for a
* PUT or POST request cannot be opened (rv != APR_SUCCESS).
*/
static int find_code_page(request_rec *r)
{
const char *mime_type;
"uri: %s file: %s method: %d "
"imt: %s flags: %s%s%s %s->%s",
}
/* If we don't have a full directory configuration, bail out.
*/
"incomplete configuration: src %s, dst %s",
}
/* an incomplete configuration is not treated as a request error */
return DECLINED;
}
/* catch proxy requests */
/* mod_rewrite indicators */
/* If mime type isn't text or message, bail out.
*/
/* XXX When we handle translation of the request body, watch out here as
* 1.3 allowed additional mime types: multipart and
* application/x-www-form-urlencoded
*/
/* On an EBCDIC machine, be willing to translate mod_autoindex-
* generated output. Otherwise, it doesn't look too cool.
*
* XXX This isn't a perfect fix because this doesn't trigger us
* to convert from the charset of the source code to ASCII. The
* general solution seems to be to allow a generator to set an
* indicator in the r specifying that the body is coded in the
* implementation character set (i.e., the charset of the source
* code). This would get several different types of documents
* translated properly: mod_autoindex output, mod_status output,
* mod_info output, hard-coded error documents, etc.
*/
#endif
"mime type is %s; no translation selected",
}
/* We must not bail out here (i.e., the MIME test must be in the filter
* itself, not in the fixup), because only then is the final MIME type known.
* Examples for late changes to the MIME type include CGI handling (MIME
* type is set in the Content-Type header produced by the CGI script), or
* PHP (until PHP runs, the MIME type is set to application/x-httpd-php)
*/
}
"charset_source: %s charset_default: %s",
}
/* Get storage for the request data and the output filter context.
* We rarely need the input filter context, so allocate that separately.
*/
/* the size below covers one charset_req_t plus one charset_filter_ctx_t */
sizeof(charset_req_t) +
sizeof(charset_filter_ctx_t));
/* We must not open the xlation table here yet, because the final MIME
* type is not known until we are actually called in the output filter.
* With POST or PUT request, the case is different, because their MIME
* type is set in the request headers, and their data are prerequisites
* for actually calling, e.g., the CGI handler later on.
*/
/* only PUT and POST are handled by this switch */
switch (r->method_number) {
case M_PUT:
case M_POST:
/* Set up input translation. Note: A request body can be included
* with the OPTIONS method, but for now we don't set up translation
* of it.
*/
r->connection->bucket_alloc);
if (rv != APR_SUCCESS) {
"can't open translation %s->%s",
/* failing to open the input translation fails the request */
return HTTP_INTERNAL_SERVER_ERROR;
}
}
}
return DECLINED;
}
struct ap_filter_t *filter_list)
{
while (filter) {
return 1;
}
}
return 0;
}
{
}
{
}
/* xlate_insert_filter() is a filter hook which decides whether or not
* to insert a translation filter for the current request.
*
* r: the request being processed; r->connection is passed along when a
*    filter is added
*
* Nothing is returned.  When a filter is not added implicitly, the reason is
* reported as either a missing (output/input) configuration or the filter
* having already been added by another module.
*/
static void xlate_insert_filter(request_rec *r)
{
/* Hey... don't be so quick to use reqinfo->dc here; reqinfo may be NULL */
if (reqinfo) {
r->connection);
}
/* the logged reason depends on whether an output context exists */
"xlate output filter not added implicitly because %s",
!reqinfo->output_ctx ?
"no output configuration available" :
"another module added the filter");
}
r->connection);
}
/* same reporting as above, for the input direction */
"xlate input filter not added implicitly because %s",
"no input configuration available" :
"another module added the filter");
}
}
}
/* stuff that sucks that I know of:
*
* bucket handling:
* why create an eos bucket when we see it come down the stream? just send the one
* passed as input... news flash: this will be fixed when xlate_out_filter() starts
* using the more generic xlate_brigade()
*
* translation mechanics:
* we don't handle characters that straddle more than two buckets; an error
* will be generated
*/
/* send_downstream() is passed the translated data; it puts it in a single-
* bucket brigade and passes the brigade to the next filter
*/
{
request_rec *r = f->r;
conn_rec *c = r->connection;
apr_bucket *b;
if (rv != APR_SUCCESS) {
}
return rv;
}
{
request_rec *r = f->r;
conn_rec *c = r->connection;
apr_bucket *b;
b = apr_bucket_eos_create(c->bucket_alloc);
if (rv != APR_SUCCESS) {
}
return rv;
}
const char *partial,
{
rv = APR_SUCCESS;
}
else {
rv = APR_INCOMPLETE;
* buckets
*/
}
return rv;
}
/* input buffer: */
const char **cur_str,
/* output buffer: */
char **out_str,
{
/* Keep adding bytes from the input string to the saved string until we
* 1) finish the input char
* 2) get an error
* or 3) run out of bytes to add
*/
do {
++*cur_str;
--*cur_len;
*out_str,
out_len);
if (rv == APR_SUCCESS) {
}
else {
* straddling more than two buckets
*/
}
return rv;
}
{
const char *msg;
char msgbuf[100];
int cur;
case EES_LIMIT:
rv = 0;
msg = "xlate filter - a built-in restriction was encountered";
break;
case EES_BAD_INPUT:
rv = 0;
msg = "xlate filter - an input character was invalid";
break;
case EES_BUCKET_READ:
rv = 0;
msg = "xlate filter - bucket read routine failed";
break;
case EES_INCOMPLETE_CHAR:
rv = 0;
cur = 0;
++cur;
}
break;
case EES_DOWNSTREAM:
msg = "xlate filter - an error occurred in a lower filter";
break;
default:
msg = "xlate filter - returning error";
}
"%s", msg);
}
/* chk_filter_chain() is called once per filter instance; it tries to
* determine if the current filter instance should be disabled because
* its translation is incompatible with the translation of an existing
* instance of the translate filter
*
* f: the filter instance being checked; f->ctx is compared with the context
*    of a translation filter found while walking the chain, and f->r is used
*    for logging
*
* Example bad scenario:
*
* configured filter chain for the request:
* INCLUDES XLATEOUT(8859-1->UTF-16)
* configured filter chain for the subrequest:
* XLATEOUT(8859-1->UTF-16)
*
* When the subrequest is processed, the filter chain will be
* XLATEOUT(8859-1->UTF-16) XLATEOUT(8859-1->UTF-16)
* This makes no sense, so the instance of XLATEOUT added for the
* subrequest will be noop-ed.
*
* Example good scenario:
*
* configured filter chain for the request:
* INCLUDES XLATEOUT(8859-1->UTF-16)
* configured filter chain for the subrequest:
* XLATEOUT(IBM-1047->8859-1)
*
* When the subrequest is processed, the filter chain will be
* XLATEOUT(IBM-1047->8859-1) XLATEOUT(8859-1->UTF-16)
* This makes sense, so the instance of XLATEOUT added for the
* subrequest will be left alone and it will translate from
* IBM-1047->8859-1.
*/
static void chk_filter_chain(ap_filter_t *f)
{
return;
}
/* walk the filter chain; see if it makes sense for our filter to
* do any translation
*/
while (curf) {
if (!last_xlate_ctx) {
}
else {
/* incompatible translation
* if our filter instance is incompatible with an instance
* already in place, noop our instance
* Notes:
* . We are only willing to noop our own instance.
* . It is possible to noop another instance which has not
* yet run, but this is not currently implemented.
* Hopefully it will not be needed.
* . It is not possible to noop an instance which has
* already run.
*/
/* the context in question is our own, so it may be disabled */
if (last_xlate_ctx == f->ctx) {
/* the "disabling" message is only produced at this debug level */
if (debug >= DBGLVL_PMC) {
0, f->r,
"%s %s - disabling "
"translation %s%s%s; existing "
"translation %s%s%s",
}
}
else {
/* not our own instance: report that it can't be disabled */
0, f->r,
"chk_filter_chain() - can't disable "
"translation %s%s%s; existing "
"translation %s%s%s",
}
break;
}
}
}
}
}
/* xlate_brigade() is used to filter request and response bodies
*
* we'll stop when one of the following occurs:
* . we run out of buckets
* . we run out of space in the output buffer
* . we hit an error
*
* inputs:
* bb: brigade to process
* buffer: storage to hold the translated characters
* buffer_size: size of buffer
* (and a few more uninteresting parms)
*
* outputs:
* return value: APR_SUCCESS or some error code
* bb: we've removed any buckets representing the
* translated characters; the eos bucket, if
* present, will be left in the brigade
* buffer: filled in with translated characters
* buffer_size: updated with the bytes remaining
* hit_eos: did we hit an EOS bucket?
*/
char *buffer,
int *hit_eos)
{
const char *bucket;
*hit_eos = 0;
bucket_avail = 0;
while (1) {
if (!bucket_avail) { /* no bytes left to process in the current bucket... */
if (consumed_bucket) {
}
b = APR_BRIGADE_FIRST(bb);
if (b == APR_BRIGADE_SENTINEL(bb) ||
APR_BUCKET_IS_EOS(b)) {
break;
}
if (rv != APR_SUCCESS) {
break;
}
consumed_bucket = b; /* for axing when we're done reading it */
}
if (bucket_avail) {
/* We've got data, so translate it. */
/* Rats... we need to finish a partial character from the previous
* bucket.
*
* Strangely, finish_partial_char() increments the input buffer
* pointer but does not increment the output buffer pointer.
*/
&bucket, &bucket_avail,
&buffer, buffer_avail);
}
else {
/* We need to save the final byte(s) for next time; we can't
* convert it until we look at the next bucket.
*/
bucket_avail = 0;
}
}
if (rv != APR_SUCCESS) {
/* bad input byte or partial char too big to store */
break;
}
if (*buffer_avail < XLATE_MIN_BUFF_LEFT) {
/* if any data remains in the current bucket, split there */
if (bucket_avail) {
}
break;
}
}
}
if (!APR_BRIGADE_EMPTY(bb)) {
b = APR_BRIGADE_FIRST(bb);
if (APR_BUCKET_IS_EOS(b)) {
/* Leave the eos bucket in the brigade for reporting to
* subsequent filters.
*/
*hit_eos = 1;
/* Oops... we have a partial char from the previous bucket
* that won't be completed because there's no more data.
*/
rv = APR_INCOMPLETE;
}
}
}
return rv;
}
/* xlate_out_filter() handles (almost) arbitrary conversions from one charset
* to another...
* translation is determined in the fixup hook (find_code_page), which is
* where the filter's context data is set up... the context data gives us
* the translation handle
*/
{
const char *cur_str;
char tmp[OUTPUT_XLATE_BUF_SIZE];
int done;
if (!ctx) {
/* this is SetOutputFilter path; grab the preallocated context,
* if any; note that if we decided not to do anything in an earlier
* handler, we won't even have a reqinfo
*/
if (reqinfo) {
* in the filter chain; we can't have two
* instances using the same context
*/
}
if (!ctx) { /* no idea how to translate; don't do anything */
}
}
/* Opening the output translation (this used to be done in the fixup hook,
* but that was too early: a subsequent type modification, e.g., by a
* CGI script, would go unnoticed. Now we do it in the filter itself.)
*/
{
/* XXX When we handle translation of the request body, watch out here as
* 1.3 allowed additional mime types: multipart and
* application/x-www-form-urlencoded
*/
/* On an EBCDIC machine, be willing to translate mod_autoindex-
* generated output. Otherwise, it doesn't look too cool.
*
* XXX This isn't a perfect fix because this doesn't trigger us
* to convert from the charset of the source code to ASCII. The
* general solution seems to be to allow a generator to set an
* indicator in the r specifying that the body is coded in the
* implementation character set (i.e., the charset of the source
* code). This would get several different types of documents
* translated properly: mod_autoindex output, mod_status output,
* mod_info output, hard-coded error documents, etc.
*/
#endif
if (rv != APR_SUCCESS) {
"can't open translation %s->%s",
}
else {
}
}
}
else {
"mime type is %s; no translation selected",
}
}
"xlate_out_filter() - "
"charset_source: %s charset_default: %s",
}
chk_filter_chain(f);
/* We're not converting between two single-byte charsets, so unset
* Content-Length since it is unlikely to remain the same.
*/
}
}
}
done = 0;
cur_len = 0;
space_avail = sizeof(tmp);
while (!done) {
if (!cur_len) { /* no bytes left to process in the current bucket... */
if (consumed_bucket) {
}
done = 1;
break;
}
if (APR_BUCKET_IS_EOS(dptr)) {
done = 1;
* eos down; when we minimize our bb construction
* we'll fix this crap */
/* Oops... we have a partial char from the previous bucket
* that won't be completed because there's no more data.
*/
rv = APR_INCOMPLETE;
}
break;
}
if (rv != APR_SUCCESS) {
done = 1;
break;
}
* next bucket */
}
/* Try to fill up our tmp buffer with translated data. */
if (cur_len) { /* maybe we just hit the end of a pipe (len = 0) ? */
/* Rats... we need to finish a partial character from the previous
* bucket.
*/
char *tmp_tmp;
&tmp_tmp, &space_avail);
}
else {
/* Update input ptr and len after consuming some bytes */
/* We need to save the final byte(s) for next time; we can't
* convert it until we look at the next bucket.
*/
cur_len = 0;
}
}
}
if (rv != APR_SUCCESS) {
/* bad input byte or partial char too big to store */
done = 1;
}
if (space_avail < XLATE_MIN_BUFF_LEFT) {
/* It is time to flush, as there is not enough space left in the
* current output buffer to bother with converting more data.
*/
if (rv != APR_SUCCESS) {
done = 1;
}
/* tmp is now empty */
space_avail = sizeof(tmp);
}
}
if (rv == APR_SUCCESS) {
}
}
if (rv == APR_SUCCESS) {
if (cur_len == -1) {
}
}
else {
log_xlate_error(f, rv);
}
return rv;
}
{
int hit_eos;
if (!ctx) {
/* this is SetInputFilter path; grab the preallocated context,
* if any; note that if we decided not to do anything in an earlier
* handler, we won't even have a reqinfo
*/
if (reqinfo) {
* in the filter chain; we can't have two
* instances using the same context
*/
}
if (!ctx) { /* no idea how to translate; don't do anything */
}
}
"xlate_in_filter() - "
"charset_source: %s charset_default: %s",
}
chk_filter_chain(f);
/* We're not converting between two single-byte charsets, so note
* that some handlers can't deal with it.
* It doesn't help to unset Content-Length in the input header
* table since in all likelihood the handler has already seen it.
*/
"Request body length may change, breaking some requests");
}
}
}
}
readbytes)) != APR_SUCCESS) {
return rv;
}
}
else {
}
if (rv == APR_SUCCESS) {
if (!hit_eos) {
/* move anything leftover into our context for next time;
* we don't currently "set aside" since the data came from
* down below, but I suspect that for long-term we need to
* do that
*/
}
apr_bucket *e;
/* make sure we insert at the head, because there may be
* an eos bucket already there, and the eos bucket should
* come after the data
*/
}
else {
/* XXX need to get some more data... what if the last brigade
* we got had only the first byte of a multibyte char? we need
* to grab more data from the network instead of returning an
* empty brigade
*/
}
}
else {
log_xlate_error(f, rv);
}
return rv;
}
/* Configuration directive table for this module.
*
* CharsetSourceEnc and CharsetDefault are declared with AP_INIT_TAKE1;
* CharsetOptions is declared with AP_INIT_ITERATE and accepts the option
* flags listed in its help text.  The table ends with a {NULL} entry.
*/
static const command_rec cmds[] =
{
AP_INIT_TAKE1("CharsetSourceEnc",
NULL,
"source (html,cgi,ssi) file charset"),
AP_INIT_TAKE1("CharsetDefault",
NULL,
"name of default charset"),
AP_INIT_ITERATE("CharsetOptions",
NULL,
"valid options: ImplicitAdd, NoImplicitAdd, DebugLevel=n"),
/* end-of-table marker */
{NULL}
};
/* charset_register_hooks() - presumably the routine that registers this
* module's hooks and filters (name-based; TODO confirm).
*
* p: pool supplied by the caller
*
* NOTE(review): no registration calls are visible in the body here; verify
* against the complete file.
*/
static void charset_register_hooks(apr_pool_t *p)
{
}
{
NULL,
NULL,
cmds,
};