f4d3a92b319b23e2b8d67298acc289d52bc1c517niq/* Copyright (c) 2007-11, WebThing Ltd
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * Copyright (c) 2011-, The Apache Software Foundation
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * Licensed to the Apache Software Foundation (ASF) under one or more
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * contributor license agreements. See the NOTICE file distributed with
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * this work for additional information regarding copyright ownership.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * The ASF licenses this file to You under the Apache License, Version 2.0
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * (the "License"); you may not use this file except in compliance with
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * the License. You may obtain a copy of the License at
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * Unless required by applicable law or agreed to in writing, software
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * distributed under the License is distributed on an "AS IS" BASIS,
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * See the License for the specific language governing permissions and
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * limitations under the License.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq#if defined(WIN32)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq/* libxml2 */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq#define APR_BRIGADE_DO(b,bb) for (b = APR_BRIGADE_FIRST(bb); \
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq (((enc)!=XML_CHAR_ENCODING_NONE)&&((enc)!=XML_CHAR_ENCODING_ERROR))
7307da1208ba7db743a5911cfccd2a549f1d3e34sf * XXX: Check all those ap_assert()s and replace those that should not happen
7307da1208ba7db743a5911cfccd2a549f1d3e34sf * XXX: with AP_DEBUG_ASSERT and those that may happen with proper error
7307da1208ba7db743a5911cfccd2a549f1d3e34sf * XXX: handling.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqtypedef struct {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqtypedef struct {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqtypedef struct {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqstatic apr_status_t xml2enc_filter(request_rec* r, const char* enc,
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq unsigned int mode)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* set up a ready-initialised ctx to convert to enc, and insert filter */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq rv = APR_SUCCESS; /* we'll initialise later by sniffing */
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01426)
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, APLOGNO(01427)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq/* This needs to operate only when we're using htmlParser */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq/* Different modules may apply different rules here. Ho, hum. */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq xml2cfg* cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if ((cfg->skipto != NULL) && (ctx->flags | ENC_SKIPTO)) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq while (!found && p && *p) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (!strncasecmp(p+1, starts[i].val, strlen(starts[i].val))) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* found a starting element. Strip all that comes before. */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq while (b = APR_BRIGADE_FIRST(ctx->bbsave), b != bstart) {
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01428)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Skipped to first <%s> element",
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (p == NULL) {
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, APLOGNO(01429)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Failed to find start of recognised HTML!");
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq xml2cfg* cfg = NULL; /* initialise to shut compiler warnings up */
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01430)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* If we've got it in the HTTP headers, there's nothing to do */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (ctype && (p = ap_strcasestr(ctype, "charset=") , p != NULL)) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (ctx->encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ),
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01431)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* to sniff, first we look for BOM */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq ctx->xml2enc = xmlDetectCharEncoding((const xmlChar*)ctx->buf,
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01432)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Got charset from XML rules.") ;
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* If none of the above, look for a META-thingey */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* also we're probably about to invalidate it, so we remove it. */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (ap_regexec(seek_meta_ctype, ctx->buf, 1, match, 0) == 0 ) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* get markers on the start and end of the match */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq rv = apr_brigade_partition(ctx->bbsave, match[0].rm_eo, &cute);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq rv = apr_brigade_partition(ctx->bbsave, match[0].rm_so, &cutb);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* now set length of useful buf for start-of-data hooks */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (ctx->encoding = apr_pstrndup(r->pool, p+match[1].rm_so,
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01433)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* cut out the <meta> we're invalidating */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* and leave a string */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* either it's set to something we found or it's still the default */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* Aaargh! libxml2 has undocumented <META-crap> support. So this fails
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * if metafix is not active. Have to make it conditional.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * No, that means no-metafix breaks things. Deal immediately with
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * this particular instance of metafix.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq ctx->encoding = cfg->default_charset?cfg->default_charset:"ISO-8859-1";
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* Unsupported charset. Can we get (iconv) support through apr_xlate? */
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01434)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Charset %s not supported by libxml2; trying apr_xlate",
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (apr_xlate_open(&ctx->convset, "UTF-8", ctx->encoding, r->pool)
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01435)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Charset %s not supported. Consider aliasing it?",
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* Use configuration default as a last resort */
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, APLOGNO(01436)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "No usable charset information; using configuration default");
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq ctx->xml2enc = (cfg->default_encoding == XML_CHAR_ENCODING_NONE)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq r->content_type = apr_pstrcat(r->pool, ctype, ";charset=utf-8",
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq char* str = apr_palloc(r->pool, strlen(r->content_type) + 13
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq strcpy(str + match[1].rm_so + 5, r->content_type+match[1].rm_eo);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (!f->ctx) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq xml2cfg* cfg = ap_get_module_config(f->r->per_dir_config,
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq f->ctx = ctx = apr_pcalloc(f->r->pool, sizeof(xml2ctx));
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqstatic apr_status_t xml2enc_ffunc(ap_filter_t* f, apr_bucket_brigade* bb)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* log error about configuring this */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq for (p = ctype; *p; ++p)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* only act if starts-with "text/" or contains "xml" */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (strncmp(ctype, "text/", 5) && !strstr(ctype, "xml")) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* append to any data left over from last time */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* some kind of initialisation required */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* Turn all this off when post-processing */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* if we don't have enough data to sniff but more's to come, wait */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* not enough data to sniff. Wait for more */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* flatten it into a NULL-terminated string */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq ctx->buf = apr_palloc(f->r->pool, (apr_size_t)(ctx->bblen+1));
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq rv = apr_brigade_flatten(ctx->bbsave, ctx->buf, &ctx->bytes);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* FIXME: hook here for rewriting start-of-data? */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* nah, we only have one action here - call it inline */
7f40ab64e74b7d1057b5ee6abc349e32e74b1b4cjim /* we might change the Content-Length, so let's force its re-calculation */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* consume the data we just sniffed */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* we need to omit any <meta> we just invalidated */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq ap_set_module_config(f->r->request_config, &xml2enc_module, ctx);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* move the data back to bb */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq while (b = APR_BRIGADE_FIRST(bb), b != APR_BRIGADE_SENTINEL(bb)) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* send remaining data */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq } else if (APR_BUCKET_IS_FLUSH(b)) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq else { /* data bucket */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq if (insz > 0) { /* we have dangling data. Flatten it. */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* this is only what we've already tried to convert.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * The brigade is exhausted.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * Save remaining data for next time round
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01437)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* remove the data we've just read */
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01438)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq bdestroy = b; /* can't destroy until finished with the data */
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01439)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* OK, we've got some input we can use in [buf,bytes] */
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq while (insz > 0) {
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* nothing was converted last time!
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * break out of this loop!
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq b = apr_bucket_transient_create(buf+(bytes - insz), insz,
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01440)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq rv = apr_xlate_conv_buffer(ctx->convset, buf+(bytes - insz),
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, f->r, APLOGNO(01441)
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv2, f->r, APLOGNO(01442)
7307da1208ba7db743a5911cfccd2a549f1d3e34sf "ap_fwrite failed");
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq switch (rv) {
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01443)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "INCOMPLETE");
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq continue; /* If outbuf too small, go round again.
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * If it was inbuf, we'll break out when
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * we test ctx->bytes == ctx->bblen
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01444)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Skipping invalid byte(s) in input stream!");
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq /* Erk! What's this?
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq * Bail out, flush, and hope to eat the buf raw
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01445)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Failed to convert input; trying it raw") ;
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, f->r, APLOGNO(01446)
7307da1208ba7db743a5911cfccd2a549f1d3e34sf "ap_fflush failed");
185aa71728867671e105178b4c66fbc22b65ae26sf ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01447)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "xml2enc: error reading data") ;
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqstatic apr_status_t xml2enc_charset(request_rec* r, xmlCharEncoding* encp,
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq const char** encoding)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq xml2ctx* ctx = ap_get_module_config(r->request_config, &xml2enc_module);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq return HAVE_ENCODING(ctx->xml2enc) ? APR_SUCCESS : APR_EGENERAL;
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq#define PROTO_FLAGS AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq ap_register_output_filter_protocol("xml2enc", xml2enc_ffunc,
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq seek_charset = ap_pregcomp(pool, "charset=([A-Za-z0-9_-]+)",
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq const char* errmsg = ap_check_cmd_context(cmd, GLOBAL_ONLY);
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq return "Error setting charset alias";
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqstatic const char* set_default(cmd_parms* cmd, void* CFG, const char* charset)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq return "Default charset not found";
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq return "Invalid or unsupported default charset";
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqstatic const char* set_skipto(cmd_parms* cmd, void* CFG, const char* arg)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq cfg->skipto = apr_array_make(cmd->pool, 4, sizeof(tattr));
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq AP_INIT_TAKE1("xml2EncDefault", set_default, NULL, OR_ALL,
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Usage: xml2EncDefault charset"),
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq AP_INIT_ITERATE2("xml2EncAlias", set_alias, NULL, RSRC_CONF,
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "EncodingAlias charset alias [more aliases]"),
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq AP_INIT_ITERATE("xml2StartParse", set_skipto, NULL, OR_ALL,
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq "Ignore anything in front of the first of these elements"),
f4d3a92b319b23e2b8d67298acc289d52bc1c517niqstatic void* xml2enc_merge(apr_pool_t* pool, void* BASE, void* ADD)
f4d3a92b319b23e2b8d67298acc289d52bc1c517niq ret->default_encoding = (add->default_encoding == XML_CHAR_ENCODING_NONE)