mod_xml2enc.c revision e6a70e704cf27a66177a1e8979ae05bc65385a0e
/* Copyright (c) 2007-11, WebThing Ltd
* Copyright (c) 2011-, The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if defined(WIN32)
#define XML2ENC_DECLARE_EXPORT
#endif
#include <ctype.h>
/* libxml2 */
#include <libxml/encoding.h>
#include "http_protocol.h"
#include "http_config.h"
#include "http_log.h"
#include "apr_strings.h"
#include "apr_xlate.h"
#include "apr_optional.h"
#include "mod_xml2enc.h"
/* Size constants for this module.
 * NOTE(review): presumably byte counts for the conversion/sniffing buffers;
 * their uses are not visible here - confirm.
 */
#define BUFLEN 8192
#define BUF_MIN 4096
b != APR_BRIGADE_SENTINEL(bb); \
b = APR_BUCKET_NEXT(b))
/* Flag bits that are OR'ed into / tested against a `flags` word in this file. */
/* ENC_INITIALISED: set when a ready-initialised context has been set up. */
#define ENC_INITIALISED 0x100
/* ENC_SEEN_EOS: NOTE(review): presumably records that an EOS bucket was seen - confirm. */
#define ENC_SEEN_EOS 0x200
/* ENC_SKIPTO: local alias for ENCIO_SKIPTO, which is defined outside this file's visible part. */
#define ENC_SKIPTO ENCIO_SKIPTO
#define HAVE_ENCODING(enc) \
/*
* XXX: Check all those ap_assert()s and replace those that should not happen
* XXX: with AP_DEBUG_ASSERT and those that may happen with proper error
* XXX: handling.
*/
/*
 * xml2ctx: working state record.
 * NOTE(review): presumably the filter context handed as `ctx` to
 * sniff_encoding() and fix_skipto() - confirm against the callers.
 */
typedef struct {
    char* buf;              /* character buffer */
    unsigned int flags;     /* bit set; see the ENC_* defines */
    const char* encoding;   /* NOTE(review): presumably the charset name - confirm */
} xml2ctx;
/*
 * xml2cfg: configuration record.
 * NOTE(review): presumably the module's per-directory/server config - confirm.
 */
typedef struct {
    const char* default_charset;   /* NOTE(review): presumably the fallback
                                    * charset used when nothing better is
                                    * found - confirm */
} xml2cfg;
/*
 * tattr: wrapper around a single read-only string value.
 * Its users are not visible in this part of the file.
 */
typedef struct {
    const char* val;   /* the wrapped string */
} tattr;
/*
 * File-scope regex handles; both start out as null pointers.
 * NOTE(review): presumably compiled once at start-up (this file contains a
 * <meta ... http-equiv ... content-type ...> pattern) - confirm where they
 * are assigned.
 */
static ap_regex_t* seek_meta_ctype = NULL;
static ap_regex_t* seek_charset = NULL;
unsigned int mode)
{
/* set up a ready-initialised ctx to convert to enc, and insert filter */
flags |= ENC_INITIALISED;
}
flags |= ENC_INITIALISED;
}
}
else {
rv = APR_EGENERAL;
"xml2enc: bad mode %x", mode);
}
if (rv == APR_SUCCESS) {
if (flags & ENC_INITIALISED) {
}
}
else {
"xml2enc: Charset %s not supported.", enc) ;
}
return rv;
}
/* This needs to operate only when we're using htmlParser */
/* Different modules may apply different rules here. Ho, hum. */
{
int found = 0;
while (!found && p && *p) {
int i;
/* found a starting element. Strip all that comes before. */
apr_bucket* b;
&bstart);
}
found = 1;
"Skipped to first <%s> element",
break;
}
}
}
if (p == NULL) {
"Failed to find start of recognised HTML!");
}
}
}
{
char* p ;
apr_bucket* b;
const char* ctype = r->content_type;
if (ctype) {
"Content-Type is %s", ctype) ;
/* If we've got it in the HTTP headers, there's nothing to do */
p += 8 ;
}
}
}
/* to sniff, first we look for BOM */
"Got charset from XML rules.") ;
}
}
/* If none of the above, look for a META-thingey */
/* also we're probably about to invalidate it, so we remove it. */
/* get markers on the start and end of the match */
/* now set length of useful buf for start-of-data hooks */
}
}
}
/* cut out the <meta> we're invalidating */
b = APR_BUCKET_NEXT(cutb);
cutb = b;
}
/* and leave a string */
}
/* either it's set to something we found or it's still the default */
/* Aaargh! libxml2 has undocumented <META-crap> support. So this fails
* if metafix is not active. Have to make it conditional.
*
* No, that means no-metafix breaks things. Deal immediately with
* this particular instance of metafix.
*/
}
/* Unsupported charset. Can we get (iconv) support through apr_xlate? */
"Charset %s not supported by libxml2; trying apr_xlate",
== APR_SUCCESS) {
} else {
"Charset %s not supported. Consider aliasing it?",
}
}
/* Use configuration default as a last resort */
"No usable charset information; using configuration default");
}
NULL);
} else {
r->content_type = str;
}
}
}
{
if (!f->ctx) {
}
}
return APR_SUCCESS;
}
{
apr_bucket* b;
apr_size_t insz = 0;
char *ctype;
char *p;
if (!ctx || !f->r->content_type) {
/* log error about configuring this */
}
for (p = ctype; *p; ++p)
if (isupper(*p))
*p = tolower(*p);
/* only act if starts-with "text/" or contains "xml" */
}
f->r->connection->bucket_alloc);
}
/* append to any data left over from last time */
/* some kind of initialisation required */
/* Turn all this off when post-processing */
/* if we don't have enough data to sniff but more's to come, wait */
if (APR_BUCKET_IS_EOS(b)) {
break;
}
}
/* not enough data to sniff. Wait for more */
}
return APR_SUCCESS;
}
}
}
/* flatten it into a NUL-terminated string */
sniff_encoding(f->r, ctx);
/* FIXME: hook here for rewriting start-of-data? */
/* nah, we only have one action here - call it inline */
fix_skipto(f->r, ctx);
/* consume the data we just sniffed */
/* we need to omit any <meta> we just invalidated */
}
f->r->connection->bucket_alloc);
}
return rv;
}
/* move the data back to bb */
if (APR_BUCKET_IS_METADATA(b)) {
if (APR_BUCKET_IS_EOS(b)) {
/* send remaining data */
} else if (APR_BUCKET_IS_FLUSH(b)) {
}
}
else { /* data bucket */
char* buf;
apr_size_t bytes = 0;
if (insz > 0) { /* we have dangling data. Flatten it. */
/* this is only what we've already tried to convert.
* The brigade is exhausted.
* Save remaining data for next time round
*/
"xml2enc: Setting aside %" APR_SIZE_T_FMT
" unconverted bytes", bytes);
== APR_SUCCESS);
}
return rv;
}
/* remove the data we've just read */
}
"xml2enc: consuming %" APR_SIZE_T_FMT
" bytes flattened", bytes);
}
else {
bdestroy = b; /* can't destroy until finished with the data */
"xml2enc: consuming %" APR_SIZE_T_FMT
" bytes from bucket", bytes);
}
/* OK, we've got some input we can use in [buf,bytes] */
if (rv == APR_SUCCESS) {
while (insz > 0) {
/* nothing was converted last time!
* break out of this loop!
*/
bb->bucket_alloc);
"xml2enc: reinserting %" APR_SIZE_T_FMT
" unconsumed bytes from bucket", insz);
break;
}
"xml2enc: converted %" APR_SIZE_T_FMT
if (rv2 != APR_SUCCESS) {
"ap_fwrite failed");
return rv2;
}
switch (rv) {
case APR_SUCCESS:
continue;
case APR_EINCOMPLETE:
"INCOMPLETE");
continue; /* If outbuf too small, go round again.
* If it was inbuf, we'll break out when
* we test ctx->bytes == ctx->bblen
*/
case APR_EINVAL: /* try skipping one bad byte */
"Skipping invalid byte(s) in input stream!");
--insz;
continue;
default:
/* Erk! What's this?
* Bail out, flush, and hope to eat the buf raw
*/
"Failed to convert input; trying it raw") ;
if (rv != APR_SUCCESS)
"ap_fflush failed");
else
}
}
} else {
"xml2enc: error reading data") ;
}
if (bdestroy)
if (rv != APR_SUCCESS)
return rv;
}
}
return APR_SUCCESS;
}
const char** encoding)
{
return APR_EAGAIN;
}
}
{
"(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
}
{
return errmsg ;
return NULL;
else
return "Error setting charset alias";
}
{
switch(cfg->default_encoding) {
case XML_CHAR_ENCODING_NONE:
return "Default charset not found";
case XML_CHAR_ENCODING_ERROR:
return "Invalid or unsupported default charset";
default:
return NULL;
}
}
{
return NULL;
}
/* Configuration directive table for this module.
 * Each entry carries a usage/help string; the list ends with a { NULL }
 * sentinel entry.
 */
static const command_rec xml2enc_cmds[] = {
"Usage: xml2EncDefault charset"),
"EncodingAlias charset alias [more aliases]"),
"Ignore anything in front of the first of these elements"),
{ NULL }
};
{
return ret;
}
{
return ret;
}
/* Module record for xml2enc, declared through AP_DECLARE_MODULE.
 * The two slots visible here are left NULL.
 */
AP_DECLARE_MODULE(xml2enc) = {
NULL,
NULL,
};