mod_speling.c revision 43c3e6a4b559b76b750c245ee95e2782c15b4296
/* Copyright 1999-2005 The Apache Software Foundation or its licensors, as
* applicable.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "apr.h"
#include "apr_file_io.h"
#include "apr_strings.h"
#include "apr_lib.h"
#define APR_WANT_STRFUNC
#include "apr_want.h"
#define WANT_BASENAME_MATCH
#include "httpd.h"
#include "http_core.h"
#include "http_config.h"
#include "http_request.h"
#include "http_log.h"
/* mod_speling.c - by Alexei Kosut <akosut@organic.com> June, 1996
*
* This module is transparent, and simple. It attempts to correct
* misspellings of URLs that users might have entered, namely by checking
* capitalizations. If it finds a match, it sends a redirect.
*
* 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De>
* o Upgraded module interface to apache_1.3a2-dev API (more NULL's in
* speling_module).
* o Integrated tcsh's "spelling correction" routine which allows one
* misspelling (character insertion/omission/typo/transposition).
* Rewrote it to ignore case as well. This ought to catch the majority
* of misspelled requests.
* o Commented out the second pass where files' suffixes are stripped.
* Given the better hit rate of the first pass, this rather ugly
* (request index.html, receive index.db ?!?!) solution can be
* omitted.
* o wrote a "kind of" html page for mod_speling
*
* Activate it with "CheckSpelling On"
*/
/*
 * Configuration record used by this module for a server or directory
 * location.
 */
typedef struct spconfig {
    /* On/off flag; presumably driven by the CheckSpelling directive -
     * confirm against the directive handler. */
    int enabled;
} spconfig;
/*
* Create a configuration specific to this module for a server or directory
* location, and fill it with the default settings.
*
* The API says that in the absence of a merge function, the record for the
* closest ancestor is used exclusively. That's what we want, so we don't
* bother to have such a function.
*
* p: the APR pool passed in by the caller.
* Returns the configuration record as an untyped pointer.
*
* NOTE(review): `cfg` is neither declared nor initialised in the body as it
* appears here; the allocation and the default assignment look to be
* missing - confirm against the full file.
*/
static void *mkconfig(apr_pool_t *p)
{
return cfg;
}
/*
* Respond to a callback to create configuration record for a server or
* vhost environment.
*/
{
return mkconfig(p);
}
/*
* Respond to a callback to create a config record for a specific directory.
*/
{
return mkconfig(p);
}
/*
* Handler for the CheckSpelling directive, which is FLAG.
*/
{
return NULL;
}
/*
* Define the directives specific to this module. This structure is referenced
* later by the 'module' structure.
*
* The table ends with a { NULL } sentinel entry.
*
* NOTE(review): only the help text and closing parenthesis of the first
* entry are visible here; the start of that entry (presumably the
* CheckSpelling flag directive) appears to be missing - confirm against the
* full file.
*/
static const command_rec speling_cmds[] =
{
"whether or not to fix miscapitalized/misspelled requests"),
{ NULL }
};
/*
 * Classification of how a candidate name differs from the requested one.
 * The enumerators are numbered consecutively from zero, in the same order
 * as the strings in sp_reason_str[].
 */
typedef enum {
    SP_IDENTICAL = 0,       /* 0: identical */
    SP_MISCAPITALIZED,      /* 1: exact match apart from letter case */
    SP_TRANSPOSITION,       /* 2: transposed characters */
    SP_MISSINGCHAR,         /* 3: character missing */
    SP_EXTRACHAR,           /* 4: extra character */
    SP_SIMPLETYPO,          /* 5: one mistyped character */
    SP_VERYDIFFERENT        /* 6: distance too large to fix */
} sp_reason;
/*
 * Human-readable text for each sp_reason value, listed in enumerator order
 * (one string per enumerator, SP_IDENTICAL first).
 *
 * Both the strings and the pointer slots are const: this is a fixed lookup
 * table and no entry may be reassigned at run time.
 */
static const char *const sp_reason_str[] =
{
    "identical",                /* SP_IDENTICAL */
    "miscapitalized",           /* SP_MISCAPITALIZED */
    "transposed characters",    /* SP_TRANSPOSITION */
    "character missing",        /* SP_MISSINGCHAR */
    "extra character",          /* SP_EXTRACHAR */
    "mistyped character",       /* SP_SIMPLETYPO */
    "common basename"           /* SP_VERYDIFFERENT */
};
typedef struct {
const char *name;
/*
* spdist() is taken from Kernighan & Pike,
* _The_UNIX_Programming_Environment_
* and adapted somewhat to correspond better to psychological reality.
* (Note the changes to the return values)
*
* According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4),
* page 363, the correct order for this is:
* OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION
* thus, it was exactly backwards in the old version. -- PWP
*
* This routine was taken out of tcsh's spelling correction code
* (tcsh-6.07.04) and re-converted to apache data types ("char" type
* instead of tcsh's NLS'ed "Char"). Plus it now ignores the case
* during comparisons, so it is an "approximate strcasecmp()".
* NOTE that it still allows only _one_ real "typo";
* it does NOT try to correct multiple errors.
*/
{
for (; apr_tolower(*s) == apr_tolower(*t); t++, s++) {
if (*t == '\0') {
return SP_MISCAPITALIZED; /* exact match (sans case) */
}
}
if (*s) {
if (*t) {
return SP_TRANSPOSITION; /* transposition */
}
return SP_SIMPLETYPO; /* 1 char mismatch */
}
}
if (strcasecmp(s + 1, t) == 0) {
return SP_EXTRACHAR; /* extra character */
}
}
if (*t && strcasecmp(s, t + 1) == 0) {
return SP_MISSINGCHAR; /* missing character */
}
return SP_VERYDIFFERENT; /* distance too large to fix. */
}
{
}
/*
* check_speling - look for a correctly spelled alternative to the
* requested name and answer with a redirect or a list of choices.
*
* r: the request being processed.
*
* Return values that appear in this function:
*   DECLINED               - the request is left alone (for example it is
*                            not a GET, it is a sub request, or its path
*                            info is rejected)
*   HTTP_MOVED_PERMANENTLY - the conditions for immediate redirection hold
*   HTTP_MULTIPLE_CHOICES  - a list of candidate documents is returned
*   OK                     - the "fixed" URL would be identical to the
*                            requested one, and at the end of the function
*
* NOTE(review): many statements of this body (conditions, declarations and
* assignments) are not visible in this copy of the file, so the control
* flow below cannot be read straight through - confirm against the full
* source before relying on this description.
*/
static int check_speling(request_rec *r)
{
return DECLINED;
}
/* We only want to worry about GETs */
if (r->method_number != M_GET) {
return DECLINED;
}
/* We've already got a file of some kind or another */
return DECLINED;
}
/* This is a sub request - don't mess with it */
if (r->main) {
return DECLINED;
}
/* we default to reject path info (same as core handler) */
if ((r->used_path_info != AP_REQ_ACCEPT_PATH_INFO) &&
return DECLINED;
}
/*
* The request should end up looking like this:
* r->uri: /correct-url/misspelling/more
* r->filename: /correct-file/misspelling r->path_info: /more
*
* So we do this in steps. First break r->filename into two pieces
*/
/*
* Don't do anything if the request doesn't contain a slash, or
* requests "/"
*/
return DECLINED;
}
/* good = /correct-file */
/* bad = misspelling */
/* postgood = misspelling/more */
/* Check to see if the URL pieces add up */
return DECLINED;
}
/* url = /correct-url */
/* Now open the directory and do ourselves a check... */
/* Oops, not a directory... */
return DECLINED;
}
if (dotloc == -1) {
}
sp_reason q;
/*
* If we end up with a "fixed" URL which is identical to the
* requested one, we must have found a broken symlink or some such.
* Do _not_ try to redirect this, it causes a loop!
*/
return OK;
}
/*
* miscapitalization errors are checked first (like, e.g., lower case
* file, upper case request)
*/
}
/*
* simple typing errors are checked next (like, e.g.,
* missing/extra/transposed char)
*/
}
/*
* The spdist() should have found the majority of the misspelled
* requests. It is of questionable use to continue looking for
* files with the same base name, but potentially of totally wrong
* type (index.html <-> index.db).
* I would propose to not set the WANT_BASENAME_MATCH define.
* 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De>
*
* However, Alexei replied giving some reasons to add it anyway:
* > Oh, by the way, I remembered why having the
* > extension-stripping-and-matching stuff is a good idea:
* >
* > If you're using MultiViews, and have a file named foobar.html,
* > which you refer to as "foobar", and someone tried to access
* > "Foobar", mod_speling won't find it, because it won't find
* > anything matching that spelling. With the extension-munging,
* > it would locate "foobar.html". Not perfect, but I ran into
* > that problem when I first wrote the module.
*/
else {
#ifdef WANT_BASENAME_MATCH
/*
* Okay... we didn't find anything. Now we take out the hard-core
* power tools. There are several cases here. Someone might have
* entered a wrong extension (.htm instead of .html or vice
* versa) or the document could be negotiated. At any rate, now
* we just compare stuff before the first dot. If it matches, we
* figure we got us a match. This can result in wrong things if
* there are files of different content types but the same prefix
* (e.g. foo.gif and foo.html) This code will pick the first one
* it finds. Better than a Not Found, though.
*/
if (entloc == -1) {
}
}
#endif
}
}
if (candidates->nelts != 0) {
/* Wow... we found us a misspelling. Construct a fixed url */
char *nuri;
const char *ref;
int i;
sizeof(misspelled_file), sort_by_quality);
/*
* Conditions for immediate redirection:
* a) the first candidate was not found by stripping the suffix
* AND b) there exists only one candidate OR the best match is not
* ambiguous
* then return a redirection right away.
*/
if (r->parsed_uri.query)
r,
ref ? "Fixed spelling: %s to %s from %s"
: "Fixed spelling: %s to %s",
return HTTP_MOVED_PERMANENTLY;
}
/*
* Otherwise, a "[300] Multiple Choices" list with the variants is
* returned.
*/
else {
apr_pool_t *p;
p = r->pool;
}
else {
}
return DECLINED;
sizeof(char *));
sizeof(char *));
/* Generate the response text. */
*(const char **)apr_array_push(t) =
"The document name you requested (<code>";
*(const char **)apr_array_push(t) =
"</code>) could not be found on this server.\n"
"However, we found documents with names similar "
"to the one you requested.<p>"
"Available documents:\n<ul>\n";
for (i = 0; i < candidates->nelts; ++i) {
char *vuri;
const char *reason;
/* The format isn't very neat... */
NULL);
*(const char **)apr_array_push(v) = "\"";
*(const char **)apr_array_push(v) = "\";\"";
*(const char **)apr_array_push(v) = reason;
*(const char **)apr_array_push(v) = "\"";
*(const char **)apr_array_push(t) = "<li><a href=\"";
*(const char **)apr_array_push(t) = "\">";
*(const char **)apr_array_push(t) = "</a> (";
*(const char **)apr_array_push(t) = reason;
*(const char **)apr_array_push(t) = ")\n";
/*
* when we have printed the "close matches" and there are
* more "distant matches" (matched by stripping the suffix),
* then we insert an additional separator text to suggest
* that the user LOOK CLOSELY whether these are really the
* files she wanted.
*/
*(const char **)apr_array_push(t) =
"</ul>\nFurthermore, the following related "
"documents were found:\n<ul>\n";
}
}
*(const char **)apr_array_push(t) = "</ul>\n";
/* If we know there was a referring page, add a note: */
*(const char **)apr_array_push(t) =
"Please consider informing the owner of the "
"<a href=\"";
*(const char **)apr_array_push(t) = "\">referring page</a> "
"about the broken link.\n";
}
/* Pass our apr_table_t to http_protocol.c (see mod_negotiation): */
apr_array_pstrcat(p, v, ','));
ref ? "Spelling fix: %s: %d candidates from %s"
: "Spelling fix: %s: %d candidates",
return HTTP_MULTIPLE_CHOICES;
}
}
return OK;
}
/*
* Hook-registration callback; it is listed in the "register hooks" slot of
* the module structure.
*
* p: APR pool passed in by the caller (unused in the body shown).
*
* NOTE(review): the body is empty as it appears here, so no hook is
* registered; check_speling is presumably meant to be installed from this
* function - confirm against the full file.
*/
static void register_hooks(apr_pool_t *p)
{
}
{
create_mconfig_for_directory, /* create per-dir config */
NULL, /* merge per-dir config */
create_mconfig_for_server, /* server config */
NULL, /* merge server config */
speling_cmds, /* command apr_table_t */
register_hooks /* register hooks */
};