mod_mime_magic.c revision 9ede6357edc9aff1fb2f7edebefab473673298aa
0N/A/* ==================================================================== 2204N/A * Copyright (c) 1995-1999 The Apache Group. All rights reserved. 0N/A * Redistribution and use in source and binary forms, with or without 0N/A * modification, are permitted provided that the following conditions 0N/A * 1. Redistributions of source code must retain the above copyright 0N/A * notice, this list of conditions and the following disclaimer. 0N/A * 2. Redistributions in binary form must reproduce the above copyright 0N/A * notice, this list of conditions and the following disclaimer in 0N/A * the documentation and/or other materials provided with the 0N/A * 3. All advertising materials mentioning features or use of this 0N/A * software must display the following acknowledgment: 0N/A * "This product includes software developed by the Apache Group 1472N/A * 4. The names "Apache Server" and "Apache Group" must not be used to 0N/A * endorse or promote products derived from this software without 0N/A * prior written permission. For written permission, please contact 0N/A * apache@apache.org. 1879N/A * 5. Products derived from this software may not be called "Apache" 1879N/A * nor may "Apache" appear in their names without prior written 1879N/A * permission of the Apache Group. 1879N/A * 6. Redistributions of any form whatsoever must retain the following 1879N/A * "This product includes software developed by the Apache Group 1879N/A * THIS SOFTWARE IS PROVIDED BY THE APACHE GROUP ``AS IS'' AND ANY 1879N/A * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1879N/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 1879N/A * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE APACHE GROUP OR 1879N/A * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 1879N/A * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 1879N/A * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 1879N/A * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 1879N/A * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 2073N/A * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 2073N/A * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 2073N/A * OF THE POSSIBILITY OF SUCH DAMAGE. 2073N/A * ==================================================================== 2073N/A * This software consists of voluntary contributions made by many 1879N/A * individuals on behalf of the Apache Group and was originally based 1879N/A * on public domain software written at the National Center for 1879N/A * Supercomputing Applications, University of Illinois, Urbana-Champaign. 1879N/A * For more information on the Apache Group and the Apache HTTP server 1879N/A * mod_mime_magic: MIME type lookup via file magic numbers 605N/A * Copyright (c) 1996-1997 Cisco Systems, Inc. 0N/A * This software was submitted by Cisco Systems to the Apache Group in July 0N/A * 1997. Future revisions and derivatives of this source code must 0N/A * acknowledge Cisco Systems as the original contributor of this module. 0N/A * All other licensing and usage conditions are those of the Apache Group. 0N/A * Some of this code is derived from the free version of the file command 0N/A * originally posted to comp.sources.unix. Copyright info for that program 0N/A * is included below as required. 0N/A * --------------------------------------------------------------------------- 0N/A * - Copyright (c) Ian F. Darwin, 1987. Written by Ian F. Darwin. 0N/A * This software is not subject to any license of the American Telephone and 0N/A * Telegraph Company or of the Regents of the University of California. 
0N/A * Permission is granted to anyone to use this software for any purpose on any 0N/A * computer system, and to alter it and redistribute it freely, subject to 0N/A * the following restrictions: 0N/A * 1. The author is not responsible for the consequences of use of this 0N/A * software, no matter how awful, even if they arise from flaws in it. 0N/A * 2. The origin of this software must not be misrepresented, either by 0N/A * explicit claim or by omission. Since few users ever read sources, credits 0N/A * must appear in the documentation. 0N/A * 3. Altered versions must be plainly marked as such, and must not be 0N/A * misrepresented as being the original software. Since few users ever read 0N/A * sources, credits must appear in the documentation. 0N/A * 4. This notice may not be removed or altered. 0N/A * ------------------------------------------------------------------------- 0N/A * For compliance with Mr Darwin's terms: this has been very significantly 0N/A * modified from the free "file" command. 0N/A * - all-in-one file for compilation convenience when moving from one 0N/A * version of Apache to the next. 0N/A * - Memory allocation is done through the Apache API's ap_context_t structure. 2204N/A * - All functions have had necessary Apache API request or server 0N/A * structures passed to them where necessary to call other Apache API 0N/A * routines. (i.e. usually for logging, files, or memory allocation in 0N/A * itself or a called function.) 0N/A * - struct magic has been converted from an array to a single-ended linked 0N/A * list because it only grows one record at a time, it's only accessed 0N/A * sequentially, and the Apache API has no equivalent of realloc(). 0N/A * - Functions have been changed to get their parameters from the server 0N/A * configuration instead of globals. (It should be reentrant now but has 0N/A * not been tested in a threaded environment.) 
0N/A * - Places where it used to print results to stdout now saves them in a 0N/A * list where they're used to set the MIME type in the Apache request 0N/A * - Command-line flags have been removed since they will never be used here. 0N/A * Ian Kluft <ikluft@cisco.com> 0N/A * Engineering Information Framework 0N/A * Central Engineering 0N/A * Cisco Systems, Inc. 0N/A * Misc bug fixes May 1997 0N/A * Submission to Apache Group July 1997 0N/A * data structures and related constants 0N/A/* HOWMANY must be at least 4096 to make gzip -dcq work */ 0N/A/* SMALL_HOWMANY limits how much work we do to figure out text files */ 0N/A#
define MAXDESC 50 /* max length of text description */ 0N/A int lineno;
/* line number from magic file */ 0N/A#
define INDIR 1 /* if '>(...)' appears, */ 900N/A unsigned char reln;
/* relation (0=eq, '>'=gt, etc) */ 900N/A char type;
/* int, short, long or string. */ 0N/A char vallen;
/* length of string value, if any */ 0N/A unsigned char hs[
2];
/* 2 bytes of a fixed-endian "short" */ 0N/A unsigned char hl[
4];
/* 4 bytes of a fixed-endian "long" */ 0N/A unsigned long mask;
/* mask before comparison with value */ 0N/A /* NOTE: this string is suspected of overrunning - find it! */ 0N/A * data structures for tar file recognition 0N/A * -------------------------------------------------------------------------- 0N/A * Header file for public domain tar (tape archive) program. 0N/A * @(#)tar.h 1.20 86/10/29 Public Domain. Created 25 August 1985 by John 0N/A * Gilmore, ihnp4!hoptoad!gnu. 1426N/A * I'm going to use traditional DP naming conventions here. A "block" is a big 1426N/A * chunk of stuff that we do I/O on. A "record" is a piece of info that we 0N/A * care about. Typically many "record"s fit into a "block". 1426N/A/* The magic field is filled with this if uname and gname are valid. */ 726N/A * file-function prototypes 726N/A unsigned char **,
int);
726N/A * includes for ASCII substring recognition formerly "names.h" in file 0N/A * Original notes: names and types used by ascmagic in file(1). These tokens are 0N/A * here because they can appear anywhere in the first HOWMANY bytes, while 0N/A * tokens in /etc/magic must appear at fixed offsets into the file. Don't 0N/A * make HOWMANY too high unless you have a very fast CPU. 0N/A/* these types are used to index the ap_table_t 'types': keep em in sync! */ 0N/A/* HTML inserted in first because this is a web server module now */ 0N/A#
define L_C 1 /* first and foremost on UNIX */ /* These must be sorted by eye for optimal hit rate */ /* Add to this list only after substantial meditation */ },
/* must precede "The", "the", etc. */ * Too many files of text have these words in them. Find another way to * Result String List (RSL) * The file(1) command prints its output. Instead, we store the various * "printed" strings in a list (allocating memory as we go) and concatenate * them at the end when we finally know how much space they'll need. char *
str;
/* string, possibly a fragment */ * Apache module configuration structures struct magic *
magic;
/* head of magic config list */ * configuration functions - called by Apache API routines /* allocate the config - use pcalloc because it needs to be zeroed */ return MODNAME ": server structure not allocated";
* configuration file commands - exported to Apache API "Path to MIME Magic file (in file(1) format)"},
* RSL (result string list) processing routines * These collect strings that would have been printed in fragments by file(1) * into a list of magic_rsl structures with the strings. When complete, * they're concatenated together to become the MIME content and encoding * return value conventions for these functions: functions which return int: * failure = -1, other = result functions which return pointers: failure = 0, /* allocate a per-request structure and put it in the request record */ /* add a string to the result string list for this request */ /* it is the responsibility of the caller to allocate "str" */ /* make sure we have a list to put it in */ MODNAME ": request config should not be NULL");
/* allocate the list entry */ /* RSL hook for puts-type functions */ /* RSL hook for printf-type functions */ /* assemble the string into the buffer */ /* add the buffer to the list */ /* RSL hook for putchar-type functions */ /* high overhead for 1 char - just hope they don't do this much */ /* allocate and copy a contiguous string from a result string list */ char *
result;
/* return value */ cur_pos,
/* current position within fragment */ res_pos;
/* position in result string */ /* allocate the result string */ /* loop through and collect the string */ /* loop to the first fragment */ /* loop through and collect chars */ /* clean up and return */ /* states for the state-machine algorithm in magic_rsl_to_request() */ /* process the RSL and set the MIME info in the request record */ cur_pos,
/* current position within fragment */ type_frag,
/* content type starting point: fragment */ type_pos,
/* content type starting point: position */ encoding_pos,
/* content encoding starting point: position */ /* check if we have a result */ /* empty - no match, we defer to other Apache modules */ /* start searching for the type and encoding */ /* loop through the characters in the fragment */ /* process whitespace actions for each state */ /* eat whitespace in this state */ /* whitespace: type has no slash! */ /* whitespace: end of MIME type */ /* eat whitespace in this state */ /* whitespace: end of MIME encoding */ /* should not be possible */ /* abandon malfunctioning module */ /* copy the char and go to rsl_subtype state */ /* process non-space actions for each state */ /* non-space: begin MIME type */ /* non-space: adds to type */ /* non-space: begin MIME encoding */ /* non-space: adds to encoding */ /* should not be possible */ /* abandon malfunctioning module */ /* if we ended prior to state rsl_subtype, we had incomplete info */ /* defer to other modules */ /* save the info in the request record */ /* XXX: this could be done at config time I'm sure... but I'm * confused by all this magic_rsl stuff. -djg */ /* XXX: this could be done at config time I'm sure... but I'm * confused by all this magic_rsl stuff. -djg */ /* detect memory allocation errors */ * magic_process - process input file r Apache API request record * (formerly called "process" in file command, prefix added for clarity) Opens * the file and reads a fixed-size buffer to begin processing the contents. unsigned char buf[
HOWMANY +
1];
/* one extra for terminating '\0' */ int nbytes = 0;
/* number of bytes read from a datafile */ * first try judging the file based on its filesystem status /* fatal error, bail out */ /* We can't open it, but we were able to stat it. */ /* let some other handler decide what the problem is */ * try looking at the first HOWMANY bytes buf[
nbytes++] =
'\0';
/* null-terminate it */ * try tests in /etc/magic (or surrogate magic file) * try known keywords, check for ascii-ness too. * abandon hope, all ye who remain here * apprentice - load configuration from the magic file r /* set up the magic list (empty) */ /* skip leading whitespace */ /* comment, do not parse */ /* if we get here, we're going to use it so count it */ MODNAME ": apprentice conf=%x file=%s m=%s m->next=%s last=%s",
MODNAME ": apprentice read %d lines, %d rules, %d errors",
if (
ap_isprint((((
unsigned long) m) >>
24) &
255) &&
MODNAME ": apprentice: POINTER CLOBBERED! " "m=\"%c%c%c%c\" line=%d",
(((
unsigned long) m) >>
24) &
255,
(((
unsigned long) m) >>
16) &
255,
(((
unsigned long) m) >>
8) &
255,
((
unsigned long) m) &
255,
* extend the sign bit if the comparison is to be signed * Do not remove the casts below. They are vital. When later * compared with the data, the sign extension must have happened. * parse one line from magic file, put into magic[index++] if valid /* allocate magic structure entry */ /* append to linked list */ /* set values in magic structure */ /* get offset, then skip over it */ MODNAME ": indirect offset type %c invalid", *l);
if (*l ==
'+' || *l ==
'-')
MODNAME ": missing ')' in indirect offset");
/* New-style anding: "0 byte&0x80 =0x80 dynamically linked" */ /* Old-style anding: "0 byte &0x80 dynamically linked" */ if (*l ==
'x' &&
ap_isspace((
unsigned char) l[
1])) {
* now get last part - the description else if ((l[0] ==
'\\') && (l[
1] ==
'b')) {
MODNAME ": parse line=%d m=%x next=%x cont=%d desc=%s",
#
endif /* MIME_MAGIC_DEBUG */ * Read a numeric value from a pointer, into the value union of a magic * pointer, according to the magic type. Update the string pointer to point * just after the number read. Return 0 for success, non-zero for failure. * Convert a string containing C character escapes. Stop at an unescaped * space or tab. Copy the converted version to "p", returning its length in * *slen. Return updated scan pointer as function result. while ((c = *s++) !=
'\0') {
/* \ and up to 3 octal digits */ c = *s++;
/* try for 2 */ if (c >=
'0' && c <=
'7') {
c = *s++;
/* try for 3 */ if (c >=
'0' && c <=
'7')
/* \x and up to 3 hex digits */ val =
'x';
/* Default if no digits */ /* Single hex char to int; -1 if not a hex char. */ if ((c >=
'a') && (c <=
'f'))
if ((c >=
'A') && (c <=
'F'))
* return DONE to indicate it's been handled * return OK to indicate it's a regular file still needing handling * other returns indicate a failure of some sort * (void) magic_rsl_printf(r,"character special (%d/%d)", * major(sb->st_rdev), minor(sb->st_rdev)); * (void) magic_rsl_printf(r,"block special (%d/%d)", * major(sb->st_rdev), minor(sb->st_rdev)); /* TODO add code to handle V7 MUX and Blit MUX files */ * magic_rsl_puts(r,"fifo (named pipe)"); /* We used stat(), the only possible reason for this is that the * regular file, check next possibility * softmagic - lookup one file in database (already read from /etc/magic by * apprentice.c). Passed the name and FILE * of one file to be typed. /* ARGSUSED1 *//* nbytes passed for regularity, maybe need later */ * Go through the whole list, stopping if you find a match. Process all the * continuations of that match before returning. * We support multi-level continuations: * At any time when processing a successful top-level match, there is a current * continuation level; it represents the level of the last successfully * Continuations above that level are skipped as, if we see one, it means that * the continuation that controls them - i.e, the lower-level continuation * preceding them - failed to match. * Continuations below that level are processed as, if we see one, it means * we've finished processing or skipping higher-level continuations under the * control of a successful or unsuccessful lower-level continuation, and are * now seeing the next lower-level continuation and should process it. The * current continuation level reverts to the level of the one we're seeing. * Continuations at the current level are processed as, if we see one, there's * no lower-level continuation that may have failed. * If a continuation matches, we bump the current continuation level so that * higher-level continuations are processed. MODNAME ": match conf=%x file=%s m=%s m->next=%s last=%s",
if (
ap_isprint((((
unsigned long) m) >>
24) &
255) &&
MODNAME ": match: POINTER CLOBBERED! " (((
unsigned long) m) >>
24) &
255,
(((
unsigned long) m) >>
16) &
255,
(((
unsigned long) m) >>
8) &
255,
((
unsigned long) m) &
255);
/* check if main entry matches */ * main entry didn't match, flush its continuations MODNAME ": line=%d mc=%x mc->next=%x cont=%d desc=%s",
* this trick allows us to keep *m in sync when the continue /* if we get here, the main entry rule was a match */ /* this will be the last run through the loop */ MODNAME ": rule matched, line=%d type=%d %s",
* If we printed something, we'll need to print a blank before we /* and any continuations that match */ * while (m && m->next && m->next->cont_level != 0 && ( m = m->next MODNAME ": match line=%d cont=%d type=%d %s",
* We're at the end of the level "cont_level" * This continuation matched. Print its message, with a * blank before it if the previous item printed and this /* space if previous printed */ * If we see any continuations at a higher level, process /* move to next continuation record */ return 1;
/* all through */ return 0;
/* no match at all */ /* XXX: not multithread safe */ MODNAME ": invalid m->type (%d) in mprint().",
* Convert the byte order of the data we are looking at /* Null terminate and eat the return */ p->s[
sizeof(p->s) -
1] =
'\0';
p->h = (
short) ((p->
hs[0] <<
8) | (p->
hs[
1]));
((p->
hl[0] <<
24) | (p->
hl[
1] <<
16) | (p->
hl[
2] <<
8) | (p->
hl[
3]));
p->h = (
short) ((p->
hs[
1] <<
8) | (p->
hs[0]));
((p->
hl[
3] <<
24) | (p->
hl[
2] <<
16) | (p->
hl[
1] <<
8) | (p->
hl[0]));
register unsigned long l = m->
value.l;
register unsigned long v;
if ((m->
value.s[0] ==
'x') && (m->
value.s[
1] ==
'\0')) {
* What we want here is: v = strncmp(m->value.s, p->s, m->vallen); * but ignoring any nulls. bcmp doesn't give -/+/0 and isn't * universally available anyway. register unsigned char *a = (
unsigned char *) m->
value.s;
register unsigned char *b = (
unsigned char *) p->s;
if ((v = *b++ - *a++) != 0)
/* bogosity, pretend that it just wasn't a match */ "((%lx & %lx) == %lx) = %d", v, l, l,
matched);
"((%lx & %lx) != %lx) = %d", v, l, l,
matched);
/* bogosity, pretend it didn't match */ MODNAME ": mcheck: can't happen: invalid relation %d.",
/* an optimization over plain strcmp() */ #
define STREQ(a, b) (*(a) == *(b) &&
strcmp((a), (b)) == 0)
char nbuf[
HOWMANY +
1];
/* one extra for terminating '\0' */ register struct names *p;
/* these are easy, do them first */ * for troff, look for . + letter + letter or .\"; this must be done to * disambiguate tar archives' ./file and other trash from real troff unsigned char *
tp =
buf +
1;
++
tp;
/* skip leading whitespace */ /* look for tokens from names.h - this is expensive!, so we'll limit * ourselves to only SMALL_HOWMANY bytes */ /* make a copy of the buffer here because strtok() will destroy it */ /* XXX: not multithread safe */ s =
NULL;
/* make strtok() keep on tokin' */ /* all else fails, but it is ascii... */ /* text with escape sequences */ /* we leave this open for further differentiation later */ * compress routines: zmagic() - returns 0 if not recognized, uncompresses * and prints information if recognized uncompress(s, method, old, n, newch) * - uncompress old into new, using method, return sizeof new /* we use gzip here rather than uncompress because we have to pass * it a full filename -- and uncompress only considers filenames * XXX pcat does not work, cause I don't know how to make it read stdin, for (i = 0; i <
ncompr; i++) {
/* set encoding type in the request record */ /* Something bad happened, tell the world. */ "couldn't setup child process: %s", r->
filename);
/* Bad things happened. Everyone should have cleaned up. */ MODNAME ": could not execute `%s'.",
/* Fill in BUFF structure for parents pipe to child's stdout */ unsigned char **
newch,
int n)
/* We make a sub_pool so that we can collect our child early, otherwise * there are cases (e.g. generating directory indices with mod_autoindex) * where we would end up with LOTS of zombies. MODNAME ": couldn't spawn uncompress process: %s", r->
uri);
* is_tar() -- figure out whether file is a tar archive. * Stolen (by author of file utility) from the public domain tar program: Public * Domain version written 26 Aug 1985 John Gilmore (ihnp4!hoptoad!gnu). * 1997/06/24 00:41:02 ikluft Exp ikluft $ * Comments changed and some code/comments reformatted for file command by Ian #
define isodigit(c) ( ((c) >=
'0') && ((c) <=
'7') )
* Return 0 if the checksum is bad (i.e., probably not a tar archive), 1 for * old UNIX tar file, 2 for Unix Std (POSIX) tar file. for (i =
sizeof(
union record); --i >= 0;) {
* We can't use unsigned char here because of old compilers, e.g. V7. /* Adjust checksum to count the "chksum" field as blanks. */ return 0;
/* Not a tar archive */ return 2;
/* Unix Standard tar archive */ return 1;
/* Old fashioned tar archive */ * Quick and dirty octal conversion. * Result is -1 if the field is invalid (all blank, or nonoctal). return -
1;
/* All blank field */ * Check for file-revision suffix * This is for an obscure document control system used on an intranet. * The web representation of each file's revision has an @1, @2, etc * appended with the revision number. This needs to be stripped off to * find the file suffix, which can be recognized by sending the name back * through a sub-request. The base file name (without the @num suffix) * must exist because its type will be used as the result. #
endif /* MIME_MAGIC_DEBUG */ /* check for recognized revision suffix */ /* perform sub-request for the file name without the suffix */ #
endif /* MIME_MAGIC_DEBUG */#
endif /* MIME_MAGIC_DEBUG */#
endif /* MIME_MAGIC_DEBUG */ /* inherits from the parent */ if (
ap_isprint((((
unsigned long) m) >>
24) &
255) &&
MODNAME ": magic_init 1: POINTER CLOBBERED! " "m=\"%c%c%c%c\" line=%d",
(((
unsigned long) m) >>
24) &
255,
(((
unsigned long) m) >>
16) &
255,
(((
unsigned long) m) >>
8) &
255,
((
unsigned long) m) &
255,
* Find the Content-Type from any resource this module has available /* the file has to exist */ /* was someone else already here? */ /* initialize per-request info */ /* try excluding file-revision suffixes */ /* process it based on the file contents */ /* if we have any results, put them in the request structure */ * Apache API module interface NULL,
/* dir config creator */ NULL,
/* dir merger --- default is to override */