/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @(#)MimeUtility.java 1.45 03/03/10
*/
/**
* This is a utility class that provides various MIME related
* functionality. <p>
*
* There are a set of methods to encode and decode MIME headers as
* per RFC 2047. A brief description on handling such headers is
* given below: <p>
*
* RFC 822 mail headers <strong>must</strong> contain only US-ASCII
* characters. Headers that contain non US-ASCII characters must be
* encoded so that they contain only US-ASCII characters. Basically,
* this process involves using either BASE64 or QP to encode certain
* characters. RFC 2047 describes this in detail. <p>
*
* In Java, Strings contain (16 bit) Unicode characters. ASCII is a
* subset of Unicode (and occupies the range 0 - 127). A String
* that contains only ASCII characters is already mail-safe. If the
* String contains non US-ASCII characters, it must be encoded. An
* additional complexity in this step is that since Unicode is not
* yet a widely used charset, one might want to first charset-encode
* the String into another charset and then do the transfer-encoding.
* <p>
* Note that to get the actual bytes of a mail-safe String (say,
* for sending over SMTP), one must do
* <p><blockquote><pre>
*
* byte[] bytes = string.getBytes("iso-8859-1");
*
* </pre></blockquote><p>
*
* The <code>setHeader</code> and <code>addHeader</code> methods
* on MimeMessage and MimeBodyPart assume that the given header values
* are Unicode strings that contain only US-ASCII characters. Hence
* the callers of those methods must ensure that the values they pass
* do not contain non US-ASCII characters. The methods in this class
* help do this. <p>
*
* The <code>getHeader</code> family of methods on MimeMessage and
* MimeBodyPart return the raw header value. These might be encoded
* as per RFC 2047, and if so, must be decoded into Unicode Strings.
* The methods in this class help to do this. <p>
*
* Several System properties control strict conformance to the MIME
* spec. Note that these are not session properties but must be set
* globally as System properties. <p>
*
* The <code>mail.mime.decodetext.strict</code> property controls
* decoding of MIME encoded words. The MIME spec requires that encoded
* words start at the beginning of a whitespace separated word. Some
* mailers incorrectly include encoded words in the middle of a word.
* If the <code>mail.mime.decodetext.strict</code> System property is
* set to <code>"false"</code>, an attempt will be made to decode these
* illegal encoded words. The default is true. <p>
*
* The <code>mail.mime.encodeeol.strict</code> property controls the
* choice of Content-Transfer-Encoding for MIME parts that are not of
* type "text". Often such parts will contain textual data for which
* an encoding that allows normal end of line conventions is appropriate.
* In rare cases, such a part will appear to contain entirely textual
* data, but will require an encoding that preserves CR and LF characters
* without change. If the <code>mail.mime.encodeeol.strict</code>
* System property is set to <code>"true"</code>, such an encoding will
* be used when necessary. The default is false. <p>
*
* In addition, the <code>mail.mime.charset</code> System property can
* be used to specify the default MIME charset to use for encoded words
* and text parts that don't otherwise specify a charset. Normally, the
* default MIME charset is derived from the default Java charset, as
* specified in the <code>file.encoding</code> System property. Most
* applications will have no need to explicitly set the default MIME
* charset. In cases where the default MIME charset to be used for
* mail messages is different than the charset used for files stored on
* the system, this property should be set.
*
* @version 1.45, 03/03/10
* @author John Mani
* @author Bill Shannon
*/
public class MimeUtility {
// This class cannot be instantiated: the constructor below is private
// and has an empty body, so it exists only to block "new MimeUtility()"
// from code outside this class.
private MimeUtility() { }
// Strict decoding of encoded words (the class comment associates this
// with the mail.mime.decodetext.strict System property). When false,
// the decoding code also tries to decode encoded words that appear
// inside a word.
private static boolean decodeStrict = true;
// Strict end-of-line checking (the class comment associates this with
// the mail.mime.encodeeol.strict System property). Passed to
// AsciiOutputStream when examining data that is not "text".
private static boolean encodeEolStrict = false;
// Consulted by the encoder when writing anything other than the first
// line of an encoded sequence.
// NOTE(review): presumably selects whether a line break is inserted
// between encoded words - confirm against the encoder.
private static boolean foldEncodedWords = false;
// When false, the header folding and unfolding routines return their
// argument unchanged.
private static boolean foldText = true;
// Static initializer for the four flags above. The comments inside the
// try block record each flag's default, in declaration order.
// NOTE(review): no statements are present inside the try block as
// written, so the flags keep the initial values assigned above -
// confirm that this is intended.
static {
try {
// default to true
// default to false
// default to false
// default to true
} catch (SecurityException sex) {
// ignore it
}
}
/**
* Get the content-transfer-encoding that should be applied
* to the input stream of this datasource, to make it mailsafe. <p>
*
* The algorithm used here is: <br>
* <ul>
* <li>
* If the primary type of this datasource is "text" and if all
* the bytes in its input stream are US-ASCII, then the encoding
* is "7bit". If more than half of the bytes are non-US-ASCII, then
* the encoding is "base64". If less than half of the bytes are
* non-US-ASCII, then the encoding is "quoted-printable".
* <li>
* If the primary type of this datasource is not "text", then if
* all the bytes of its input stream are US-ASCII, the encoding
* is "7bit". If there is even one non-US-ASCII character, the
* encoding is "base64".
* </ul>
*
* @param ds DataSource
* @return the encoding. This is either "7bit",
* "quoted-printable" or "base64"
*/
try {
return "base64"; // what else ?!
}
// if not text, stop processing when we see non-ASCII
switch (i) {
case ALL_ASCII:
break;
case MOSTLY_ASCII:
break;
default:
break;
}
// Close the input stream
try {
} catch (IOException ioex) { }
return encoding;
}
/**
* Same as <code>getEncoding(DataSource)</code> except that instead
* of reading the data from an <code>InputStream</code> it uses the
* <code>writeTo</code> method to examine the data. This is more
* efficient in the common case of a <code>DataHandler</code>
* created with an object and a MIME type (for example, a
* "text/plain" String) because all the I/O is done in this
* thread. In the case requiring an <code>InputStream</code> the
* <code>DataHandler</code> uses a thread, a pair of pipe streams,
* and the <code>writeTo</code> method to produce the data. <p>
*
* @since JavaMail 1.2
*/
/*
* Try to pick the most efficient means of determining the
* encoding. If this DataHandler was created using a DataSource,
* the getEncoding(DataSource) method is typically faster. If
* the DataHandler was created with an object, this method is
* much faster. To distinguish the two cases, we use a heuristic.
* A DataHandler created with an object will always have a null name.
* A DataHandler created with a DataSource will usually have a
* non-null name.
*
* XXX - This is actually quite a disgusting hack, but it makes
* a common case run over twice as fast.
*/
try {
return "base64"; // what else ?!
}
// Check all of the available bytes
try {
case ALL_ASCII:
break;
case MOSTLY_ASCII:
break;
default:
break;
}
} else { // not "text"
// Check all of available bytes, break out if we find
// at least one non-US-ASCII character
new AsciiOutputStream(true, encodeEolStrict);
try {
encoding = "7bit";
else // found atleast one non-ascii character, use b64
encoding = "base64";
}
return encoding;
}
/**
* Decode the given input stream. The Input stream returned is
* the decoded input stream. All the encodings defined in RFC 2045
* are supported here. They include "base64", "quoted-printable",
* "7bit", "8bit", and "binary". In addition, "uuencode" is also
* supported.
*
* @param is input stream
* @param encoding the encoding of the stream.
* @return decoded input stream.
*/
throws MessagingException {
return new BASE64DecoderStream(is);
return new QPDecoderStream(is);
return new UUDecoderStream(is);
return is;
else
}
/**
* Wrap an encoder around the given output stream.
* All the encodings defined in RFC 2045 are supported here.
* They include "base64", "quoted-printable", "7bit", "8bit" and
* "binary". In addition, "uuencode" is also supported.
*
* @param os output stream
* @param encoding the encoding of the stream.
* @return output stream that applies the
* specified encoding.
*/
throws MessagingException {
return os;
return new BASE64EncoderStream(os);
return new QPEncoderStream(os);
return new UUEncoderStream(os);
return os;
else
}
/**
* Wrap an encoder around the given output stream.
* All the encodings defined in RFC 2045 are supported here.
* They include "base64", "quoted-printable", "7bit", "8bit" and
* "binary". In addition, "uuencode" is also supported.
* The <code>filename</code> parameter is used with the "uuencode"
* encoding and is included in the encoded output.
*
* @param os output stream
* @param encoding the encoding of the stream.
* @param filename name for the file being encoded (only used
* with uuencode)
* @return output stream that applies the
* specified encoding.
* @since JavaMail 1.2
*/
throws MessagingException {
return os;
return new BASE64EncoderStream(os);
return new QPEncoderStream(os);
return os;
else
}
/**
* Encode a RFC 822 "text" token into mail-safe form as per
* RFC 2047. <p>
*
* The given Unicode string is examined for non US-ASCII
* characters. If the string contains only US-ASCII characters,
* it is returned as-is. If the string contains non US-ASCII
* characters, it is first character-encoded using the platform's
* default charset, then transfer-encoded using either the B or
* Q encoding. The resulting bytes are then returned as a Unicode
* string containing only ASCII characters. <p>
*
* Note that this method should be used to encode only
* "unstructured" RFC 822 headers. <p>
*
* Example of usage:
* <p><blockquote><pre>
*
* MimeBodyPart part = ...
* String rawvalue = "FooBar Mailer, Japanese version 1.1"
* try {
* // If we know for sure that rawvalue contains only US-ASCII
* // characters, we can skip the encoding part
* part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
* } catch (UnsupportedEncodingException e) {
* // encoding failure
* } catch (MessagingException me) {
* // setHeader() failure
* }
*
* </pre></blockquote><p>
*
* @param text unicode string
* @return Unicode string containing only US-ASCII characters
* @exception UnsupportedEncodingException if the encoding fails
*/
throws UnsupportedEncodingException {
}
/**
* Encode a RFC 822 "text" token into mail-safe form as per
* RFC 2047. <p>
*
* The given Unicode string is examined for non US-ASCII
* characters. If the string contains only US-ASCII characters,
* it is returned as-is. If the string contains non US-ASCII
* characters, it is first character-encoded using the specified
* charset, then transfer-encoded using either the B or Q encoding.
* The resulting bytes are then returned as a Unicode string
* containing only ASCII characters. <p>
*
* Note that this method should be used to encode only
* "unstructured" RFC 822 headers.
*
* @param text the header value
* @param charset the charset. If this parameter is null, the
* platform's default charset is used.
* @param encoding the encoding to be used. Currently supported
* values are "B" and "Q". If this parameter is null, then
* the "Q" encoding is used if most of characters to be
* encoded are in the ASCII charset, otherwise "B" encoding
* is used.
* @return Unicode string containing only US-ASCII characters
*/
throws UnsupportedEncodingException {
}
/**
* Decode "unstructured" headers, that is, headers that are defined
* as '*text' as per RFC 822. <p>
*
* The string is decoded using the algorithm specified in
* RFC 2047, Section 6.1.1. If the charset-conversion fails
* for any sequence, an UnsupportedEncodingException is thrown.
* If the String is not an RFC 2047 style encoded header, it is
* returned as-is. <p>
*
* Example of usage:
* <p><blockquote><pre>
*
* MimeBodyPart part = ...
* String rawvalue = null;
* String value = null;
* try {
* if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
* value = MimeUtility.decodeText(rawvalue);
* } catch (UnsupportedEncodingException e) {
* // Don't care
* value = rawvalue;
* } catch (MessagingException me) { }
*
* return value;
*
* </pre></blockquote><p>
*
* @param etext the possibly encoded value
* @exception UnsupportedEncodingException if the charset
* conversion failed.
*/
throws UnsupportedEncodingException {
/*
* We look for sequences separated by "linear-white-space".
* (as per RFC 2047, Section 6.1.1)
* RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
*/
/*
* First, lets do a quick run thru the string and check
* whether the sequence "=?" exists at all. If none exists,
* we know there are no encoded-words in here and we can just
* return the string as-is, without suffering thru the later
* decoding logic.
* This handles the most common case of unencoded headers
* efficiently.
*/
return etext;
// Encoded words found. Start decoding ...
boolean prevWasEncoded = false;
while (st.hasMoreTokens()) {
char c;
// If whitespace, append it to the whitespace buffer
(c == '\r') || (c == '\n'))
else {
// Check if token is an 'encoded-word' ..
try {
word = decodeWord(s);
// Yes, this IS an 'encoded-word'.
// if the previous word was also encoded, we
// should ignore the collected whitespace. Else
// we include the whitespace as well.
}
prevWasEncoded = true;
} catch (ParseException pex) {
// This is NOT an 'encoded-word'.
word = s;
// possibly decode inner encoded words
if (!decodeStrict)
// include collected whitespace ..
prevWasEncoded = false;
}
}
}
}
/**
* Encode a RFC 822 "word" token into mail-safe form as per
* RFC 2047. <p>
*
* The given Unicode string is examined for non US-ASCII
* characters. If the string contains only US-ASCII characters,
* it is returned as-is. If the string contains non US-ASCII
* characters, it is first character-encoded using the platform's
* default charset, then transfer-encoded using either the B or
* Q encoding. The resulting bytes are then returned as a Unicode
* string containing only ASCII characters. <p>
*
* This method is meant to be used when creating RFC 822 "phrases".
* The InternetAddress class, for example, uses this to encode
* its 'phrase' component.
*
* @param text unicode string
* @return Array of Unicode strings containing only US-ASCII
* characters.
* @exception UnsupportedEncodingException if the encoding fails
*/
throws UnsupportedEncodingException {
}
/**
* Encode a RFC 822 "word" token into mail-safe form as per
* RFC 2047. <p>
*
* The given Unicode string is examined for non US-ASCII
* characters. If the string contains only US-ASCII characters,
* it is returned as-is. If the string contains non US-ASCII
* characters, it is first character-encoded using the specified
* charset, then transfer-encoded using either the B or Q encoding.
* The resulting bytes are then returned as a Unicode string
* containing only ASCII characters. <p>
*
* @param text unicode string
* @param charset the MIME charset
* @param encoding the encoding to be used. Currently supported
* values are "B" and "Q". If this parameter is null, then
* the "Q" encoding is used if most of characters to be
* encoded are in the ASCII charset, otherwise "B" encoding
* is used.
* @return Unicode string containing only US-ASCII characters
* @exception UnsupportedEncodingException if the encoding fails
*/
throws UnsupportedEncodingException {
}
/*
* Encode the given string. The parameter 'encodingWord' should
* be true if a RFC 822 "word" token is being encoded and false if a
* RFC 822 "text" token is being encoded. This is because the
* "Q" encoding defined in RFC 2047 has more restrictions when
* encoding "word" tokens. (Sigh)
*/
throws UnsupportedEncodingException {
// If 'string' contains only US-ASCII characters, just
// return it.
return string;
// Else, apply the specified charset conversion.
} else // MIME charset -> java charset
// If no transfer-encoding is specified, figure one out.
if (ascii != MOSTLY_NONASCII)
encoding = "Q";
else
encoding = "B";
}
boolean b64;
b64 = true;
b64 = false;
else
throw new UnsupportedEncodingException(
"Unknown transfer encoding: " + encoding);
// As per RFC 2047, size of an encoded string should not
// exceed 75 bytes.
// 7 = size of "=?", '?', 'B'/'Q', '?', "?="
true, encodingWord, outb);
}
throws UnsupportedEncodingException {
// First find out what the length of the encoded version of
// 'string' would be.
int len;
if (b64) // "B" encoding
else // "Q"
int size;
// If the length is greater than 'avail', split 'string'
// into two and recurse.
} else {
// length <= than 'avail'. Encode the given string
if (b64) // "B" encoding
else // "Q" encoding
try { // do the encoding
} catch (IOException ioex) { }
// Now write out the encoded (all ASCII) bytes into our
// StringBuffer
if (!first) // not the first line of this sequence
if (foldEncodedWords)
else
}
}
/**
* The string is parsed using the rules in RFC 2047 for parsing
* an "encoded-word". If the parse fails, a ParseException is
* thrown. Otherwise, it is transfer-decoded, and then
* charset-converted into Unicode. If the charset-conversion
* fails, an UnsupportedEncodingException is thrown.<p>
*
* @param eword the possibly encoded value
* @exception ParseException if the string is not an
* encoded-word as per RFC 2047.
* @exception UnsupportedEncodingException if the charset
* conversion failed.
*/
throws ParseException, UnsupportedEncodingException {
throw new ParseException();
// get charset
throw new ParseException();
// get encoding
throw new ParseException();
// get encoded-sequence
throw new ParseException();
try {
// Extract the bytes from word
// Get the appropriate decoder
else
throw new UnsupportedEncodingException(
"unknown encoding: " + encoding);
// For b64 & q, size of decoded word <= size of word. So
// the decoded bytes must fit into the 'bytes' array. This
// is certainly more efficient than writing bytes into a
// ByteArrayOutputStream and then pulling out the byte[]
// from it.
// count is set to the actual number of decoded bytes
// Finally, convert the decoded bytes into a String using
// the specified charset
// there's still more text in the string
if (!decodeStrict)
s += rest;
}
return s;
} catch (UnsupportedEncodingException uex) {
// explicitly catch and rethrow this exception, otherwise
// the below IOException catch will swallow this up!
throw uex;
} catch (IOException ioex) {
// Shouldn't happen.
throw new ParseException();
} catch (IllegalArgumentException iex) {
/* An unknown charset of the form ISO-XXX-XXX, will cause
* the JDK to throw an IllegalArgumentException ... Since the
* JDK will attempt to create a classname using this string,
* but valid classnames must not contain the character '-',
* and this results in an IllegalArgumentException, rather than
* the expected UnsupportedEncodingException. Yikes
*/
throw new UnsupportedEncodingException();
}
}
/**
* Look for encoded words within a word. The MIME spec doesn't
* allow this, but many broken mailers, especially Japanese mailers,
* produce such incorrect encodings.
*/
throws UnsupportedEncodingException {
int start = 0, i;
if (end < 0)
break;
try {
s = decodeWord(s);
} catch (ParseException pex) {
// ignore it, just use the original string
}
}
if (start == 0)
return word;
}
/**
* A utility method to quote a word, if the word contains any
* characters from the specified 'specials' list.<p>
*
* The <code>HeaderTokenizer</code> class defines two special
* sets of delimiters - MIME and RFC 822. <p>
*
* This method is typically used during the generation of
* RFC 822 and MIME header fields.
*
* @param word word to be quoted
* @param specials the set of special characters
* @return the possibly quoted word
* @see javax.mail.internet.HeaderTokenizer#MIME
* @see javax.mail.internet.HeaderTokenizer#RFC822
*/
/*
* Look for any "bad" characters, Escape and
* quote the entire string if necessary.
*/
boolean needQuoting = false;
for (int i = 0; i < len; i++) {
if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
// need to escape them and then quote the whole string
int lastc = 0;
for (int j = i; j < len; j++) {
; // do nothing, CR was already escaped
else
}
// These characters cause the string to be quoted
needQuoting = true;
}
if (needQuoting) {
} else
return word;
}
/**
* Fold a string at linear whitespace so that each line is no longer
* than 76 characters, if possible. If there are more than 76
* non-whitespace characters consecutively, the string is folded at
* the first whitespace after that sequence. The parameter
* <code>used</code> indicates how many characters have been used in
* the current line; it is usually the length of the header name. <p>
*
* Note that line breaks in the string aren't escaped; they probably
* should be.
*
* @param used characters used in line so far
* @param s the string to fold
* @return the folded string
*/
if (!foldText)
return s;
int end;
char c;
// Strip trailing spaces
if (c != ' ' && c != '\t')
break;
}
// if the string fits now, just return it
return s;
// have to actually fold the string
char lastc = 0;
int lastspace = -1;
for (int i = 0; i < s.length(); i++) {
break;
c = s.charAt(i);
if (c == ' ' || c == '\t')
lastspace = i;
lastc = c;
}
if (lastspace == -1) {
// no space, use the whole thing
s = "";
used = 0;
break;
}
used = 1;
}
}
/**
* Unfold a folded header. Any line breaks that aren't escaped and
* are followed by whitespace are removed.
*
* @param s the string to unfold
* @return the unfolded string
*/
if (!foldText)
return s;
int i;
int start = i;
int l = s.length();
i++; // skip CR or NL
i++; // skip LF
char c;
// if next line starts with whitespace, skip all of it
// XXX - always has to be true?
i++; // skip whitespace
i++;
if (start != 0) {
}
s = s.substring(i);
continue;
}
// it's not a continuation line, just leave it in
s = s.substring(i);
} else {
// there's a backslash at "start - 1"
// strip it out, but leave in the line break
s = s.substring(i);
}
}
} else
return s;
}
/**
* Return the first index of any of the characters in "any" in "s",
* or -1 if none are found.
*
* This should be a method on String.
*/
}
try {
return i;
}
return -1;
} catch (StringIndexOutOfBoundsException e) {
return -1;
}
}
/**
* Convert a MIME charset name into a valid Java charset name. <p>
*
* @param charset the MIME charset name
* @return the Java charset equivalent. If a suitable mapping is
* not available, the passed in charset is itself returned.
*/
// no mapping table, or charset parameter is null
return charset;
}
/**
* Convert a java charset into its MIME charset name. <p>
*
* Note that a future version of JDK (post 1.2) might provide
* this functionality, in which case, we may deprecate this
* method then.
*
* @param charset the JDK charset
* @return the MIME/IANA equivalent. If a mapping
* is not possible, the passed in charset itself
* is returned.
* @since JavaMail 1.1
*/
// no mapping table or charset param is null
return charset;
}
/**
* Get the default charset corresponding to the system's current
* default locale. If the System property <code>mail.mime.charset</code>
* is set, a system charset corresponding to this MIME charset will be
* returned. <p>
*
* @return the default charset of the system's default locale,
* as a Java charset. (NOT a MIME charset)
* @since JavaMail 1.1
*/
if (defaultJavaCharset == null) {
/*
* If mail.mime.charset is set, it controls the default
* Java charset as well.
*/
return defaultJavaCharset;
}
try {
"8859_1");
} catch (SecurityException sex) {
class NullInputStream extends InputStream {
public int read() {
return 0;
}
}
new InputStreamReader(new NullInputStream());
if (defaultJavaCharset == null)
defaultJavaCharset = "8859_1";
}
}
return defaultJavaCharset;
}
/*
* Get the default MIME charset for this locale.
*/
if (defaultMIMECharset == null) {
}
if (defaultMIMECharset == null)
return defaultMIMECharset;
}
// Tables to map MIME charset names to Java names and vice versa.
// XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
static {
try {
// Use this class's classloader to load the mapping file
// XXX - we should use SecuritySupport, but it's in another package
// Load the JDK-to-MIME charset mapping table
// Load the MIME-to-JDK charset mapping table
}
// If we didn't load the tables, e.g., because we didn't have
// permission, load them manually. The entries here should be
// the same as the default javamail.charset.map.
}
}
}
while (true) {
try {
} catch (IOException ioex) {
break; // error in reading, stop
}
break;
// end of this table
break;
// ignore empty lines and comments
continue;
// A valid entry is of the form <key><separator><value>
// where, <separator> := SPACE | HT. Parse this
try {
} catch (NoSuchElementException nex) { }
}
}
/**
* Check if the given string contains non US-ASCII characters.
* @param s string
* @return ALL_ASCII if all characters in the string
* belong to the US-ASCII charset. MOSTLY_ASCII
* if more than half of the available characters
* are US-ASCII characters. Else MOSTLY_NONASCII.
*/
int l = s.length();
for (int i = 0; i < l; i++) {
non_ascii++;
else
ascii++;
}
if (non_ascii == 0)
return ALL_ASCII;
return MOSTLY_ASCII;
return MOSTLY_NONASCII;
}
/**
* Check if the given byte array contains non US-ASCII characters.
* Every element of the array is examined and tallied as either
* ASCII or non-ASCII.
*
* @param b byte array
* @return ALL_ASCII if all bytes in the array
* belong to the US-ASCII charset. MOSTLY_ASCII
* if more than half of the bytes
* are US-ASCII characters. Else MOSTLY_NONASCII.
*
* XXX - this method is no longer used
*/
static int checkAscii(byte[] b) {
// Walk the whole array, counting each byte in exactly one of the
// two tallies (non_ascii or ascii).
for (int i=0; i < b.length; i++) {
// The '&' operator automatically causes b[i] to be promoted
// to an int, and we mask out the higher bytes in the int
// so that the resulting value is not a negative integer.
non_ascii++;
else
ascii++;
}
// Not a single non-ASCII byte was counted.
if (non_ascii == 0)
return ALL_ASCII;
return MOSTLY_ASCII;
return MOSTLY_NONASCII;
}
/**
* Check if the given input stream contains non US-ASCII characters.
* Up to <code>max</code> bytes are checked. If <code>max</code> is
* set to <code>ALL</code>, then all the bytes available in this
* input stream are checked. If <code>breakOnNonAscii</code> is true
* the check terminates when the first non-US-ASCII character is
* found and MOSTLY_NONASCII is returned. Else, the check continues
* till <code>max</code> bytes or till the end of stream.
*
* @param is the input stream
* @param max maximum bytes to check for. The special value
* ALL indicates that all the bytes in this input
* stream must be checked.
* @param breakOnNonAscii if <code>true</code>, then terminate the
* check when the first non-US-ASCII character
* is found.
* @return ALL_ASCII if all characters in the string
* belong to the US-ASCII charset. MOSTLY_ASCII
* if more than half of the available characters
* are US-ASCII characters. Else MOSTLY_NONASCII.
*/
int len;
int block = 4096;
int linelen = 0;
if (max != 0) {
}
while (max != 0) {
try {
break;
int lastb = 0;
for (int i = 0; i < len; i++) {
// The '&' operator automatically causes b[i] to
// be promoted to an int, and we mask out the higher
// bytes in the int so that the resulting value is
// not a negative integer.
int b = buf[i] & 0xff;
if (checkEOL &&
badEOL = true;
if (b == '\r' || b == '\n')
linelen = 0;
else {
linelen++;
longLine = true;
}
if (nonascii(b)) { // non-ascii
if (breakOnNonAscii) // we are done
return MOSTLY_NONASCII;
else
non_ascii++;
} else
ascii++;
lastb = b;
}
} catch (IOException ioex) {
break;
}
}
// We have been told to break on the first non-ascii character.
// We haven't got any non-ascii character yet, but then we
// have not checked all of the available bytes either. So we
// cannot say for sure that this input stream is ALL_ASCII,
// and hence we must play safe and return MOSTLY_NONASCII
return MOSTLY_NONASCII;
// If we're looking at non-text data, and we saw CR without LF
// or vice versa, consider this mostly non-ASCII so that it
// will be base64 encoded (since the quoted-printable encoder
// doesn't encode this case properly).
if (badEOL)
return MOSTLY_NONASCII;
// if we've seen a long line, we degrade to mostly ascii
else if (longLine)
return MOSTLY_ASCII;
else
return ALL_ASCII;
}
return MOSTLY_ASCII;
return MOSTLY_NONASCII;
}
/**
 * Classify a single value as "non-ASCII" for the purposes of the
 * ASCII-ness checks in this file.
 *
 * A value counts as non-ASCII when it is DEL (0x7f) or anything above
 * it, or when it lies below SPACE (0x20) and is not one of CR, LF or
 * TAB. Values from SPACE up to and including '~' (0x7e) are ASCII.
 *
 * @param b the value to classify
 * @return <code>true</code> if the value is treated as non-ASCII,
 *         <code>false</code> otherwise
 */
static final boolean nonascii(int b) {
    // DEL and everything above it
    if (b >= 0x7f) {
        return true;
    }
    // SPACE through '~'
    if (b >= 0x20) {
        return false;
    }
    // Below SPACE: only CR, LF and TAB are acceptable
    boolean allowedControl = (b == '\r') || (b == '\n') || (b == '\t');
    return !allowedControl;
}
}
/**
* An OutputStream that determines whether the data written to
* it is all ASCII, mostly ASCII, or mostly non-ASCII.
*/
private boolean breakOnNonAscii;
private boolean longLine = false;
private boolean badEOL = false;
private boolean checkEOL = false;
this.breakOnNonAscii = breakOnNonAscii;
}
check(b);
}
}
check(b[i]);
}
b &= 0xff;
if (checkEOL &&
badEOL = true;
if (b == '\r' || b == '\n')
linelen = 0;
else {
linelen++;
longLine = true;
}
non_ascii++;
if (breakOnNonAscii) { // we are done
throw new EOFException();
}
} else
ascii++;
lastb = b;
}
/**
* Return ASCII-ness of data stream.
*
* @return <code>ret</code> unchanged when it is non-zero; otherwise
* one of <code>MimeUtility.ALL_ASCII</code>,
* <code>MimeUtility.MOSTLY_ASCII</code> or
* <code>MimeUtility.MOSTLY_NONASCII</code>
*/
public int getAscii() {
// A non-zero ret takes precedence over the flags examined below.
if (ret != 0)
return ret;
// If we're looking at non-text data, and we saw CR without LF
// or vice versa, consider this mostly non-ASCII so that it
// will be base64 encoded (since the quoted-printable encoder
// doesn't encode this case properly).
if (badEOL)
return MimeUtility.MOSTLY_NONASCII;
// if we've seen a long line, we degrade to mostly ascii
if (longLine)
return MimeUtility.MOSTLY_ASCII;
else
return MimeUtility.ALL_ASCII;
}
return MimeUtility.MOSTLY_ASCII;
return MimeUtility.MOSTLY_NONASCII;
}
}