View Javadoc

1   /****************************************************************
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   ****************************************************************/
19  
20  package org.apache.james.mime4j.codec;
21  
22  import java.nio.ByteBuffer;
23  import java.nio.charset.Charset;
24  import java.util.BitSet;
25  import java.util.Locale;
26  
27  import org.apache.james.mime4j.util.CharsetUtil;
28  
29  /**
30   * Static methods for encoding header field values. This includes encoded-words
31   * as defined in <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a>
32   * or display-names of an e-mail address, for example.
33   */
34  public class EncoderUtil {
35      private static final byte[] BASE64_TABLE = Base64OutputStream.BASE64_TABLE;
36      private static final char BASE64_PAD = '=';
37  
38      private static final BitSet Q_REGULAR_CHARS = initChars("=_?");
39  
40      private static final BitSet Q_RESTRICTED_CHARS = initChars("=_?\"#$%&'(),.:;<>@[\\]^`{|}~");
41  
42      private static final int MAX_USED_CHARACTERS = 50;
43  
44      private static final String ENC_WORD_PREFIX = "=?";
45      private static final String ENC_WORD_SUFFIX = "?=";
46  
47      private static final int ENCODED_WORD_MAX_LENGTH = 75; // RFC 2047
48  
49      private static final BitSet TOKEN_CHARS = initChars("()<>@,;:\\\"/[]?=");
50  
51      private static final BitSet ATEXT_CHARS = initChars("()<>@.,;:\\\"[]");
52  
53      private static BitSet initChars(String specials) {
54          BitSet bs = new BitSet(128);
55          for (char ch = 33; ch < 127; ch++) {
56              if (specials.indexOf(ch) == -1) {
57                  bs.set(ch);
58              }
59          }
60          return bs;
61      }
62  
63      /**
64       * Selects one of the two encodings specified in RFC 2047.
65       */
66      public enum Encoding {
67          /** The B encoding (identical to base64 defined in RFC 2045). */
68          B,
69          /** The Q encoding (similar to quoted-printable defined in RFC 2045). */
70          Q
71      }
72  
73      /**
74       * Indicates the intended usage of an encoded word.
75       */
76      public enum Usage {
77          /**
78           * Encoded word is used to replace a 'text' token in any Subject or
79           * Comments header field.
80           */
81          TEXT_TOKEN,
82          /**
83           * Encoded word is used to replace a 'word' entity within a 'phrase',
84           * for example, one that precedes an address in a From, To, or Cc
85           * header.
86           */
87          WORD_ENTITY
88      }
89  
90      private EncoderUtil() {
91      }
92  
93      /**
94       * Encodes the display-name portion of an address. See <a
95       * href='http://www.faqs.org/rfcs/rfc5322.html'>RFC 5322</a> section 3.4
96       * and <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> section
97       * 5.3. The specified string should not be folded.
98       * 
99       * @param displayName
100      *            display-name to encode.
101      * @return encoded display-name.
102      */
103     public static String encodeAddressDisplayName(String displayName) {
104         // display-name = phrase
105         // phrase = 1*( encoded-word / word )
106         // word = atom / quoted-string
107         // atom = [CFWS] 1*atext [CFWS]
108         // CFWS = comment or folding white space
109 
110         if (isAtomPhrase(displayName)) {
111             return displayName;
112         } else if (hasToBeEncoded(displayName, 0)) {
113             return encodeEncodedWord(displayName, Usage.WORD_ENTITY);
114         } else {
115             return quote(displayName);
116         }
117     }
118 
119     /**
120      * Encodes the local part of an address specification as described in RFC
121      * 5322 section 3.4.1. Leading and trailing CFWS should have been removed
122      * before calling this method. The specified string should not contain any
123      * illegal (control or non-ASCII) characters.
124      * 
125      * @param localPart
126      *            the local part to encode
127      * @return the encoded local part.
128      */
129     public static String encodeAddressLocalPart(String localPart) {
130         // local-part = dot-atom / quoted-string
131         // dot-atom = [CFWS] dot-atom-text [CFWS]
132         // CFWS = comment or folding white space
133 
134         if (isDotAtomText(localPart)) {
135             return localPart;
136         } else {
137             return quote(localPart);
138         }
139     }
140 
141     /**
142      * Encodes the specified strings into a header parameter as described in RFC
143      * 2045 section 5.1 and RFC 2183 section 2. The specified strings should not
144      * contain any illegal (control or non-ASCII) characters.
145      * 
146      * @param name
147      *            parameter name.
148      * @param value
149      *            parameter value.
150      * @return encoded result.
151      */
152     public static String encodeHeaderParameter(String name, String value) {
153         name = name.toLowerCase(Locale.US);
154 
155         // value := token / quoted-string
156         if (isToken(value)) {
157             return name + "=" + value;
158         } else {
159             return name + "=" + quote(value);
160         }
161     }
162 
163     /**
164      * Shortcut method that encodes the specified text into an encoded-word if
165      * the text has to be encoded.
166      * 
167      * @param text
168      *            text to encode.
169      * @param usage
170      *            whether the encoded-word is to be used to replace a text token
171      *            or a word entity (see RFC 822).
172      * @param usedCharacters
173      *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
174      * @return the specified text if encoding is not necessary or an encoded
175      *         word or a sequence of encoded words otherwise.
176      */
177     public static String encodeIfNecessary(String text, Usage usage,
178             int usedCharacters) {
179         if (hasToBeEncoded(text, usedCharacters))
180             return encodeEncodedWord(text, usage, usedCharacters);
181         else
182             return text;
183     }
184 
185     /**
186      * Determines if the specified string has to encoded into an encoded-word.
187      * Returns <code>true</code> if the text contains characters that don't
188      * fall into the printable ASCII character set or if the text contains a
189      * 'word' (sequence of non-whitespace characters) longer than 77 characters
190      * (including characters already used up in the line).
191      * 
192      * @param text
193      *            text to analyze.
194      * @param usedCharacters
195      *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
196      * @return <code>true</code> if the specified text has to be encoded into
197      *         an encoded-word, <code>false</code> otherwise.
198      */
199     public static boolean hasToBeEncoded(String text, int usedCharacters) {
200         if (text == null)
201             throw new IllegalArgumentException();
202         if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
203             throw new IllegalArgumentException();
204 
205         int nonWhiteSpaceCount = usedCharacters;
206 
207         for (int idx = 0; idx < text.length(); idx++) {
208             char ch = text.charAt(idx);
209             if (ch == '\t' || ch == ' ') {
210                 nonWhiteSpaceCount = 0;
211             } else {
212                 nonWhiteSpaceCount++;
213                 if (nonWhiteSpaceCount > 77) {
214                     // Line cannot be folded into multiple lines with no more
215                     // than 78 characters each. Encoding as encoded-words makes
216                     // that possible. One character has to be reserved for
217                     // folding white space; that leaves 77 characters.
218                     return true;
219                 }
220 
221                 if (ch < 32 || ch >= 127) {
222                     // non-printable ascii character has to be encoded
223                     return true;
224                 }
225             }
226         }
227 
228         return false;
229     }
230 
231     /**
232      * Encodes the specified text into an encoded word or a sequence of encoded
233      * words separated by space. The text is separated into a sequence of
234      * encoded words if it does not fit in a single one.
235      * <p>
236      * The charset to encode the specified text into a byte array and the
237      * encoding to use for the encoded-word are detected automatically.
238      * <p>
239      * This method assumes that zero characters have already been used up in the
240      * current line.
241      * 
242      * @param text
243      *            text to encode.
244      * @param usage
245      *            whether the encoded-word is to be used to replace a text token
246      *            or a word entity (see RFC 822).
247      * @return the encoded word (or sequence of encoded words if the given text
248      *         does not fit in a single encoded word).
249      * @see #hasToBeEncoded(String, int)
250      */
251     public static String encodeEncodedWord(String text, Usage usage) {
252         return encodeEncodedWord(text, usage, 0, null, null);
253     }
254 
255     /**
256      * Encodes the specified text into an encoded word or a sequence of encoded
257      * words separated by space. The text is separated into a sequence of
258      * encoded words if it does not fit in a single one.
259      * <p>
260      * The charset to encode the specified text into a byte array and the
261      * encoding to use for the encoded-word are detected automatically.
262      * 
263      * @param text
264      *            text to encode.
265      * @param usage
266      *            whether the encoded-word is to be used to replace a text token
267      *            or a word entity (see RFC 822).
268      * @param usedCharacters
269      *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
270      * @return the encoded word (or sequence of encoded words if the given text
271      *         does not fit in a single encoded word).
272      * @see #hasToBeEncoded(String, int)
273      */
274     public static String encodeEncodedWord(String text, Usage usage,
275             int usedCharacters) {
276         return encodeEncodedWord(text, usage, usedCharacters, null, null);
277     }
278 
279     /**
280      * Encodes the specified text into an encoded word or a sequence of encoded
281      * words separated by space. The text is separated into a sequence of
282      * encoded words if it does not fit in a single one.
283      * 
284      * @param text
285      *            text to encode.
286      * @param usage
287      *            whether the encoded-word is to be used to replace a text token
288      *            or a word entity (see RFC 822).
289      * @param usedCharacters
290      *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
291      * @param charset
292      *            the Java charset that should be used to encode the specified
293      *            string into a byte array. A suitable charset is detected
294      *            automatically if this parameter is <code>null</code>.
295      * @param encoding
296      *            the encoding to use for the encoded-word (either B or Q). A
297      *            suitable encoding is automatically chosen if this parameter is
298      *            <code>null</code>.
299      * @return the encoded word (or sequence of encoded words if the given text
300      *         does not fit in a single encoded word).
301      * @see #hasToBeEncoded(String, int)
302      */
303     public static String encodeEncodedWord(String text, Usage usage,
304             int usedCharacters, Charset charset, Encoding encoding) {
305         if (text == null)
306             throw new IllegalArgumentException();
307         if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
308             throw new IllegalArgumentException();
309 
310         if (charset == null)
311             charset = determineCharset(text);
312 
313         String mimeCharset = CharsetUtil.toMimeCharset(charset.name());
314         if (mimeCharset == null) {
315             // cannot happen if charset was originally null
316             throw new IllegalArgumentException("Unsupported charset");
317         }
318 
319         byte[] bytes = encode(text, charset);
320 
321         if (encoding == null)
322             encoding = determineEncoding(bytes, usage);
323 
324         if (encoding == Encoding.B) {
325             String prefix = ENC_WORD_PREFIX + mimeCharset + "?B?";
326             return encodeB(prefix, text, usedCharacters, charset, bytes);
327         } else {
328             String prefix = ENC_WORD_PREFIX + mimeCharset + "?Q?";
329             return encodeQ(prefix, text, usage, usedCharacters, charset, bytes);
330         }
331     }
332 
333     /**
334      * Encodes the specified byte array using the B encoding defined in RFC
335      * 2047.
336      * 
337      * @param bytes
338      *            byte array to encode.
339      * @return encoded string.
340      */
341     public static String encodeB(byte[] bytes) {
342         StringBuilder sb = new StringBuilder();
343 
344         int idx = 0;
345         final int end = bytes.length;
346         for (; idx < end - 2; idx += 3) {
347             int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8
348                     | bytes[idx + 2] & 0xff;
349             sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
350             sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
351             sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
352             sb.append((char) BASE64_TABLE[data & 0x3f]);
353         }
354 
355         if (idx == end - 2) {
356             int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8;
357             sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
358             sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
359             sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
360             sb.append(BASE64_PAD);
361 
362         } else if (idx == end - 1) {
363             int data = (bytes[idx] & 0xff) << 16;
364             sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
365             sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
366             sb.append(BASE64_PAD);
367             sb.append(BASE64_PAD);
368         }
369 
370         return sb.toString();
371     }
372 
373     /**
374      * Encodes the specified byte array using the Q encoding defined in RFC
375      * 2047.
376      * 
377      * @param bytes
378      *            byte array to encode.
379      * @param usage
380      *            whether the encoded-word is to be used to replace a text token
381      *            or a word entity (see RFC 822).
382      * @return encoded string.
383      */
384     public static String encodeQ(byte[] bytes, Usage usage) {
385         BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
386                 : Q_RESTRICTED_CHARS;
387 
388         StringBuilder sb = new StringBuilder();
389 
390         final int end = bytes.length;
391         for (int idx = 0; idx < end; idx++) {
392             int v = bytes[idx] & 0xff;
393             if (v == 32) {
394                 sb.append('_');
395             } else if (!qChars.get(v)) {
396                 sb.append('=');
397                 sb.append(hexDigit(v >>> 4));
398                 sb.append(hexDigit(v & 0xf));
399             } else {
400                 sb.append((char) v);
401             }
402         }
403 
404         return sb.toString();
405     }
406 
407     /**
408      * Tests whether the specified string is a token as defined in RFC 2045
409      * section 5.1.
410      * 
411      * @param str
412      *            string to test.
413      * @return <code>true</code> if the specified string is a RFC 2045 token,
414      *         <code>false</code> otherwise.
415      */
416     public static boolean isToken(String str) {
417         // token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
418         // tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" /
419         // <"> / "/" / "[" / "]" / "?" / "="
420         // CTL := 0.- 31., 127.
421 
422         final int length = str.length();
423         if (length == 0)
424             return false;
425 
426         for (int idx = 0; idx < length; idx++) {
427             char ch = str.charAt(idx);
428             if (!TOKEN_CHARS.get(ch))
429                 return false;
430         }
431 
432         return true;
433     }
434 
435     private static boolean isAtomPhrase(String str) {
436         // atom = [CFWS] 1*atext [CFWS]
437 
438         boolean containsAText = false;
439 
440         final int length = str.length();
441         for (int idx = 0; idx < length; idx++) {
442             char ch = str.charAt(idx);
443             if (ATEXT_CHARS.get(ch)) {
444                 containsAText = true;
445             } else if (!CharsetUtil.isWhitespace(ch)) {
446                 return false;
447             }
448         }
449 
450         return containsAText;
451     }
452 
453     // RFC 5322 section 3.2.3
454     private static boolean isDotAtomText(String str) {
455         // dot-atom-text = 1*atext *("." 1*atext)
456         // atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" /
457         // "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~"
458 
459         char prev = '.';
460 
461         final int length = str.length();
462         if (length == 0)
463             return false;
464 
465         for (int idx = 0; idx < length; idx++) {
466             char ch = str.charAt(idx);
467 
468             if (ch == '.') {
469                 if (prev == '.' || idx == length - 1)
470                     return false;
471             } else {
472                 if (!ATEXT_CHARS.get(ch))
473                     return false;
474             }
475 
476             prev = ch;
477         }
478 
479         return true;
480     }
481 
482     // RFC 5322 section 3.2.4
483     private static String quote(String str) {
484         // quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
485         // qcontent = qtext / quoted-pair
486         // qtext = %d33 / %d35-91 / %d93-126
487         // quoted-pair = ("\" (VCHAR / WSP))
488         // VCHAR = %x21-7E
489         // DQUOTE = %x22
490 
491         String escaped = str.replaceAll("[\\\\\"]", "\\\\$0");
492         return "\"" + escaped + "\"";
493     }
494 
495     private static String encodeB(String prefix, String text,
496             int usedCharacters, Charset charset, byte[] bytes) {
497         int encodedLength = bEncodedLength(bytes);
498 
499         int totalLength = prefix.length() + encodedLength
500                 + ENC_WORD_SUFFIX.length();
501         if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
502             return prefix + encodeB(bytes) + ENC_WORD_SUFFIX;
503         } else {
504             String part1 = text.substring(0, text.length() / 2);
505             byte[] bytes1 = encode(part1, charset);
506             String word1 = encodeB(prefix, part1, usedCharacters, charset,
507                     bytes1);
508 
509             String part2 = text.substring(text.length() / 2);
510             byte[] bytes2 = encode(part2, charset);
511             String word2 = encodeB(prefix, part2, 0, charset, bytes2);
512 
513             return word1 + " " + word2;
514         }
515     }
516 
517     private static int bEncodedLength(byte[] bytes) {
518         return (bytes.length + 2) / 3 * 4;
519     }
520 
521     private static String encodeQ(String prefix, String text, Usage usage,
522             int usedCharacters, Charset charset, byte[] bytes) {
523         int encodedLength = qEncodedLength(bytes, usage);
524 
525         int totalLength = prefix.length() + encodedLength
526                 + ENC_WORD_SUFFIX.length();
527         if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
528             return prefix + encodeQ(bytes, usage) + ENC_WORD_SUFFIX;
529         } else {
530             String part1 = text.substring(0, text.length() / 2);
531             byte[] bytes1 = encode(part1, charset);
532             String word1 = encodeQ(prefix, part1, usage, usedCharacters,
533                     charset, bytes1);
534 
535             String part2 = text.substring(text.length() / 2);
536             byte[] bytes2 = encode(part2, charset);
537             String word2 = encodeQ(prefix, part2, usage, 0, charset, bytes2);
538 
539             return word1 + " " + word2;
540         }
541     }
542 
543     private static int qEncodedLength(byte[] bytes, Usage usage) {
544         BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
545                 : Q_RESTRICTED_CHARS;
546 
547         int count = 0;
548 
549         for (int idx = 0; idx < bytes.length; idx++) {
550             int v = bytes[idx] & 0xff;
551             if (v == 32) {
552                 count++;
553             } else if (!qChars.get(v)) {
554                 count += 3;
555             } else {
556                 count++;
557             }
558         }
559 
560         return count;
561     }
562 
563     private static byte[] encode(String text, Charset charset) {
564         ByteBuffer buffer = charset.encode(text);
565         byte[] bytes = new byte[buffer.limit()];
566         buffer.get(bytes);
567         return bytes;
568     }
569 
570     private static Charset determineCharset(String text) {
571         // it is an important property of iso-8859-1 that it directly maps
572         // unicode code points 0000 to 00ff to byte values 00 to ff.
573         boolean ascii = true;
574         final int len = text.length();
575         for (int index = 0; index < len; index++) {
576             char ch = text.charAt(index);
577             if (ch > 0xff) {
578                 return CharsetUtil.UTF_8;
579             }
580             if (ch > 0x7f) {
581                 ascii = false;
582             }
583         }
584         return ascii ? CharsetUtil.US_ASCII : CharsetUtil.ISO_8859_1;
585     }
586 
587     private static Encoding determineEncoding(byte[] bytes, Usage usage) {
588         if (bytes.length == 0)
589             return Encoding.Q;
590 
591         BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
592                 : Q_RESTRICTED_CHARS;
593 
594         int qEncoded = 0;
595         for (int i = 0; i < bytes.length; i++) {
596             int v = bytes[i] & 0xff;
597             if (v != 32 && !qChars.get(v)) {
598                 qEncoded++;
599             }
600         }
601 
602         int percentage = qEncoded * 100 / bytes.length;
603         return percentage > 30 ? Encoding.B : Encoding.Q;
604     }
605 
606     private static char hexDigit(int i) {
607         return i < 10 ? (char) (i + '0') : (char) (i - 10 + 'A');
608     }
609 }