View Javadoc

1   /****************************************************************
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   ****************************************************************/
19  
20  package org.apache.james.mime4j.util;
21  
22  import java.io.UnsupportedEncodingException;
23  import java.nio.charset.IllegalCharsetNameException;
24  import java.nio.charset.UnsupportedCharsetException;
25  import java.util.HashMap;
26  import java.util.Map;
27  import java.util.SortedSet;
28  import java.util.TreeSet;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  
33  /**
34   * Utility class for working with character sets. It is somewhat similar to
35   * the Java 1.4 <code>java.nio.charset.Charset</code> class but knows many
36   * more aliases and is compatible with Java 1.3. It will use a simple detection
37   * mechanism to detect what character sets the current VM supports. This will
38   * be a sub-set of the character sets listed in the
39   * <a href="http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html">
40   * Java 1.5 (J2SE5.0) Supported Encodings</a> document.
41   * <p>
42   * The <a href="http://www.iana.org/assignments/character-sets">
43   * IANA Character Sets</a> document has been used to determine the preferred
44   * MIME character set names and to get a list of known aliases.
45   * <p>
46   * This is a complete list of the character sets known to this class:
47   * <table>
48   *     <tr>
49   *         <td>Canonical (Java) name</td>
50   *         <td>MIME preferred</td>
51   *         <td>Aliases</td>
52   *     </tr>
53   *     <tr>
54   *         <td>ASCII</td>
55   *         <td>US-ASCII</td>
56   *         <td>ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983 </td>
57   *     </tr>
58   *     <tr>
59   *         <td>Big5</td>
60   *         <td>Big5</td>
61   *         <td>csBig5 CN-Big5 BIG-FIVE BIGFIVE </td>
62   *     </tr>
63   *     <tr>
64   *         <td>Big5_HKSCS</td>
65   *         <td>Big5-HKSCS</td>
66   *         <td>big5hkscs </td>
67   *     </tr>
68   *     <tr>
69   *         <td>Big5_Solaris</td>
70   *         <td>?</td>
71   *         <td></td>
72   *     </tr>
73   *     <tr>
74   *         <td>Cp037</td>
75   *         <td>IBM037</td>
76   *         <td>ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 </td>
77   *     </tr>
78   *     <tr>
79   *         <td>Cp1006</td>
80   *         <td>?</td>
81   *         <td></td>
82   *     </tr>
83   *     <tr>
84   *         <td>Cp1025</td>
85   *         <td>?</td>
86   *         <td></td>
87   *     </tr>
88   *     <tr>
89   *         <td>Cp1026</td>
90   *         <td>IBM1026</td>
91   *         <td>csIBM1026 </td>
92   *     </tr>
93   *     <tr>
94   *         <td>Cp1046</td>
95   *         <td>?</td>
96   *         <td></td>
97   *     </tr>
98   *     <tr>
99   *         <td>Cp1047</td>
100  *         <td>IBM1047</td>
101  *         <td>IBM-1047 </td>
102  *     </tr>
103  *     <tr>
104  *         <td>Cp1097</td>
105  *         <td>?</td>
106  *         <td></td>
107  *     </tr>
108  *     <tr>
109  *         <td>Cp1098</td>
110  *         <td>?</td>
111  *         <td></td>
112  *     </tr>
113  *     <tr>
114  *         <td>Cp1112</td>
115  *         <td>?</td>
116  *         <td></td>
117  *     </tr>
118  *     <tr>
119  *         <td>Cp1122</td>
120  *         <td>?</td>
121  *         <td></td>
122  *     </tr>
123  *     <tr>
124  *         <td>Cp1123</td>
125  *         <td>?</td>
126  *         <td></td>
127  *     </tr>
128  *     <tr>
129  *         <td>Cp1124</td>
130  *         <td>?</td>
131  *         <td></td>
132  *     </tr>
133  *     <tr>
134  *         <td>Cp1140</td>
135  *         <td>IBM01140</td>
136  *         <td>CCSID01140 CP01140 ebcdic-us-37+euro </td>
137  *     </tr>
138  *     <tr>
139  *         <td>Cp1141</td>
140  *         <td>IBM01141</td>
141  *         <td>CCSID01141 CP01141 ebcdic-de-273+euro </td>
142  *     </tr>
143  *     <tr>
144  *         <td>Cp1142</td>
145  *         <td>IBM01142</td>
146  *         <td>CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro </td>
147  *     </tr>
148  *     <tr>
149  *         <td>Cp1143</td>
150  *         <td>IBM01143</td>
151  *         <td>CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro </td>
152  *     </tr>
153  *     <tr>
154  *         <td>Cp1144</td>
155  *         <td>IBM01144</td>
156  *         <td>CCSID01144 CP01144 ebcdic-it-280+euro </td>
157  *     </tr>
158  *     <tr>
159  *         <td>Cp1145</td>
160  *         <td>IBM01145</td>
161  *         <td>CCSID01145 CP01145 ebcdic-es-284+euro </td>
162  *     </tr>
163  *     <tr>
164  *         <td>Cp1146</td>
165  *         <td>IBM01146</td>
166  *         <td>CCSID01146 CP01146 ebcdic-gb-285+euro </td>
167  *     </tr>
168  *     <tr>
169  *         <td>Cp1147</td>
170  *         <td>IBM01147</td>
171  *         <td>CCSID01147 CP01147 ebcdic-fr-297+euro </td>
172  *     </tr>
173  *     <tr>
174  *         <td>Cp1148</td>
175  *         <td>IBM01148</td>
176  *         <td>CCSID01148 CP01148 ebcdic-international-500+euro </td>
177  *     </tr>
178  *     <tr>
179  *         <td>Cp1149</td>
180  *         <td>IBM01149</td>
181  *         <td>CCSID01149 CP01149 ebcdic-is-871+euro </td>
182  *     </tr>
183  *     <tr>
184  *         <td>Cp1250</td>
185  *         <td>windows-1250</td>
186  *         <td></td>
187  *     </tr>
188  *     <tr>
189  *         <td>Cp1251</td>
190  *         <td>windows-1251</td>
191  *         <td></td>
192  *     </tr>
193  *     <tr>
194  *         <td>Cp1252</td>
195  *         <td>windows-1252</td>
196  *         <td></td>
197  *     </tr>
198  *     <tr>
199  *         <td>Cp1253</td>
200  *         <td>windows-1253</td>
201  *         <td></td>
202  *     </tr>
203  *     <tr>
204  *         <td>Cp1254</td>
205  *         <td>windows-1254</td>
206  *         <td></td>
207  *     </tr>
208  *     <tr>
209  *         <td>Cp1255</td>
210  *         <td>windows-1255</td>
211  *         <td></td>
212  *     </tr>
213  *     <tr>
214  *         <td>Cp1256</td>
215  *         <td>windows-1256</td>
216  *         <td></td>
217  *     </tr>
218  *     <tr>
219  *         <td>Cp1257</td>
220  *         <td>windows-1257</td>
221  *         <td></td>
222  *     </tr>
223  *     <tr>
224  *         <td>Cp1258</td>
225  *         <td>windows-1258</td>
226  *         <td></td>
227  *     </tr>
228  *     <tr>
229  *         <td>Cp1381</td>
230  *         <td>?</td>
231  *         <td></td>
232  *     </tr>
233  *     <tr>
234  *         <td>Cp1383</td>
235  *         <td>?</td>
236  *         <td></td>
237  *     </tr>
238  *     <tr>
239  *         <td>Cp273</td>
240  *         <td>IBM273</td>
241  *         <td>csIBM273 </td>
242  *     </tr>
243  *     <tr>
244  *         <td>Cp277</td>
245  *         <td>IBM277</td>
246  *         <td>EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 </td>
247  *     </tr>
248  *     <tr>
249  *         <td>Cp278</td>
250  *         <td>IBM278</td>
251  *         <td>CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 </td>
252  *     </tr>
253  *     <tr>
254  *         <td>Cp280</td>
255  *         <td>IBM280</td>
256  *         <td>ebcdic-cp-it csIBM280 </td>
257  *     </tr>
258  *     <tr>
259  *         <td>Cp284</td>
260  *         <td>IBM284</td>
261  *         <td>ebcdic-cp-es csIBM284 </td>
262  *     </tr>
263  *     <tr>
264  *         <td>Cp285</td>
265  *         <td>IBM285</td>
266  *         <td>ebcdic-cp-gb csIBM285 </td>
267  *     </tr>
268  *     <tr>
269  *         <td>Cp297</td>
270  *         <td>IBM297</td>
271  *         <td>ebcdic-cp-fr csIBM297 </td>
272  *     </tr>
273  *     <tr>
274  *         <td>Cp33722</td>
275  *         <td>?</td>
276  *         <td></td>
277  *     </tr>
278  *     <tr>
279  *         <td>Cp420</td>
280  *         <td>IBM420</td>
281  *         <td>ebcdic-cp-ar1 csIBM420 </td>
282  *     </tr>
283  *     <tr>
284  *         <td>Cp424</td>
285  *         <td>IBM424</td>
286  *         <td>ebcdic-cp-he csIBM424 </td>
287  *     </tr>
288  *     <tr>
289  *         <td>Cp437</td>
290  *         <td>IBM437</td>
291  *         <td>437 csPC8CodePage437 </td>
292  *     </tr>
293  *     <tr>
294  *         <td>Cp500</td>
295  *         <td>IBM500</td>
296  *         <td>ebcdic-cp-be ebcdic-cp-ch csIBM500 </td>
297  *     </tr>
298  *     <tr>
299  *         <td>Cp737</td>
300  *         <td>?</td>
301  *         <td></td>
302  *     </tr>
303  *     <tr>
304  *         <td>Cp775</td>
305  *         <td>IBM775</td>
306  *         <td>csPC775Baltic </td>
307  *     </tr>
308  *     <tr>
309  *         <td>Cp838</td>
310  *         <td>IBM-Thai</td>
311  *         <td></td>
312  *     </tr>
313  *     <tr>
314  *         <td>Cp850</td>
315  *         <td>IBM850</td>
316  *         <td>850 csPC850Multilingual </td>
317  *     </tr>
318  *     <tr>
319  *         <td>Cp852</td>
320  *         <td>IBM852</td>
321  *         <td>852 csPCp852 </td>
322  *     </tr>
323  *     <tr>
324  *         <td>Cp855</td>
325  *         <td>IBM855</td>
326  *         <td>855 csIBM855 </td>
327  *     </tr>
328  *     <tr>
329  *         <td>Cp856</td>
330  *         <td>?</td>
331  *         <td></td>
332  *     </tr>
333  *     <tr>
334  *         <td>Cp857</td>
335  *         <td>IBM857</td>
336  *         <td>857 csIBM857 </td>
337  *     </tr>
338  *     <tr>
339  *         <td>Cp858</td>
340  *         <td>IBM00858</td>
341  *         <td>CCSID00858 CP00858 PC-Multilingual-850+euro </td>
342  *     </tr>
343  *     <tr>
344  *         <td>Cp860</td>
345  *         <td>IBM860</td>
346  *         <td>860 csIBM860 </td>
347  *     </tr>
348  *     <tr>
349  *         <td>Cp861</td>
350  *         <td>IBM861</td>
351  *         <td>861 cp-is csIBM861 </td>
352  *     </tr>
353  *     <tr>
354  *         <td>Cp862</td>
355  *         <td>IBM862</td>
356  *         <td>862 csPC862LatinHebrew </td>
357  *     </tr>
358  *     <tr>
359  *         <td>Cp863</td>
360  *         <td>IBM863</td>
361  *         <td>863 csIBM863 </td>
362  *     </tr>
363  *     <tr>
364  *         <td>Cp864</td>
365  *         <td>IBM864</td>
366  *         <td>cp864 csIBM864 </td>
367  *     </tr>
368  *     <tr>
369  *         <td>Cp865</td>
370  *         <td>IBM865</td>
371  *         <td>865 csIBM865 </td>
372  *     </tr>
373  *     <tr>
374  *         <td>Cp866</td>
375  *         <td>IBM866</td>
376  *         <td>866 csIBM866 </td>
377  *     </tr>
378  *     <tr>
379  *         <td>Cp868</td>
380  *         <td>IBM868</td>
381  *         <td>cp-ar csIBM868 </td>
382  *     </tr>
383  *     <tr>
384  *         <td>Cp869</td>
385  *         <td>IBM869</td>
386  *         <td>cp-gr csIBM869 </td>
387  *     </tr>
388  *     <tr>
389  *         <td>Cp870</td>
390  *         <td>IBM870</td>
391  *         <td>ebcdic-cp-roece ebcdic-cp-yu csIBM870 </td>
392  *     </tr>
393  *     <tr>
394  *         <td>Cp871</td>
395  *         <td>IBM871</td>
396  *         <td>ebcdic-cp-is csIBM871 </td>
397  *     </tr>
398  *     <tr>
399  *         <td>Cp875</td>
400  *         <td>?</td>
401  *         <td></td>
402  *     </tr>
403  *     <tr>
404  *         <td>Cp918</td>
405  *         <td>IBM918</td>
406  *         <td>ebcdic-cp-ar2 csIBM918 </td>
407  *     </tr>
408  *     <tr>
409  *         <td>Cp921</td>
410  *         <td>?</td>
411  *         <td></td>
412  *     </tr>
413  *     <tr>
414  *         <td>Cp922</td>
415  *         <td>?</td>
416  *         <td></td>
417  *     </tr>
418  *     <tr>
419  *         <td>Cp930</td>
420  *         <td>?</td>
421  *         <td></td>
422  *     </tr>
423  *     <tr>
424  *         <td>Cp933</td>
425  *         <td>?</td>
426  *         <td></td>
427  *     </tr>
428  *     <tr>
429  *         <td>Cp935</td>
430  *         <td>?</td>
431  *         <td></td>
432  *     </tr>
433  *     <tr>
434  *         <td>Cp937</td>
435  *         <td>?</td>
436  *         <td></td>
437  *     </tr>
438  *     <tr>
439  *         <td>Cp939</td>
440  *         <td>?</td>
441  *         <td></td>
442  *     </tr>
443  *     <tr>
444  *         <td>Cp942</td>
445  *         <td>?</td>
446  *         <td></td>
447  *     </tr>
448  *     <tr>
449  *         <td>Cp942C</td>
450  *         <td>?</td>
451  *         <td></td>
452  *     </tr>
453  *     <tr>
454  *         <td>Cp943</td>
455  *         <td>?</td>
456  *         <td></td>
457  *     </tr>
458  *     <tr>
459  *         <td>Cp943C</td>
460  *         <td>?</td>
461  *         <td></td>
462  *     </tr>
463  *     <tr>
464  *         <td>Cp948</td>
465  *         <td>?</td>
466  *         <td></td>
467  *     </tr>
468  *     <tr>
469  *         <td>Cp949</td>
470  *         <td>?</td>
471  *         <td></td>
472  *     </tr>
473  *     <tr>
474  *         <td>Cp949C</td>
475  *         <td>?</td>
476  *         <td></td>
477  *     </tr>
478  *     <tr>
479  *         <td>Cp950</td>
480  *         <td>?</td>
481  *         <td></td>
482  *     </tr>
483  *     <tr>
484  *         <td>Cp964</td>
485  *         <td>?</td>
486  *         <td></td>
487  *     </tr>
488  *     <tr>
489  *         <td>Cp970</td>
490  *         <td>?</td>
491  *         <td></td>
492  *     </tr>
493  *     <tr>
494  *         <td>EUC_CN</td>
495  *         <td>GB2312</td>
496  *         <td>x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165 </td>
497  *     </tr>
498  *     <tr>
499  *         <td>EUC_JP</td>
500  *         <td>EUC-JP</td>
501  *         <td>csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp </td>
502  *     </tr>
503  *     <tr>
504  *         <td>EUC_JP_LINUX</td>
505  *         <td>?</td>
506  *         <td></td>
507  *     </tr>
508  *     <tr>
509  *         <td>EUC_JP_Solaris</td>
510  *         <td>?</td>
511  *         <td></td>
512  *     </tr>
513  *     <tr>
514  *         <td>EUC_KR</td>
515  *         <td>EUC-KR</td>
516  *         <td>csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr </td>
517  *     </tr>
518  *     <tr>
519  *         <td>EUC_TW</td>
520  *         <td>EUC-TW</td>
521  *         <td>x-EUC-TW cns11643 euctw </td>
522  *     </tr>
523  *     <tr>
524  *         <td>GB18030</td>
525  *         <td>GB18030</td>
526  *         <td>gb18030-2000 </td>
527  *     </tr>
528  *     <tr>
529  *         <td>GBK</td>
530  *         <td>windows-936</td>
531  *         <td>CP936 MS936 ms_936 x-mswin-936 </td>
532  *     </tr>
533  *     <tr>
534  *         <td>ISCII91</td>
535  *         <td>?</td>
536  *         <td>x-ISCII91 iscii </td>
537  *     </tr>
538  *     <tr>
539  *         <td>ISO2022CN</td>
540  *         <td>ISO-2022-CN</td>
541  *         <td></td>
542  *     </tr>
543  *     <tr>
544  *         <td>ISO2022JP</td>
545  *         <td>ISO-2022-JP</td>
546  *         <td>csISO2022JP JIS jis_encoding csjisencoding </td>
547  *     </tr>
548  *     <tr>
549  *         <td>ISO2022KR</td>
550  *         <td>ISO-2022-KR</td>
551  *         <td>csISO2022KR </td>
552  *     </tr>
553  *     <tr>
554  *         <td>ISO2022_CN_CNS</td>
555  *         <td>?</td>
556  *         <td></td>
557  *     </tr>
558  *     <tr>
559  *         <td>ISO2022_CN_GB</td>
560  *         <td>?</td>
561  *         <td></td>
562  *     </tr>
563  *     <tr>
564  *         <td>ISO8859_1</td>
565  *         <td>ISO-8859-1</td>
566  *         <td>ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1 </td>
567  *     </tr>
568  *     <tr>
569  *         <td>ISO8859_13</td>
570  *         <td>ISO-8859-13</td>
571  *         <td></td>
572  *     </tr>
573  *     <tr>
574  *         <td>ISO8859_15</td>
575  *         <td>ISO-8859-15</td>
576  *         <td>ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS </td>
577  *     </tr>
578  *     <tr>
579  *         <td>ISO8859_2</td>
580  *         <td>ISO-8859-2</td>
581  *         <td>ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2 </td>
582  *     </tr>
583  *     <tr>
584  *         <td>ISO8859_3</td>
585  *         <td>ISO-8859-3</td>
586  *         <td>ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3 </td>
587  *     </tr>
588  *     <tr>
589  *         <td>ISO8859_4</td>
590  *         <td>ISO-8859-4</td>
591  *         <td>ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4 </td>
592  *     </tr>
593  *     <tr>
594  *         <td>ISO8859_5</td>
595  *         <td>ISO-8859-5</td>
596  *         <td>ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5 </td>
597  *     </tr>
598  *     <tr>
599  *         <td>ISO8859_6</td>
600  *         <td>ISO-8859-6</td>
601  *         <td>ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6 </td>
602  *     </tr>
603  *     <tr>
604  *         <td>ISO8859_7</td>
605  *         <td>ISO-8859-7</td>
606  *         <td>ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek </td>
607  *     </tr>
608  *     <tr>
609  *         <td>ISO8859_8</td>
610  *         <td>ISO-8859-8</td>
611  *         <td>ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8 </td>
612  *     </tr>
613  *     <tr>
614  *         <td>ISO8859_9</td>
615  *         <td>ISO-8859-9</td>
616  *         <td>ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9 </td>
617  *     </tr>
618  *     <tr>
619  *         <td>JISAutoDetect</td>
620  *         <td>?</td>
621  *         <td></td>
622  *     </tr>
623  *     <tr>
624  *         <td>JIS_C6626-1983</td>
625  *         <td>JIS_C6626-1983</td>
626  *         <td>x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87 </td>
627  *     </tr>
628  *     <tr>
629  *         <td>JIS_X0201</td>
630  *         <td>JIS_X0201</td>
631  *         <td>X0201 JIS0201 csHalfWidthKatakana </td>
632  *     </tr>
633  *     <tr>
634  *         <td>JIS_X0212-1990</td>
635  *         <td>JIS_X0212-1990</td>
636  *         <td>iso-ir-159 x0212 JIS0212 csISO159JISX02121990 </td>
637  *     </tr>
638  *     <tr>
639  *         <td>KOI8_R</td>
640  *         <td>KOI8-R</td>
641  *         <td>csKOI8R koi8 </td>
642  *     </tr>
643  *     <tr>
644  *         <td>MS874</td>
645  *         <td>windows-874</td>
646  *         <td>cp874 </td>
647  *     </tr>
648  *     <tr>
649  *         <td>MS932</td>
650  *         <td>Windows-31J</td>
651  *         <td>windows-932 csWindows31J x-ms-cp932 </td>
652  *     </tr>
653  *     <tr>
654  *         <td>MS949</td>
655  *         <td>windows-949</td>
656  *         <td>windows949 ms_949 x-windows-949 </td>
657  *     </tr>
658  *     <tr>
659  *         <td>MS950</td>
660  *         <td>windows-950</td>
661  *         <td>x-windows-950 </td>
662  *     </tr>
663  *     <tr>
664  *         <td>MS950_HKSCS</td>
665  *         <td></td>
666  *         <td></td>
667  *     </tr>
668  *     <tr>
669  *         <td>MacArabic</td>
670  *         <td>?</td>
671  *         <td></td>
672  *     </tr>
673  *     <tr>
674  *         <td>MacCentralEurope</td>
675  *         <td>?</td>
676  *         <td></td>
677  *     </tr>
678  *     <tr>
679  *         <td>MacCroatian</td>
680  *         <td>?</td>
681  *         <td></td>
682  *     </tr>
683  *     <tr>
684  *         <td>MacCyrillic</td>
685  *         <td>?</td>
686  *         <td></td>
687  *     </tr>
688  *     <tr>
689  *         <td>MacDingbat</td>
690  *         <td>?</td>
691  *         <td></td>
692  *     </tr>
693  *     <tr>
694  *         <td>MacGreek</td>
695  *         <td>MacGreek</td>
696  *         <td></td>
697  *     </tr>
698  *     <tr>
699  *         <td>MacHebrew</td>
700  *         <td>?</td>
701  *         <td></td>
702  *     </tr>
703  *     <tr>
704  *         <td>MacIceland</td>
705  *         <td>?</td>
706  *         <td></td>
707  *     </tr>
708  *     <tr>
709  *         <td>MacRoman</td>
710  *         <td>MacRoman</td>
711  *         <td>Macintosh MAC csMacintosh </td>
712  *     </tr>
713  *     <tr>
714  *         <td>MacRomania</td>
715  *         <td>?</td>
716  *         <td></td>
717  *     </tr>
718  *     <tr>
719  *         <td>MacSymbol</td>
720  *         <td>?</td>
721  *         <td></td>
722  *     </tr>
723  *     <tr>
724  *         <td>MacThai</td>
725  *         <td>?</td>
726  *         <td></td>
727  *     </tr>
728  *     <tr>
729  *         <td>MacTurkish</td>
730  *         <td>?</td>
731  *         <td></td>
732  *     </tr>
733  *     <tr>
734  *         <td>MacUkraine</td>
735  *         <td>?</td>
736  *         <td></td>
737  *     </tr>
738  *     <tr>
739  *         <td>SJIS</td>
740  *         <td>Shift_JIS</td>
741  *         <td>MS_Kanji csShiftJIS shift-jis x-sjis pck </td>
742  *     </tr>
743  *     <tr>
744  *         <td>TIS620</td>
745  *         <td>TIS-620</td>
746  *         <td></td>
747  *     </tr>
748  *     <tr>
749  *         <td>UTF-16</td>
750  *         <td>UTF-16</td>
751  *         <td>UTF_16 </td>
752  *     </tr>
753  *     <tr>
754  *         <td>UTF8</td>
755  *         <td>UTF-8</td>
756  *         <td></td>
757  *     </tr>
758  *     <tr>
759  *         <td>UnicodeBig</td>
760  *         <td>?</td>
761  *         <td></td>
762  *     </tr>
763  *     <tr>
764  *         <td>UnicodeBigUnmarked</td>
765  *         <td>UTF-16BE</td>
766  *         <td>X-UTF-16BE UTF_16BE ISO-10646-UCS-2 </td>
767  *     </tr>
768  *     <tr>
769  *         <td>UnicodeLittle</td>
770  *         <td>?</td>
771  *         <td></td>
772  *     </tr>
773  *     <tr>
774  *         <td>UnicodeLittleUnmarked</td>
775  *         <td>UTF-16LE</td>
776  *         <td>UTF_16LE X-UTF-16LE </td>
777  *     </tr>
778  *     <tr>
779  *         <td>x-Johab</td>
780  *         <td>johab</td>
781  *         <td>johab cp1361 ms1361 ksc5601-1992 ksc5601_1992 </td>
782  *     </tr>
783  *     <tr>
784  *         <td>x-iso-8859-11</td>
785  *         <td>?</td>
786  *         <td></td>
787  *     </tr>
788  * </table>
789  */
790 public class CharsetUtil {
791     private static Log log = LogFactory.getLog(CharsetUtil.class);
792     
793     private static class Charset implements Comparable<Charset> {
794         private String canonical = null;
795         private String mime = null;
796         private String[] aliases = null;
797         
798         private Charset(String canonical, String mime, String[] aliases) {
799             this.canonical = canonical;
800             this.mime = mime;
801             this.aliases = aliases;
802         }
803 
804         public int compareTo(Charset c) {
805             return this.canonical.compareTo(c.canonical);
806         }
807     }
808     
809     private static Charset[] JAVA_CHARSETS = {
810         new Charset("ISO8859_1", "ISO-8859-1", 
811                     new String[] {"ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", 
812                                   "latin1", "l1", "IBM819", "CP819", 
813                                   "csISOLatin1", "8859_1", "819", "IBM-819", 
814                                   "ISO8859-1", "ISO_8859_1"}),
815         new Charset("ISO8859_2", "ISO-8859-2", 
816                     new String[] {"ISO_8859-2:1987", "iso-ir-101", "ISO_8859-2",  
817                                   "latin2", "l2", "csISOLatin2", "8859_2", 
818                                   "iso8859_2"}),
819         new Charset("ISO8859_3", "ISO-8859-3", new String[] {"ISO_8859-3:1988", "iso-ir-109", "ISO_8859-3", "latin3", "l3", "csISOLatin3", "8859_3"}),
820         new Charset("ISO8859_4", "ISO-8859-4", 
821                     new String[] {"ISO_8859-4:1988", "iso-ir-110", "ISO_8859-4",
822                                   "latin4", "l4", "csISOLatin4", "8859_4"}),
823         new Charset("ISO8859_5", "ISO-8859-5", 
824                     new String[] {"ISO_8859-5:1988", "iso-ir-144", "ISO_8859-5", 
825                                   "cyrillic", "csISOLatinCyrillic", "8859_5"}),
826         new Charset("ISO8859_6", "ISO-8859-6", new String[] {"ISO_8859-6:1987", "iso-ir-127", "ISO_8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic", "8859_6"}),
827         new Charset("ISO8859_7", "ISO-8859-7", 
828                     new String[] {"ISO_8859-7:1987", "iso-ir-126", "ISO_8859-7", 
829                                   "ELOT_928", "ECMA-118", "greek", "greek8", 
830                                   "csISOLatinGreek", "8859_7", "sun_eu_greek"}),
831         new Charset("ISO8859_8", "ISO-8859-8", new String[] {"ISO_8859-8:1988", "iso-ir-138", "ISO_8859-8", "hebrew", "csISOLatinHebrew", "8859_8"}),
832         new Charset("ISO8859_9", "ISO-8859-9", 
833                     new String[] {"ISO_8859-9:1989", "iso-ir-148", "ISO_8859-9",  
834                                   "latin5", "l5", "csISOLatin5", "8859_9"}),
835 
836         new Charset("ISO8859_13", "ISO-8859-13", new String[] {}),
837         new Charset("ISO8859_15", "ISO-8859-15", 
838                     new String[] {"ISO_8859-15", "Latin-9", "8859_15", 
839                                   "csISOlatin9", "IBM923", "cp923", "923", "L9",
840                                   "IBM-923", "ISO8859-15", "LATIN9", "LATIN0", 
841                                   "csISOlatin0", "ISO8859_15_FDIS"}),
842         new Charset("KOI8_R", "KOI8-R", new String[] {"csKOI8R", "koi8"}),
843         new Charset("ASCII", "US-ASCII", 
844                     new String[] {"ANSI_X3.4-1968", "iso-ir-6", 
845                                   "ANSI_X3.4-1986", "ISO_646.irv:1991", 
846                                   "ISO646-US", "us", "IBM367", "cp367", 
847                                   "csASCII", "ascii7", "646", "iso_646.irv:1983"}),
848         new Charset("UTF8", "UTF-8", new String[] {}),
849         new Charset("UTF-16", "UTF-16", new String[] {"UTF_16"}),
850         new Charset("UnicodeBigUnmarked", "UTF-16BE", new String[] {"X-UTF-16BE", "UTF_16BE", "ISO-10646-UCS-2"}),
851         new Charset("UnicodeLittleUnmarked", "UTF-16LE", new String[] {"UTF_16LE", "X-UTF-16LE"}),
852         new Charset("Big5", "Big5", new String[] {"csBig5", "CN-Big5", "BIG-FIVE", "BIGFIVE"}),
853         new Charset("Big5_HKSCS", "Big5-HKSCS", new String[] {"big5hkscs"}),
854         new Charset("EUC_JP", "EUC-JP", 
855                     new String[] {"csEUCPkdFmtJapanese", 
856                               "Extended_UNIX_Code_Packed_Format_for_Japanese",
857                               "eucjis", "x-eucjp", "eucjp", "x-euc-jp"}),
858         new Charset("EUC_KR", "EUC-KR", 
859                     new String[] {"csEUCKR", "ksc5601", "5601", "ksc5601_1987", 
860                                   "ksc_5601", "ksc5601-1987", "ks_c_5601-1987", 
861                                   "euckr"}),
862         new Charset("GB18030", "GB18030", new String[] {"gb18030-2000"}),
863         new Charset("EUC_CN", "GB2312", new String[] {"x-EUC-CN", "csGB2312", "euccn", "euc-cn", "gb2312-80", "gb2312-1980", "CN-GB", "CN-GB-ISOIR165"}),
864         new Charset("GBK", "windows-936", new String[] {"CP936", "MS936", "ms_936", "x-mswin-936"}),
865 
866         new Charset("Cp037", "IBM037", new String[] {"ebcdic-cp-us", "ebcdic-cp-ca", "ebcdic-cp-wt", "ebcdic-cp-nl", "csIBM037"}),
867         new Charset("Cp273", "IBM273", new String[] {"csIBM273"}),
868         new Charset("Cp277", "IBM277", new String[] {"EBCDIC-CP-DK", "EBCDIC-CP-NO", "csIBM277"}),
869         new Charset("Cp278", "IBM278", new String[] {"CP278", "ebcdic-cp-fi", "ebcdic-cp-se", "csIBM278"}),
870         new Charset("Cp280", "IBM280", new String[] {"ebcdic-cp-it", "csIBM280"}),
871         new Charset("Cp284", "IBM284", new String[] {"ebcdic-cp-es", "csIBM284"}),
872         new Charset("Cp285", "IBM285", new String[] {"ebcdic-cp-gb", "csIBM285"}),
873         new Charset("Cp297", "IBM297", new String[] {"ebcdic-cp-fr", "csIBM297"}),
874         new Charset("Cp420", "IBM420", new String[] {"ebcdic-cp-ar1", "csIBM420"}),
875         new Charset("Cp424", "IBM424", new String[] {"ebcdic-cp-he", "csIBM424"}),
876         new Charset("Cp437", "IBM437", new String[] {"437", "csPC8CodePage437"}),
877         new Charset("Cp500", "IBM500", new String[] {"ebcdic-cp-be", "ebcdic-cp-ch", "csIBM500"}),
878         new Charset("Cp775", "IBM775", new String[] {"csPC775Baltic"}),
879         new Charset("Cp838", "IBM-Thai", new String[] {}),
880         new Charset("Cp850", "IBM850", new String[] {"850", "csPC850Multilingual"}),
881         new Charset("Cp852", "IBM852", new String[] {"852", "csPCp852"}),
882         new Charset("Cp855", "IBM855", new String[] {"855", "csIBM855"}),
883         new Charset("Cp857", "IBM857", new String[] {"857", "csIBM857"}),
884         new Charset("Cp858", "IBM00858", 
885                 new String[] {"CCSID00858", "CP00858", 
886                               "PC-Multilingual-850+euro"}),
887         new Charset("Cp860", "IBM860", new String[] {"860", "csIBM860"}),
888         new Charset("Cp861", "IBM861", new String[] {"861", "cp-is", "csIBM861"}),
889         new Charset("Cp862", "IBM862", new String[] {"862", "csPC862LatinHebrew"}),
890         new Charset("Cp863", "IBM863", new String[] {"863", "csIBM863"}),
891         new Charset("Cp864", "IBM864", new String[] {"cp864", "csIBM864"}),
892         new Charset("Cp865", "IBM865", new String[] {"865", "csIBM865"}),
893         new Charset("Cp866", "IBM866", new String[] {"866", "csIBM866"}),
894         new Charset("Cp868", "IBM868", new String[] {"cp-ar", "csIBM868"}),
895         new Charset("Cp869", "IBM869", new String[] {"cp-gr", "csIBM869"}),
896         new Charset("Cp870", "IBM870", new String[] {"ebcdic-cp-roece", "ebcdic-cp-yu", "csIBM870"}),
897         new Charset("Cp871", "IBM871", new String[] {"ebcdic-cp-is", "csIBM871"}),
898         new Charset("Cp918", "IBM918", new String[] {"ebcdic-cp-ar2", "csIBM918"}),
899         new Charset("Cp1026", "IBM1026", new String[] {"csIBM1026"}),
900         new Charset("Cp1047", "IBM1047", new String[] {"IBM-1047"}),
901         new Charset("Cp1140", "IBM01140", 
902                     new String[] {"CCSID01140", "CP01140", 
903                                   "ebcdic-us-37+euro"}),
904         new Charset("Cp1141", "IBM01141", 
905                     new String[] {"CCSID01141", "CP01141", 
906                                   "ebcdic-de-273+euro"}),
907         new Charset("Cp1142", "IBM01142", new String[] {"CCSID01142", "CP01142", "ebcdic-dk-277+euro", "ebcdic-no-277+euro"}),
908         new Charset("Cp1143", "IBM01143", new String[] {"CCSID01143", "CP01143", "ebcdic-fi-278+euro", "ebcdic-se-278+euro"}),
909         new Charset("Cp1144", "IBM01144", new String[] {"CCSID01144", "CP01144", "ebcdic-it-280+euro"}),
910         new Charset("Cp1145", "IBM01145", new String[] {"CCSID01145", "CP01145", "ebcdic-es-284+euro"}),
911         new Charset("Cp1146", "IBM01146", new String[] {"CCSID01146", "CP01146", "ebcdic-gb-285+euro"}),
912         new Charset("Cp1147", "IBM01147", new String[] {"CCSID01147", "CP01147", "ebcdic-fr-297+euro"}),
913         new Charset("Cp1148", "IBM01148", new String[] {"CCSID01148", "CP01148", "ebcdic-international-500+euro"}),
914         new Charset("Cp1149", "IBM01149", new String[] {"CCSID01149", "CP01149", "ebcdic-is-871+euro"}),
915         new Charset("Cp1250", "windows-1250", new String[] {}),
916         new Charset("Cp1251", "windows-1251", new String[] {}),
917         new Charset("Cp1252", "windows-1252", new String[] {}),
918         new Charset("Cp1253", "windows-1253", new String[] {}),
919         new Charset("Cp1254", "windows-1254", new String[] {}),
920         new Charset("Cp1255", "windows-1255", new String[] {}),
921         new Charset("Cp1256", "windows-1256", new String[] {}),
922         new Charset("Cp1257", "windows-1257", new String[] {}),
923         new Charset("Cp1258", "windows-1258", new String[] {}),
924         new Charset("ISO2022CN", "ISO-2022-CN", new String[] {}),
925         new Charset("ISO2022JP", "ISO-2022-JP", new String[] {"csISO2022JP", "JIS", "jis_encoding", "csjisencoding"}),
926         new Charset("ISO2022KR", "ISO-2022-KR", new String[] {"csISO2022KR"}),
927         new Charset("JIS_X0201", "JIS_X0201", new String[] {"X0201", "JIS0201", "csHalfWidthKatakana"}),
928         new Charset("JIS_X0212-1990", "JIS_X0212-1990", new String[] {"iso-ir-159", "x0212", "JIS0212", "csISO159JISX02121990"}),
929         new Charset("JIS_C6626-1983", "JIS_C6626-1983", new String[] {"x-JIS0208", "JIS0208", "csISO87JISX0208", "x0208", "JIS_X0208-1983", "iso-ir-87"}),
930         new Charset("SJIS", "Shift_JIS", new String[] {"MS_Kanji", "csShiftJIS", "shift-jis", "x-sjis", "pck"}),
931         new Charset("TIS620", "TIS-620", new String[] {}),
932         new Charset("MS932", "Windows-31J", new String[] {"windows-932", "csWindows31J", "x-ms-cp932"}),
933         new Charset("EUC_TW", "EUC-TW", new String[] {"x-EUC-TW", "cns11643", "euctw"}),
934         new Charset("x-Johab", "johab", new String[] {"johab", "cp1361", "ms1361", "ksc5601-1992", "ksc5601_1992"}),
935         new Charset("MS950_HKSCS", "", new String[] {}),
936         new Charset("MS874", "windows-874", new String[] {"cp874"}),
937         new Charset("MS949", "windows-949", new String[] {"windows949", "ms_949", "x-windows-949"}),
938         new Charset("MS950", "windows-950", new String[] {"x-windows-950"}),
939 
940         new Charset("Cp737", null, new String[] {}),
941         new Charset("Cp856", null, new String[] {}),
942         new Charset("Cp875", null, new String[] {}),
943         new Charset("Cp921", null, new String[] {}),
944         new Charset("Cp922", null, new String[] {}),
945         new Charset("Cp930", null, new String[] {}),
946         new Charset("Cp933", null, new String[] {}),
947         new Charset("Cp935", null, new String[] {}),
948         new Charset("Cp937", null, new String[] {}),
949         new Charset("Cp939", null, new String[] {}),
950         new Charset("Cp942", null, new String[] {}),
951         new Charset("Cp942C", null, new String[] {}),
952         new Charset("Cp943", null, new String[] {}),
953         new Charset("Cp943C", null, new String[] {}),
954         new Charset("Cp948", null, new String[] {}),
955         new Charset("Cp949", null, new String[] {}),
956         new Charset("Cp949C", null, new String[] {}),
957         new Charset("Cp950", null, new String[] {}),
958         new Charset("Cp964", null, new String[] {}),
959         new Charset("Cp970", null, new String[] {}),
960         new Charset("Cp1006", null, new String[] {}),
961         new Charset("Cp1025", null, new String[] {}),    
962         new Charset("Cp1046", null, new String[] {}),
963         new Charset("Cp1097", null, new String[] {}),
964         new Charset("Cp1098", null, new String[] {}),
965         new Charset("Cp1112", null, new String[] {}),
966         new Charset("Cp1122", null, new String[] {}),
967         new Charset("Cp1123", null, new String[] {}),
968         new Charset("Cp1124", null, new String[] {}),
969         new Charset("Cp1381", null, new String[] {}),
970         new Charset("Cp1383", null, new String[] {}),
971         new Charset("Cp33722", null, new String[] {}),
972         new Charset("Big5_Solaris", null, new String[] {}),
973         new Charset("EUC_JP_LINUX", null, new String[] {}),
974         new Charset("EUC_JP_Solaris", null, new String[] {}),
975         new Charset("ISCII91", null, new String[] {"x-ISCII91", "iscii"}),
976         new Charset("ISO2022_CN_CNS", null, new String[] {}),
977         new Charset("ISO2022_CN_GB", null, new String[] {}),
978         new Charset("x-iso-8859-11", null, new String[] {}),
979         new Charset("JISAutoDetect", null, new String[] {}),
980         new Charset("MacArabic", null, new String[] {}),
981         new Charset("MacCentralEurope", null, new String[] {}),
982         new Charset("MacCroatian", null, new String[] {}),
983         new Charset("MacCyrillic", null, new String[] {}),
984         new Charset("MacDingbat", null, new String[] {}),
985         new Charset("MacGreek", "MacGreek", new String[] {}),
986         new Charset("MacHebrew", null, new String[] {}),
987         new Charset("MacIceland", null, new String[] {}),
988         new Charset("MacRoman", "MacRoman", new String[] {"Macintosh", "MAC", "csMacintosh"}),
989         new Charset("MacRomania", null, new String[] {}),
990         new Charset("MacSymbol", null, new String[] {}),
991         new Charset("MacThai", null, new String[] {}),
992         new Charset("MacTurkish", null, new String[] {}),
993         new Charset("MacUkraine", null, new String[] {}),
994         new Charset("UnicodeBig", null, new String[] {}),
995         new Charset("UnicodeLittle", null, new String[] {})
996     };
997 
998     /**
999      * Contains the canonical names of character sets which can be used to 
1000      * decode bytes into Java chars.
1001      */
1002     private static SortedSet<String> decodingSupported = null;
1003     
1004     /**
1005      * Contains the canonical names of character sets which can be used to 
1006      * encode Java chars into bytes.
1007      */
1008     private static SortedSet<String> encodingSupported = null;
1009     
1010     /**
1011      * Maps character set names to Charset objects. All possible names of
1012      * a charset will be mapped to the Charset.
1013      */
1014     private static Map<String, Charset> charsetMap = null;
1015     
1016     static {
1017         decodingSupported = new TreeSet<String>();
1018         encodingSupported = new TreeSet<String>();
1019         byte[] dummy = new byte[] {'d', 'u', 'm', 'm', 'y'};
1020         for (Charset c : JAVA_CHARSETS) {
1021             try {
1022                 new String(dummy, c.canonical);
1023                 decodingSupported.add(c.canonical.toLowerCase());
1024             } catch (UnsupportedOperationException e) {
1025             } catch (UnsupportedEncodingException e) {
1026             }
1027             try {
1028                 "dummy".getBytes(c.canonical);
1029                 encodingSupported.add(c.canonical.toLowerCase());
1030             } catch (UnsupportedOperationException e) {
1031             } catch (UnsupportedEncodingException e) {
1032             }
1033         }
1034         
1035         charsetMap = new HashMap<String, Charset>();
1036         for (Charset c : JAVA_CHARSETS) {
1037             charsetMap.put(c.canonical.toLowerCase(), c);
1038             if (c.mime != null) {
1039                 charsetMap.put(c.mime.toLowerCase(), c);
1040             }
1041             if (c.aliases != null) {
1042                 for (String str : c.aliases) {
1043                     charsetMap.put(str.toLowerCase(), c);
1044                 }
1045             }
1046         }
1047         
1048         if (log.isDebugEnabled()) {
1049             log.debug("Character sets which support decoding: " 
1050                         + decodingSupported);
1051             log.debug("Character sets which support encoding: " 
1052                         + encodingSupported);
1053         }
1054     }
1055 
1056     /** carriage return - line feed sequence */
1057     public static final String CRLF = "\r\n";
1058 
1059     /** US-ASCII CR, carriage return (13) */
1060     public static final int CR = '\r';
1061 
1062     /** US-ASCII LF, line feed (10) */
1063     public static final int LF = '\n';
1064 
1065     /** US-ASCII SP, space (32) */
1066     public static final int SP = ' ';
1067 
1068     /** US-ASCII HT, horizontal-tab (9) */
1069     public static final int HT = '\t';
1070 
1071     public static final java.nio.charset.Charset US_ASCII = java.nio.charset.Charset
1072             .forName("US-ASCII");
1073 
1074     public static final java.nio.charset.Charset ISO_8859_1 = java.nio.charset.Charset
1075             .forName("ISO-8859-1");
1076 
1077     public static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset
1078             .forName("UTF-8");
1079 
1080     public static final java.nio.charset.Charset DEFAULT_CHARSET = US_ASCII;
1081 
1082     /**
1083      * Returns <code>true</code> if the specified character falls into the US
1084      * ASCII character set (Unicode range 0000 to 007f).
1085      * 
1086      * @param ch
1087      *            character to test.
1088      * @return <code>true</code> if the specified character falls into the US
1089      *         ASCII character set, <code>false</code> otherwise.
1090      */
1091     public static boolean isASCII(char ch) {
1092         return (0xFF80 & ch) == 0;
1093     }
1094 
1095     /**
1096      * Returns <code>true</code> if the specified string consists entirely of
1097      * US ASCII characters.
1098      * 
1099      * @param s
1100      *            string to test.
1101      * @return <code>true</code> if the specified string consists entirely of
1102      *         US ASCII characters, <code>false</code> otherwise.
1103      */
1104     public static boolean isASCII(final String s) {
1105         if (s == null) {
1106             throw new IllegalArgumentException("String may not be null");
1107         }
1108         final int len = s.length();
1109         for (int i = 0; i < len; i++) {
1110             if (!isASCII(s.charAt(i))) {
1111                 return false;
1112             }
1113         }
1114         return true;
1115     }
1116 
1117     /**
1118      * Returns <code>true</code> if the specified character is a whitespace
1119      * character (CR, LF, SP or HT).
1120      * 
1121      * @param ch
1122      *            character to test.
1123      * @return <code>true</code> if the specified character is a whitespace
1124      *         character, <code>false</code> otherwise.
1125      */
1126     public static boolean isWhitespace(char ch) {
1127         return ch == SP || ch == HT || ch == CR || ch == LF;
1128     }
1129 
1130     /**
1131      * Returns <code>true</code> if the specified string consists entirely of
1132      * whitespace characters.
1133      * 
1134      * @param s
1135      *            string to test.
1136      * @return <code>true</code> if the specified string consists entirely of
1137      *         whitespace characters, <code>false</code> otherwise.
1138      */
1139     public static boolean isWhitespace(final String s) {
1140         if (s == null) {
1141             throw new IllegalArgumentException("String may not be null");
1142         }
1143         final int len = s.length();
1144         for (int i = 0; i < len; i++) {
1145             if (!isWhitespace(s.charAt(i))) {
1146                 return false;
1147             }
1148         }
1149         return true;
1150     }
1151     
1152     /**
1153      * Determines if the VM supports encoding (chars to bytes) the 
1154      * specified character set. NOTE: the given character set name may 
1155      * not be known to the VM even if this method returns <code>true</code>.
1156      * Use {@link #toJavaCharset(String)} to get the canonical Java character
1157      * set name.
1158      * 
1159      * @param charsetName the characters set name.
1160      * @return <code>true</code> if encoding is supported, <code>false</code>
1161      *         otherwise.
1162      */
1163     public static boolean isEncodingSupported(String charsetName) {
1164         return encodingSupported.contains(charsetName.toLowerCase());
1165     }
1166     
1167     /**
1168      * Determines if the VM supports decoding (bytes to chars) the 
1169      * specified character set. NOTE: the given character set name may 
1170      * not be known to the VM even if this method returns <code>true</code>.
1171      * Use {@link #toJavaCharset(String)} to get the canonical Java character
1172      * set name.
1173      * 
1174      * @param charsetName the characters set name.
1175      * @return <code>true</code> if decoding is supported, <code>false</code>
1176      *         otherwise.
1177      */
1178     public static boolean isDecodingSupported(String charsetName) {
1179         return decodingSupported.contains(charsetName.toLowerCase());
1180     }
1181     
1182     /**
1183      * Gets the preferred MIME character set name for the specified
1184      * character set or <code>null</code> if not known.
1185      * 
1186      * @param charsetName the character set name to look for.
1187      * @return the MIME preferred name or <code>null</code> if not known.
1188      */
1189     public static String toMimeCharset(String charsetName) {
1190         Charset c = charsetMap.get(charsetName.toLowerCase());
1191         if (c != null) {
1192             return c.mime;
1193         }
1194         return null;
1195     }
1196     
1197     /**
1198      * Gets the canonical Java character set name for the specified
1199      * character set or <code>null</code> if not known. This should be
1200      * called before doing any conversions using the Java API. NOTE:
1201      * you must use {@link #isEncodingSupported(String)} or
1202      * {@link #isDecodingSupported(String)} to make sure the returned
1203      * Java character set is supported by the current VM.
1204      * 
1205      * @param charsetName the character set name to look for.
1206      * @return the canonical Java name or <code>null</code> if not known.
1207      */
1208     public static String toJavaCharset(String charsetName) {
1209         Charset c = charsetMap.get(charsetName.toLowerCase());
1210         if (c != null) {
1211             return c.canonical;
1212         }
1213         return null;
1214     }
1215 
1216     public static java.nio.charset.Charset getCharset(String charsetName) {
1217         String defaultCharset = "ISO-8859-1";
1218         
1219         // Use the default chareset if given charset is null
1220         if(charsetName == null) charsetName = defaultCharset;
1221             
1222         try {
1223             return java.nio.charset.Charset.forName(charsetName);
1224         } catch (IllegalCharsetNameException e) {
1225             log.info("Illegal charset " + charsetName + ", fallback to " + defaultCharset + ": " + e);
1226             // Use default charset on exception 
1227             return java.nio.charset.Charset.forName(defaultCharset);
1228         } catch (UnsupportedCharsetException ex) {
1229             log.info("Unsupported charset " + charsetName + ", fallback to " + defaultCharset + ": " + ex);
1230             // Use default charset on exception
1231             return java.nio.charset.Charset.forName(defaultCharset);
1232         }
1233         
1234     }
1235     /*
1236      * Uncomment the code below and run the main method to regenerate the
1237      * Javadoc table above when the known charsets change. 
1238      */
1239     
1240     /*
1241     private static String dumpHtmlTable() {
1242         List<Charset> l = new LinkedList<Charset>(Arrays.asList(JAVA_CHARSETS));
1243         Collections.sort(l);
1244         StringBuilder sb = new StringBuilder();
1245         sb.append(" * <table>\n");
1246         sb.append(" *     <tr>\n");
1247         sb.append(" *         <td>Canonical (Java) name</td>\n");
1248         sb.append(" *         <td>MIME preferred</td>\n");
1249         sb.append(" *         <td>Aliases</td>\n");
1250         sb.append(" *     </tr>\n");
1251 
1252         for (Charset c : l) {
1253             sb.append(" *     <tr>\n");
1254             sb.append(" *         <td>" + c.canonical + "</td>\n");
1255             sb.append(" *         <td>" + (c.mime == null ? "?" : c.mime)+ "</td>\n");
1256             sb.append(" *         <td>");
1257             for (int i = 0; c.aliases != null && i < c.aliases.length; i++) {
1258                 sb.append(c.aliases[i] + " ");
1259             }
1260             sb.append("</td>\n");
1261             sb.append(" *     </tr>\n");
1262         }
1263         sb.append(" * </table>\n");
1264         return sb.toString();
1265     }
1266     
1267     public static void main(String[] args) {
1268         System.out.println(dumpHtmlTable());
1269     }
1270     */
1271 }