1 /****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one *
3 * or more contributor license agreements. See the NOTICE file *
4 * distributed with this work for additional information *
5 * regarding copyright ownership. The ASF licenses this file *
6 * to you under the Apache License, Version 2.0 (the *
7 * "License"); you may not use this file except in compliance *
8 * with the License. You may obtain a copy of the License at *
9 * *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, *
13 * software distributed under the License is distributed on an *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15 * KIND, either express or implied. See the License for the *
16 * specific language governing permissions and limitations *
17 * under the License. *
18 ****************************************************************/
19
20 package org.apache.james.mime4j.util;
21
22 import java.io.UnsupportedEncodingException;
23 import java.nio.charset.IllegalCharsetNameException;
24 import java.nio.charset.UnsupportedCharsetException;
25 import java.util.HashMap;
26 import java.util.Map;
27 import java.util.SortedSet;
28 import java.util.TreeSet;
29
30 import org.apache.commons.logging.Log;
31 import org.apache.commons.logging.LogFactory;
32
33 /**
34 * Utility class for working with character sets. It is somewhat similar to
35 * the Java 1.4 <code>java.nio.charset.Charset</code> class but knows many
36 * more aliases and is compatible with Java 1.3. It will use a simple detection
37 * mechanism to detect what character sets the current VM supports. This will
38 * be a sub-set of the character sets listed in the
39 * <a href="http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html">
40 * Java 1.5 (J2SE5.0) Supported Encodings</a> document.
41 * <p>
42 * The <a href="http://www.iana.org/assignments/character-sets">
43 * IANA Character Sets</a> document has been used to determine the preferred
44 * MIME character set names and to get a list of known aliases.
45 * <p>
46 * This is a complete list of the character sets known to this class:
47 * <table>
48 * <tr>
49 * <td>Canonical (Java) name</td>
50 * <td>MIME preferred</td>
51 * <td>Aliases</td>
52 * </tr>
53 * <tr>
54 * <td>ASCII</td>
55 * <td>US-ASCII</td>
56 * <td>ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983 </td>
57 * </tr>
58 * <tr>
59 * <td>Big5</td>
60 * <td>Big5</td>
61 * <td>csBig5 CN-Big5 BIG-FIVE BIGFIVE </td>
62 * </tr>
63 * <tr>
64 * <td>Big5_HKSCS</td>
65 * <td>Big5-HKSCS</td>
66 * <td>big5hkscs </td>
67 * </tr>
68 * <tr>
69 * <td>Big5_Solaris</td>
70 * <td>?</td>
71 * <td></td>
72 * </tr>
73 * <tr>
74 * <td>Cp037</td>
75 * <td>IBM037</td>
76 * <td>ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 </td>
77 * </tr>
78 * <tr>
79 * <td>Cp1006</td>
80 * <td>?</td>
81 * <td></td>
82 * </tr>
83 * <tr>
84 * <td>Cp1025</td>
85 * <td>?</td>
86 * <td></td>
87 * </tr>
88 * <tr>
89 * <td>Cp1026</td>
90 * <td>IBM1026</td>
91 * <td>csIBM1026 </td>
92 * </tr>
93 * <tr>
94 * <td>Cp1046</td>
95 * <td>?</td>
96 * <td></td>
97 * </tr>
98 * <tr>
99 * <td>Cp1047</td>
100 * <td>IBM1047</td>
101 * <td>IBM-1047 </td>
102 * </tr>
103 * <tr>
104 * <td>Cp1097</td>
105 * <td>?</td>
106 * <td></td>
107 * </tr>
108 * <tr>
109 * <td>Cp1098</td>
110 * <td>?</td>
111 * <td></td>
112 * </tr>
113 * <tr>
114 * <td>Cp1112</td>
115 * <td>?</td>
116 * <td></td>
117 * </tr>
118 * <tr>
119 * <td>Cp1122</td>
120 * <td>?</td>
121 * <td></td>
122 * </tr>
123 * <tr>
124 * <td>Cp1123</td>
125 * <td>?</td>
126 * <td></td>
127 * </tr>
128 * <tr>
129 * <td>Cp1124</td>
130 * <td>?</td>
131 * <td></td>
132 * </tr>
133 * <tr>
134 * <td>Cp1140</td>
135 * <td>IBM01140</td>
136 * <td>CCSID01140 CP01140 ebcdic-us-37+euro </td>
137 * </tr>
138 * <tr>
139 * <td>Cp1141</td>
140 * <td>IBM01141</td>
141 * <td>CCSID01141 CP01141 ebcdic-de-273+euro </td>
142 * </tr>
143 * <tr>
144 * <td>Cp1142</td>
145 * <td>IBM01142</td>
146 * <td>CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro </td>
147 * </tr>
148 * <tr>
149 * <td>Cp1143</td>
150 * <td>IBM01143</td>
151 * <td>CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro </td>
152 * </tr>
153 * <tr>
154 * <td>Cp1144</td>
155 * <td>IBM01144</td>
156 * <td>CCSID01144 CP01144 ebcdic-it-280+euro </td>
157 * </tr>
158 * <tr>
159 * <td>Cp1145</td>
160 * <td>IBM01145</td>
161 * <td>CCSID01145 CP01145 ebcdic-es-284+euro </td>
162 * </tr>
163 * <tr>
164 * <td>Cp1146</td>
165 * <td>IBM01146</td>
166 * <td>CCSID01146 CP01146 ebcdic-gb-285+euro </td>
167 * </tr>
168 * <tr>
169 * <td>Cp1147</td>
170 * <td>IBM01147</td>
171 * <td>CCSID01147 CP01147 ebcdic-fr-297+euro </td>
172 * </tr>
173 * <tr>
174 * <td>Cp1148</td>
175 * <td>IBM01148</td>
176 * <td>CCSID01148 CP01148 ebcdic-international-500+euro </td>
177 * </tr>
178 * <tr>
179 * <td>Cp1149</td>
180 * <td>IBM01149</td>
181 * <td>CCSID01149 CP01149 ebcdic-is-871+euro </td>
182 * </tr>
183 * <tr>
184 * <td>Cp1250</td>
185 * <td>windows-1250</td>
186 * <td></td>
187 * </tr>
188 * <tr>
189 * <td>Cp1251</td>
190 * <td>windows-1251</td>
191 * <td></td>
192 * </tr>
193 * <tr>
194 * <td>Cp1252</td>
195 * <td>windows-1252</td>
196 * <td></td>
197 * </tr>
198 * <tr>
199 * <td>Cp1253</td>
200 * <td>windows-1253</td>
201 * <td></td>
202 * </tr>
203 * <tr>
204 * <td>Cp1254</td>
205 * <td>windows-1254</td>
206 * <td></td>
207 * </tr>
208 * <tr>
209 * <td>Cp1255</td>
210 * <td>windows-1255</td>
211 * <td></td>
212 * </tr>
213 * <tr>
214 * <td>Cp1256</td>
215 * <td>windows-1256</td>
216 * <td></td>
217 * </tr>
218 * <tr>
219 * <td>Cp1257</td>
220 * <td>windows-1257</td>
221 * <td></td>
222 * </tr>
223 * <tr>
224 * <td>Cp1258</td>
225 * <td>windows-1258</td>
226 * <td></td>
227 * </tr>
228 * <tr>
229 * <td>Cp1381</td>
230 * <td>?</td>
231 * <td></td>
232 * </tr>
233 * <tr>
234 * <td>Cp1383</td>
235 * <td>?</td>
236 * <td></td>
237 * </tr>
238 * <tr>
239 * <td>Cp273</td>
240 * <td>IBM273</td>
241 * <td>csIBM273 </td>
242 * </tr>
243 * <tr>
244 * <td>Cp277</td>
245 * <td>IBM277</td>
246 * <td>EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 </td>
247 * </tr>
248 * <tr>
249 * <td>Cp278</td>
250 * <td>IBM278</td>
251 * <td>CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 </td>
252 * </tr>
253 * <tr>
254 * <td>Cp280</td>
255 * <td>IBM280</td>
256 * <td>ebcdic-cp-it csIBM280 </td>
257 * </tr>
258 * <tr>
259 * <td>Cp284</td>
260 * <td>IBM284</td>
261 * <td>ebcdic-cp-es csIBM284 </td>
262 * </tr>
263 * <tr>
264 * <td>Cp285</td>
265 * <td>IBM285</td>
266 * <td>ebcdic-cp-gb csIBM285 </td>
267 * </tr>
268 * <tr>
269 * <td>Cp297</td>
270 * <td>IBM297</td>
271 * <td>ebcdic-cp-fr csIBM297 </td>
272 * </tr>
273 * <tr>
274 * <td>Cp33722</td>
275 * <td>?</td>
276 * <td></td>
277 * </tr>
278 * <tr>
279 * <td>Cp420</td>
280 * <td>IBM420</td>
281 * <td>ebcdic-cp-ar1 csIBM420 </td>
282 * </tr>
283 * <tr>
284 * <td>Cp424</td>
285 * <td>IBM424</td>
286 * <td>ebcdic-cp-he csIBM424 </td>
287 * </tr>
288 * <tr>
289 * <td>Cp437</td>
290 * <td>IBM437</td>
291 * <td>437 csPC8CodePage437 </td>
292 * </tr>
293 * <tr>
294 * <td>Cp500</td>
295 * <td>IBM500</td>
296 * <td>ebcdic-cp-be ebcdic-cp-ch csIBM500 </td>
297 * </tr>
298 * <tr>
299 * <td>Cp737</td>
300 * <td>?</td>
301 * <td></td>
302 * </tr>
303 * <tr>
304 * <td>Cp775</td>
305 * <td>IBM775</td>
306 * <td>csPC775Baltic </td>
307 * </tr>
308 * <tr>
309 * <td>Cp838</td>
310 * <td>IBM-Thai</td>
311 * <td></td>
312 * </tr>
313 * <tr>
314 * <td>Cp850</td>
315 * <td>IBM850</td>
316 * <td>850 csPC850Multilingual </td>
317 * </tr>
318 * <tr>
319 * <td>Cp852</td>
320 * <td>IBM852</td>
321 * <td>852 csPCp852 </td>
322 * </tr>
323 * <tr>
324 * <td>Cp855</td>
325 * <td>IBM855</td>
326 * <td>855 csIBM855 </td>
327 * </tr>
328 * <tr>
329 * <td>Cp856</td>
330 * <td>?</td>
331 * <td></td>
332 * </tr>
333 * <tr>
334 * <td>Cp857</td>
335 * <td>IBM857</td>
336 * <td>857 csIBM857 </td>
337 * </tr>
338 * <tr>
339 * <td>Cp858</td>
340 * <td>IBM00858</td>
341 * <td>CCSID00858 CP00858 PC-Multilingual-850+euro </td>
342 * </tr>
343 * <tr>
344 * <td>Cp860</td>
345 * <td>IBM860</td>
346 * <td>860 csIBM860 </td>
347 * </tr>
348 * <tr>
349 * <td>Cp861</td>
350 * <td>IBM861</td>
351 * <td>861 cp-is csIBM861 </td>
352 * </tr>
353 * <tr>
354 * <td>Cp862</td>
355 * <td>IBM862</td>
356 * <td>862 csPC862LatinHebrew </td>
357 * </tr>
358 * <tr>
359 * <td>Cp863</td>
360 * <td>IBM863</td>
361 * <td>863 csIBM863 </td>
362 * </tr>
363 * <tr>
364 * <td>Cp864</td>
365 * <td>IBM864</td>
366 * <td>cp864 csIBM864 </td>
367 * </tr>
368 * <tr>
369 * <td>Cp865</td>
370 * <td>IBM865</td>
371 * <td>865 csIBM865 </td>
372 * </tr>
373 * <tr>
374 * <td>Cp866</td>
375 * <td>IBM866</td>
376 * <td>866 csIBM866 </td>
377 * </tr>
378 * <tr>
379 * <td>Cp868</td>
380 * <td>IBM868</td>
381 * <td>cp-ar csIBM868 </td>
382 * </tr>
383 * <tr>
384 * <td>Cp869</td>
385 * <td>IBM869</td>
386 * <td>cp-gr csIBM869 </td>
387 * </tr>
388 * <tr>
389 * <td>Cp870</td>
390 * <td>IBM870</td>
391 * <td>ebcdic-cp-roece ebcdic-cp-yu csIBM870 </td>
392 * </tr>
393 * <tr>
394 * <td>Cp871</td>
395 * <td>IBM871</td>
396 * <td>ebcdic-cp-is csIBM871 </td>
397 * </tr>
398 * <tr>
399 * <td>Cp875</td>
400 * <td>?</td>
401 * <td></td>
402 * </tr>
403 * <tr>
404 * <td>Cp918</td>
405 * <td>IBM918</td>
406 * <td>ebcdic-cp-ar2 csIBM918 </td>
407 * </tr>
408 * <tr>
409 * <td>Cp921</td>
410 * <td>?</td>
411 * <td></td>
412 * </tr>
413 * <tr>
414 * <td>Cp922</td>
415 * <td>?</td>
416 * <td></td>
417 * </tr>
418 * <tr>
419 * <td>Cp930</td>
420 * <td>?</td>
421 * <td></td>
422 * </tr>
423 * <tr>
424 * <td>Cp933</td>
425 * <td>?</td>
426 * <td></td>
427 * </tr>
428 * <tr>
429 * <td>Cp935</td>
430 * <td>?</td>
431 * <td></td>
432 * </tr>
433 * <tr>
434 * <td>Cp937</td>
435 * <td>?</td>
436 * <td></td>
437 * </tr>
438 * <tr>
439 * <td>Cp939</td>
440 * <td>?</td>
441 * <td></td>
442 * </tr>
443 * <tr>
444 * <td>Cp942</td>
445 * <td>?</td>
446 * <td></td>
447 * </tr>
448 * <tr>
449 * <td>Cp942C</td>
450 * <td>?</td>
451 * <td></td>
452 * </tr>
453 * <tr>
454 * <td>Cp943</td>
455 * <td>?</td>
456 * <td></td>
457 * </tr>
458 * <tr>
459 * <td>Cp943C</td>
460 * <td>?</td>
461 * <td></td>
462 * </tr>
463 * <tr>
464 * <td>Cp948</td>
465 * <td>?</td>
466 * <td></td>
467 * </tr>
468 * <tr>
469 * <td>Cp949</td>
470 * <td>?</td>
471 * <td></td>
472 * </tr>
473 * <tr>
474 * <td>Cp949C</td>
475 * <td>?</td>
476 * <td></td>
477 * </tr>
478 * <tr>
479 * <td>Cp950</td>
480 * <td>?</td>
481 * <td></td>
482 * </tr>
483 * <tr>
484 * <td>Cp964</td>
485 * <td>?</td>
486 * <td></td>
487 * </tr>
488 * <tr>
489 * <td>Cp970</td>
490 * <td>?</td>
491 * <td></td>
492 * </tr>
493 * <tr>
494 * <td>EUC_CN</td>
495 * <td>GB2312</td>
496 * <td>x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165 </td>
497 * </tr>
498 * <tr>
499 * <td>EUC_JP</td>
500 * <td>EUC-JP</td>
501 * <td>csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp </td>
502 * </tr>
503 * <tr>
504 * <td>EUC_JP_LINUX</td>
505 * <td>?</td>
506 * <td></td>
507 * </tr>
508 * <tr>
509 * <td>EUC_JP_Solaris</td>
510 * <td>?</td>
511 * <td></td>
512 * </tr>
513 * <tr>
514 * <td>EUC_KR</td>
515 * <td>EUC-KR</td>
516 * <td>csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr </td>
517 * </tr>
518 * <tr>
519 * <td>EUC_TW</td>
520 * <td>EUC-TW</td>
521 * <td>x-EUC-TW cns11643 euctw </td>
522 * </tr>
523 * <tr>
524 * <td>GB18030</td>
525 * <td>GB18030</td>
526 * <td>gb18030-2000 </td>
527 * </tr>
528 * <tr>
529 * <td>GBK</td>
530 * <td>windows-936</td>
531 * <td>CP936 MS936 ms_936 x-mswin-936 </td>
532 * </tr>
533 * <tr>
534 * <td>ISCII91</td>
535 * <td>?</td>
536 * <td>x-ISCII91 iscii </td>
537 * </tr>
538 * <tr>
539 * <td>ISO2022CN</td>
540 * <td>ISO-2022-CN</td>
541 * <td></td>
542 * </tr>
543 * <tr>
544 * <td>ISO2022JP</td>
545 * <td>ISO-2022-JP</td>
546 * <td>csISO2022JP JIS jis_encoding csjisencoding </td>
547 * </tr>
548 * <tr>
549 * <td>ISO2022KR</td>
550 * <td>ISO-2022-KR</td>
551 * <td>csISO2022KR </td>
552 * </tr>
553 * <tr>
554 * <td>ISO2022_CN_CNS</td>
555 * <td>?</td>
556 * <td></td>
557 * </tr>
558 * <tr>
559 * <td>ISO2022_CN_GB</td>
560 * <td>?</td>
561 * <td></td>
562 * </tr>
563 * <tr>
564 * <td>ISO8859_1</td>
565 * <td>ISO-8859-1</td>
566 * <td>ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1 </td>
567 * </tr>
568 * <tr>
569 * <td>ISO8859_13</td>
570 * <td>ISO-8859-13</td>
571 * <td></td>
572 * </tr>
573 * <tr>
574 * <td>ISO8859_15</td>
575 * <td>ISO-8859-15</td>
576 * <td>ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS </td>
577 * </tr>
578 * <tr>
579 * <td>ISO8859_2</td>
580 * <td>ISO-8859-2</td>
581 * <td>ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2 </td>
582 * </tr>
583 * <tr>
584 * <td>ISO8859_3</td>
585 * <td>ISO-8859-3</td>
586 * <td>ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3 </td>
587 * </tr>
588 * <tr>
589 * <td>ISO8859_4</td>
590 * <td>ISO-8859-4</td>
591 * <td>ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4 </td>
592 * </tr>
593 * <tr>
594 * <td>ISO8859_5</td>
595 * <td>ISO-8859-5</td>
596 * <td>ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5 </td>
597 * </tr>
598 * <tr>
599 * <td>ISO8859_6</td>
600 * <td>ISO-8859-6</td>
601 * <td>ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6 </td>
602 * </tr>
603 * <tr>
604 * <td>ISO8859_7</td>
605 * <td>ISO-8859-7</td>
606 * <td>ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek </td>
607 * </tr>
608 * <tr>
609 * <td>ISO8859_8</td>
610 * <td>ISO-8859-8</td>
611 * <td>ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8 </td>
612 * </tr>
613 * <tr>
614 * <td>ISO8859_9</td>
615 * <td>ISO-8859-9</td>
616 * <td>ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9 </td>
617 * </tr>
618 * <tr>
619 * <td>JISAutoDetect</td>
620 * <td>?</td>
621 * <td></td>
622 * </tr>
623 * <tr>
624 * <td>JIS_C6626-1983</td>
625 * <td>JIS_C6626-1983</td>
626 * <td>x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87 </td>
627 * </tr>
628 * <tr>
629 * <td>JIS_X0201</td>
630 * <td>JIS_X0201</td>
631 * <td>X0201 JIS0201 csHalfWidthKatakana </td>
632 * </tr>
633 * <tr>
634 * <td>JIS_X0212-1990</td>
635 * <td>JIS_X0212-1990</td>
636 * <td>iso-ir-159 x0212 JIS0212 csISO159JISX02121990 </td>
637 * </tr>
638 * <tr>
639 * <td>KOI8_R</td>
640 * <td>KOI8-R</td>
641 * <td>csKOI8R koi8 </td>
642 * </tr>
643 * <tr>
644 * <td>MS874</td>
645 * <td>windows-874</td>
646 * <td>cp874 </td>
647 * </tr>
648 * <tr>
649 * <td>MS932</td>
650 * <td>Windows-31J</td>
651 * <td>windows-932 csWindows31J x-ms-cp932 </td>
652 * </tr>
653 * <tr>
654 * <td>MS949</td>
655 * <td>windows-949</td>
656 * <td>windows949 ms_949 x-windows-949 </td>
657 * </tr>
658 * <tr>
659 * <td>MS950</td>
660 * <td>windows-950</td>
661 * <td>x-windows-950 </td>
662 * </tr>
663 * <tr>
664 * <td>MS950_HKSCS</td>
 * <td>?</td>
666 * <td></td>
667 * </tr>
668 * <tr>
669 * <td>MacArabic</td>
670 * <td>?</td>
671 * <td></td>
672 * </tr>
673 * <tr>
674 * <td>MacCentralEurope</td>
675 * <td>?</td>
676 * <td></td>
677 * </tr>
678 * <tr>
679 * <td>MacCroatian</td>
680 * <td>?</td>
681 * <td></td>
682 * </tr>
683 * <tr>
684 * <td>MacCyrillic</td>
685 * <td>?</td>
686 * <td></td>
687 * </tr>
688 * <tr>
689 * <td>MacDingbat</td>
690 * <td>?</td>
691 * <td></td>
692 * </tr>
693 * <tr>
694 * <td>MacGreek</td>
695 * <td>MacGreek</td>
696 * <td></td>
697 * </tr>
698 * <tr>
699 * <td>MacHebrew</td>
700 * <td>?</td>
701 * <td></td>
702 * </tr>
703 * <tr>
704 * <td>MacIceland</td>
705 * <td>?</td>
706 * <td></td>
707 * </tr>
708 * <tr>
709 * <td>MacRoman</td>
710 * <td>MacRoman</td>
711 * <td>Macintosh MAC csMacintosh </td>
712 * </tr>
713 * <tr>
714 * <td>MacRomania</td>
715 * <td>?</td>
716 * <td></td>
717 * </tr>
718 * <tr>
719 * <td>MacSymbol</td>
720 * <td>?</td>
721 * <td></td>
722 * </tr>
723 * <tr>
724 * <td>MacThai</td>
725 * <td>?</td>
726 * <td></td>
727 * </tr>
728 * <tr>
729 * <td>MacTurkish</td>
730 * <td>?</td>
731 * <td></td>
732 * </tr>
733 * <tr>
734 * <td>MacUkraine</td>
735 * <td>?</td>
736 * <td></td>
737 * </tr>
738 * <tr>
739 * <td>SJIS</td>
740 * <td>Shift_JIS</td>
741 * <td>MS_Kanji csShiftJIS shift-jis x-sjis pck </td>
742 * </tr>
743 * <tr>
744 * <td>TIS620</td>
745 * <td>TIS-620</td>
746 * <td></td>
747 * </tr>
748 * <tr>
749 * <td>UTF-16</td>
750 * <td>UTF-16</td>
751 * <td>UTF_16 </td>
752 * </tr>
753 * <tr>
754 * <td>UTF8</td>
755 * <td>UTF-8</td>
756 * <td></td>
757 * </tr>
758 * <tr>
759 * <td>UnicodeBig</td>
760 * <td>?</td>
761 * <td></td>
762 * </tr>
763 * <tr>
764 * <td>UnicodeBigUnmarked</td>
765 * <td>UTF-16BE</td>
766 * <td>X-UTF-16BE UTF_16BE ISO-10646-UCS-2 </td>
767 * </tr>
768 * <tr>
769 * <td>UnicodeLittle</td>
770 * <td>?</td>
771 * <td></td>
772 * </tr>
773 * <tr>
774 * <td>UnicodeLittleUnmarked</td>
775 * <td>UTF-16LE</td>
776 * <td>UTF_16LE X-UTF-16LE </td>
777 * </tr>
778 * <tr>
779 * <td>x-Johab</td>
780 * <td>johab</td>
781 * <td>johab cp1361 ms1361 ksc5601-1992 ksc5601_1992 </td>
782 * </tr>
783 * <tr>
784 * <td>x-iso-8859-11</td>
785 * <td>?</td>
786 * <td></td>
787 * </tr>
788 * </table>
789 */
790 public class CharsetUtil {
791 private static Log log = LogFactory.getLog(CharsetUtil.class);
792
793 private static class Charset implements Comparable<Charset> {
794 private String canonical = null;
795 private String mime = null;
796 private String[] aliases = null;
797
798 private Charset(String canonical, String mime, String[] aliases) {
799 this.canonical = canonical;
800 this.mime = mime;
801 this.aliases = aliases;
802 }
803
804 public int compareTo(Charset c) {
805 return this.canonical.compareTo(c.canonical);
806 }
807 }
808
809 private static Charset[] JAVA_CHARSETS = {
810 new Charset("ISO8859_1", "ISO-8859-1",
811 new String[] {"ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1",
812 "latin1", "l1", "IBM819", "CP819",
813 "csISOLatin1", "8859_1", "819", "IBM-819",
814 "ISO8859-1", "ISO_8859_1"}),
815 new Charset("ISO8859_2", "ISO-8859-2",
816 new String[] {"ISO_8859-2:1987", "iso-ir-101", "ISO_8859-2",
817 "latin2", "l2", "csISOLatin2", "8859_2",
818 "iso8859_2"}),
819 new Charset("ISO8859_3", "ISO-8859-3", new String[] {"ISO_8859-3:1988", "iso-ir-109", "ISO_8859-3", "latin3", "l3", "csISOLatin3", "8859_3"}),
820 new Charset("ISO8859_4", "ISO-8859-4",
821 new String[] {"ISO_8859-4:1988", "iso-ir-110", "ISO_8859-4",
822 "latin4", "l4", "csISOLatin4", "8859_4"}),
823 new Charset("ISO8859_5", "ISO-8859-5",
824 new String[] {"ISO_8859-5:1988", "iso-ir-144", "ISO_8859-5",
825 "cyrillic", "csISOLatinCyrillic", "8859_5"}),
826 new Charset("ISO8859_6", "ISO-8859-6", new String[] {"ISO_8859-6:1987", "iso-ir-127", "ISO_8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic", "8859_6"}),
827 new Charset("ISO8859_7", "ISO-8859-7",
828 new String[] {"ISO_8859-7:1987", "iso-ir-126", "ISO_8859-7",
829 "ELOT_928", "ECMA-118", "greek", "greek8",
830 "csISOLatinGreek", "8859_7", "sun_eu_greek"}),
831 new Charset("ISO8859_8", "ISO-8859-8", new String[] {"ISO_8859-8:1988", "iso-ir-138", "ISO_8859-8", "hebrew", "csISOLatinHebrew", "8859_8"}),
832 new Charset("ISO8859_9", "ISO-8859-9",
833 new String[] {"ISO_8859-9:1989", "iso-ir-148", "ISO_8859-9",
834 "latin5", "l5", "csISOLatin5", "8859_9"}),
835
836 new Charset("ISO8859_13", "ISO-8859-13", new String[] {}),
837 new Charset("ISO8859_15", "ISO-8859-15",
838 new String[] {"ISO_8859-15", "Latin-9", "8859_15",
839 "csISOlatin9", "IBM923", "cp923", "923", "L9",
840 "IBM-923", "ISO8859-15", "LATIN9", "LATIN0",
841 "csISOlatin0", "ISO8859_15_FDIS"}),
842 new Charset("KOI8_R", "KOI8-R", new String[] {"csKOI8R", "koi8"}),
843 new Charset("ASCII", "US-ASCII",
844 new String[] {"ANSI_X3.4-1968", "iso-ir-6",
845 "ANSI_X3.4-1986", "ISO_646.irv:1991",
846 "ISO646-US", "us", "IBM367", "cp367",
847 "csASCII", "ascii7", "646", "iso_646.irv:1983"}),
848 new Charset("UTF8", "UTF-8", new String[] {}),
849 new Charset("UTF-16", "UTF-16", new String[] {"UTF_16"}),
850 new Charset("UnicodeBigUnmarked", "UTF-16BE", new String[] {"X-UTF-16BE", "UTF_16BE", "ISO-10646-UCS-2"}),
851 new Charset("UnicodeLittleUnmarked", "UTF-16LE", new String[] {"UTF_16LE", "X-UTF-16LE"}),
852 new Charset("Big5", "Big5", new String[] {"csBig5", "CN-Big5", "BIG-FIVE", "BIGFIVE"}),
853 new Charset("Big5_HKSCS", "Big5-HKSCS", new String[] {"big5hkscs"}),
854 new Charset("EUC_JP", "EUC-JP",
855 new String[] {"csEUCPkdFmtJapanese",
856 "Extended_UNIX_Code_Packed_Format_for_Japanese",
857 "eucjis", "x-eucjp", "eucjp", "x-euc-jp"}),
858 new Charset("EUC_KR", "EUC-KR",
859 new String[] {"csEUCKR", "ksc5601", "5601", "ksc5601_1987",
860 "ksc_5601", "ksc5601-1987", "ks_c_5601-1987",
861 "euckr"}),
862 new Charset("GB18030", "GB18030", new String[] {"gb18030-2000"}),
863 new Charset("EUC_CN", "GB2312", new String[] {"x-EUC-CN", "csGB2312", "euccn", "euc-cn", "gb2312-80", "gb2312-1980", "CN-GB", "CN-GB-ISOIR165"}),
864 new Charset("GBK", "windows-936", new String[] {"CP936", "MS936", "ms_936", "x-mswin-936"}),
865
866 new Charset("Cp037", "IBM037", new String[] {"ebcdic-cp-us", "ebcdic-cp-ca", "ebcdic-cp-wt", "ebcdic-cp-nl", "csIBM037"}),
867 new Charset("Cp273", "IBM273", new String[] {"csIBM273"}),
868 new Charset("Cp277", "IBM277", new String[] {"EBCDIC-CP-DK", "EBCDIC-CP-NO", "csIBM277"}),
869 new Charset("Cp278", "IBM278", new String[] {"CP278", "ebcdic-cp-fi", "ebcdic-cp-se", "csIBM278"}),
870 new Charset("Cp280", "IBM280", new String[] {"ebcdic-cp-it", "csIBM280"}),
871 new Charset("Cp284", "IBM284", new String[] {"ebcdic-cp-es", "csIBM284"}),
872 new Charset("Cp285", "IBM285", new String[] {"ebcdic-cp-gb", "csIBM285"}),
873 new Charset("Cp297", "IBM297", new String[] {"ebcdic-cp-fr", "csIBM297"}),
874 new Charset("Cp420", "IBM420", new String[] {"ebcdic-cp-ar1", "csIBM420"}),
875 new Charset("Cp424", "IBM424", new String[] {"ebcdic-cp-he", "csIBM424"}),
876 new Charset("Cp437", "IBM437", new String[] {"437", "csPC8CodePage437"}),
877 new Charset("Cp500", "IBM500", new String[] {"ebcdic-cp-be", "ebcdic-cp-ch", "csIBM500"}),
878 new Charset("Cp775", "IBM775", new String[] {"csPC775Baltic"}),
879 new Charset("Cp838", "IBM-Thai", new String[] {}),
880 new Charset("Cp850", "IBM850", new String[] {"850", "csPC850Multilingual"}),
881 new Charset("Cp852", "IBM852", new String[] {"852", "csPCp852"}),
882 new Charset("Cp855", "IBM855", new String[] {"855", "csIBM855"}),
883 new Charset("Cp857", "IBM857", new String[] {"857", "csIBM857"}),
884 new Charset("Cp858", "IBM00858",
885 new String[] {"CCSID00858", "CP00858",
886 "PC-Multilingual-850+euro"}),
887 new Charset("Cp860", "IBM860", new String[] {"860", "csIBM860"}),
888 new Charset("Cp861", "IBM861", new String[] {"861", "cp-is", "csIBM861"}),
889 new Charset("Cp862", "IBM862", new String[] {"862", "csPC862LatinHebrew"}),
890 new Charset("Cp863", "IBM863", new String[] {"863", "csIBM863"}),
891 new Charset("Cp864", "IBM864", new String[] {"cp864", "csIBM864"}),
892 new Charset("Cp865", "IBM865", new String[] {"865", "csIBM865"}),
893 new Charset("Cp866", "IBM866", new String[] {"866", "csIBM866"}),
894 new Charset("Cp868", "IBM868", new String[] {"cp-ar", "csIBM868"}),
895 new Charset("Cp869", "IBM869", new String[] {"cp-gr", "csIBM869"}),
896 new Charset("Cp870", "IBM870", new String[] {"ebcdic-cp-roece", "ebcdic-cp-yu", "csIBM870"}),
897 new Charset("Cp871", "IBM871", new String[] {"ebcdic-cp-is", "csIBM871"}),
898 new Charset("Cp918", "IBM918", new String[] {"ebcdic-cp-ar2", "csIBM918"}),
899 new Charset("Cp1026", "IBM1026", new String[] {"csIBM1026"}),
900 new Charset("Cp1047", "IBM1047", new String[] {"IBM-1047"}),
901 new Charset("Cp1140", "IBM01140",
902 new String[] {"CCSID01140", "CP01140",
903 "ebcdic-us-37+euro"}),
904 new Charset("Cp1141", "IBM01141",
905 new String[] {"CCSID01141", "CP01141",
906 "ebcdic-de-273+euro"}),
907 new Charset("Cp1142", "IBM01142", new String[] {"CCSID01142", "CP01142", "ebcdic-dk-277+euro", "ebcdic-no-277+euro"}),
908 new Charset("Cp1143", "IBM01143", new String[] {"CCSID01143", "CP01143", "ebcdic-fi-278+euro", "ebcdic-se-278+euro"}),
909 new Charset("Cp1144", "IBM01144", new String[] {"CCSID01144", "CP01144", "ebcdic-it-280+euro"}),
910 new Charset("Cp1145", "IBM01145", new String[] {"CCSID01145", "CP01145", "ebcdic-es-284+euro"}),
911 new Charset("Cp1146", "IBM01146", new String[] {"CCSID01146", "CP01146", "ebcdic-gb-285+euro"}),
912 new Charset("Cp1147", "IBM01147", new String[] {"CCSID01147", "CP01147", "ebcdic-fr-297+euro"}),
913 new Charset("Cp1148", "IBM01148", new String[] {"CCSID01148", "CP01148", "ebcdic-international-500+euro"}),
914 new Charset("Cp1149", "IBM01149", new String[] {"CCSID01149", "CP01149", "ebcdic-is-871+euro"}),
915 new Charset("Cp1250", "windows-1250", new String[] {}),
916 new Charset("Cp1251", "windows-1251", new String[] {}),
917 new Charset("Cp1252", "windows-1252", new String[] {}),
918 new Charset("Cp1253", "windows-1253", new String[] {}),
919 new Charset("Cp1254", "windows-1254", new String[] {}),
920 new Charset("Cp1255", "windows-1255", new String[] {}),
921 new Charset("Cp1256", "windows-1256", new String[] {}),
922 new Charset("Cp1257", "windows-1257", new String[] {}),
923 new Charset("Cp1258", "windows-1258", new String[] {}),
924 new Charset("ISO2022CN", "ISO-2022-CN", new String[] {}),
925 new Charset("ISO2022JP", "ISO-2022-JP", new String[] {"csISO2022JP", "JIS", "jis_encoding", "csjisencoding"}),
926 new Charset("ISO2022KR", "ISO-2022-KR", new String[] {"csISO2022KR"}),
927 new Charset("JIS_X0201", "JIS_X0201", new String[] {"X0201", "JIS0201", "csHalfWidthKatakana"}),
928 new Charset("JIS_X0212-1990", "JIS_X0212-1990", new String[] {"iso-ir-159", "x0212", "JIS0212", "csISO159JISX02121990"}),
929 new Charset("JIS_C6626-1983", "JIS_C6626-1983", new String[] {"x-JIS0208", "JIS0208", "csISO87JISX0208", "x0208", "JIS_X0208-1983", "iso-ir-87"}),
930 new Charset("SJIS", "Shift_JIS", new String[] {"MS_Kanji", "csShiftJIS", "shift-jis", "x-sjis", "pck"}),
931 new Charset("TIS620", "TIS-620", new String[] {}),
932 new Charset("MS932", "Windows-31J", new String[] {"windows-932", "csWindows31J", "x-ms-cp932"}),
933 new Charset("EUC_TW", "EUC-TW", new String[] {"x-EUC-TW", "cns11643", "euctw"}),
934 new Charset("x-Johab", "johab", new String[] {"johab", "cp1361", "ms1361", "ksc5601-1992", "ksc5601_1992"}),
935 new Charset("MS950_HKSCS", "", new String[] {}),
936 new Charset("MS874", "windows-874", new String[] {"cp874"}),
937 new Charset("MS949", "windows-949", new String[] {"windows949", "ms_949", "x-windows-949"}),
938 new Charset("MS950", "windows-950", new String[] {"x-windows-950"}),
939
940 new Charset("Cp737", null, new String[] {}),
941 new Charset("Cp856", null, new String[] {}),
942 new Charset("Cp875", null, new String[] {}),
943 new Charset("Cp921", null, new String[] {}),
944 new Charset("Cp922", null, new String[] {}),
945 new Charset("Cp930", null, new String[] {}),
946 new Charset("Cp933", null, new String[] {}),
947 new Charset("Cp935", null, new String[] {}),
948 new Charset("Cp937", null, new String[] {}),
949 new Charset("Cp939", null, new String[] {}),
950 new Charset("Cp942", null, new String[] {}),
951 new Charset("Cp942C", null, new String[] {}),
952 new Charset("Cp943", null, new String[] {}),
953 new Charset("Cp943C", null, new String[] {}),
954 new Charset("Cp948", null, new String[] {}),
955 new Charset("Cp949", null, new String[] {}),
956 new Charset("Cp949C", null, new String[] {}),
957 new Charset("Cp950", null, new String[] {}),
958 new Charset("Cp964", null, new String[] {}),
959 new Charset("Cp970", null, new String[] {}),
960 new Charset("Cp1006", null, new String[] {}),
961 new Charset("Cp1025", null, new String[] {}),
962 new Charset("Cp1046", null, new String[] {}),
963 new Charset("Cp1097", null, new String[] {}),
964 new Charset("Cp1098", null, new String[] {}),
965 new Charset("Cp1112", null, new String[] {}),
966 new Charset("Cp1122", null, new String[] {}),
967 new Charset("Cp1123", null, new String[] {}),
968 new Charset("Cp1124", null, new String[] {}),
969 new Charset("Cp1381", null, new String[] {}),
970 new Charset("Cp1383", null, new String[] {}),
971 new Charset("Cp33722", null, new String[] {}),
972 new Charset("Big5_Solaris", null, new String[] {}),
973 new Charset("EUC_JP_LINUX", null, new String[] {}),
974 new Charset("EUC_JP_Solaris", null, new String[] {}),
975 new Charset("ISCII91", null, new String[] {"x-ISCII91", "iscii"}),
976 new Charset("ISO2022_CN_CNS", null, new String[] {}),
977 new Charset("ISO2022_CN_GB", null, new String[] {}),
978 new Charset("x-iso-8859-11", null, new String[] {}),
979 new Charset("JISAutoDetect", null, new String[] {}),
980 new Charset("MacArabic", null, new String[] {}),
981 new Charset("MacCentralEurope", null, new String[] {}),
982 new Charset("MacCroatian", null, new String[] {}),
983 new Charset("MacCyrillic", null, new String[] {}),
984 new Charset("MacDingbat", null, new String[] {}),
985 new Charset("MacGreek", "MacGreek", new String[] {}),
986 new Charset("MacHebrew", null, new String[] {}),
987 new Charset("MacIceland", null, new String[] {}),
988 new Charset("MacRoman", "MacRoman", new String[] {"Macintosh", "MAC", "csMacintosh"}),
989 new Charset("MacRomania", null, new String[] {}),
990 new Charset("MacSymbol", null, new String[] {}),
991 new Charset("MacThai", null, new String[] {}),
992 new Charset("MacTurkish", null, new String[] {}),
993 new Charset("MacUkraine", null, new String[] {}),
994 new Charset("UnicodeBig", null, new String[] {}),
995 new Charset("UnicodeLittle", null, new String[] {})
996 };
997
998 /**
999 * Contains the canonical names of character sets which can be used to
1000 * decode bytes into Java chars.
1001 */
1002 private static SortedSet<String> decodingSupported = null;
1003
1004 /**
1005 * Contains the canonical names of character sets which can be used to
1006 * encode Java chars into bytes.
1007 */
1008 private static SortedSet<String> encodingSupported = null;
1009
1010 /**
1011 * Maps character set names to Charset objects. All possible names of
1012 * a charset will be mapped to the Charset.
1013 */
1014 private static Map<String, Charset> charsetMap = null;
1015
1016 static {
1017 decodingSupported = new TreeSet<String>();
1018 encodingSupported = new TreeSet<String>();
1019 byte[] dummy = new byte[] {'d', 'u', 'm', 'm', 'y'};
1020 for (Charset c : JAVA_CHARSETS) {
1021 try {
1022 new String(dummy, c.canonical);
1023 decodingSupported.add(c.canonical.toLowerCase());
1024 } catch (UnsupportedOperationException e) {
1025 } catch (UnsupportedEncodingException e) {
1026 }
1027 try {
1028 "dummy".getBytes(c.canonical);
1029 encodingSupported.add(c.canonical.toLowerCase());
1030 } catch (UnsupportedOperationException e) {
1031 } catch (UnsupportedEncodingException e) {
1032 }
1033 }
1034
1035 charsetMap = new HashMap<String, Charset>();
1036 for (Charset c : JAVA_CHARSETS) {
1037 charsetMap.put(c.canonical.toLowerCase(), c);
1038 if (c.mime != null) {
1039 charsetMap.put(c.mime.toLowerCase(), c);
1040 }
1041 if (c.aliases != null) {
1042 for (String str : c.aliases) {
1043 charsetMap.put(str.toLowerCase(), c);
1044 }
1045 }
1046 }
1047
1048 if (log.isDebugEnabled()) {
1049 log.debug("Character sets which support decoding: "
1050 + decodingSupported);
1051 log.debug("Character sets which support encoding: "
1052 + encodingSupported);
1053 }
1054 }
1055
1056 /** carriage return - line feed sequence */
1057 public static final String CRLF = "\r\n";
1058
1059 /** US-ASCII CR, carriage return (13) */
1060 public static final int CR = '\r';
1061
1062 /** US-ASCII LF, line feed (10) */
1063 public static final int LF = '\n';
1064
1065 /** US-ASCII SP, space (32) */
1066 public static final int SP = ' ';
1067
1068 /** US-ASCII HT, horizontal-tab (9) */
1069 public static final int HT = '\t';
1070
1071 public static final java.nio.charset.Charset US_ASCII = java.nio.charset.Charset
1072 .forName("US-ASCII");
1073
1074 public static final java.nio.charset.Charset ISO_8859_1 = java.nio.charset.Charset
1075 .forName("ISO-8859-1");
1076
1077 public static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset
1078 .forName("UTF-8");
1079
1080 public static final java.nio.charset.Charset DEFAULT_CHARSET = US_ASCII;
1081
1082 /**
1083 * Returns <code>true</code> if the specified character falls into the US
1084 * ASCII character set (Unicode range 0000 to 007f).
1085 *
1086 * @param ch
1087 * character to test.
1088 * @return <code>true</code> if the specified character falls into the US
1089 * ASCII character set, <code>false</code> otherwise.
1090 */
1091 public static boolean isASCII(char ch) {
1092 return (0xFF80 & ch) == 0;
1093 }
1094
1095 /**
1096 * Returns <code>true</code> if the specified string consists entirely of
1097 * US ASCII characters.
1098 *
1099 * @param s
1100 * string to test.
1101 * @return <code>true</code> if the specified string consists entirely of
1102 * US ASCII characters, <code>false</code> otherwise.
1103 */
1104 public static boolean isASCII(final String s) {
1105 if (s == null) {
1106 throw new IllegalArgumentException("String may not be null");
1107 }
1108 final int len = s.length();
1109 for (int i = 0; i < len; i++) {
1110 if (!isASCII(s.charAt(i))) {
1111 return false;
1112 }
1113 }
1114 return true;
1115 }
1116
1117 /**
1118 * Returns <code>true</code> if the specified character is a whitespace
1119 * character (CR, LF, SP or HT).
1120 *
1121 * @param ch
1122 * character to test.
1123 * @return <code>true</code> if the specified character is a whitespace
1124 * character, <code>false</code> otherwise.
1125 */
1126 public static boolean isWhitespace(char ch) {
1127 return ch == SP || ch == HT || ch == CR || ch == LF;
1128 }
1129
1130 /**
1131 * Returns <code>true</code> if the specified string consists entirely of
1132 * whitespace characters.
1133 *
1134 * @param s
1135 * string to test.
1136 * @return <code>true</code> if the specified string consists entirely of
1137 * whitespace characters, <code>false</code> otherwise.
1138 */
1139 public static boolean isWhitespace(final String s) {
1140 if (s == null) {
1141 throw new IllegalArgumentException("String may not be null");
1142 }
1143 final int len = s.length();
1144 for (int i = 0; i < len; i++) {
1145 if (!isWhitespace(s.charAt(i))) {
1146 return false;
1147 }
1148 }
1149 return true;
1150 }
1151
1152 /**
1153 * Determines if the VM supports encoding (chars to bytes) the
1154 * specified character set. NOTE: the given character set name may
1155 * not be known to the VM even if this method returns <code>true</code>.
1156 * Use {@link #toJavaCharset(String)} to get the canonical Java character
1157 * set name.
1158 *
1159 * @param charsetName the characters set name.
1160 * @return <code>true</code> if encoding is supported, <code>false</code>
1161 * otherwise.
1162 */
1163 public static boolean isEncodingSupported(String charsetName) {
1164 return encodingSupported.contains(charsetName.toLowerCase());
1165 }
1166
1167 /**
1168 * Determines if the VM supports decoding (bytes to chars) the
1169 * specified character set. NOTE: the given character set name may
1170 * not be known to the VM even if this method returns <code>true</code>.
1171 * Use {@link #toJavaCharset(String)} to get the canonical Java character
1172 * set name.
1173 *
1174 * @param charsetName the characters set name.
1175 * @return <code>true</code> if decoding is supported, <code>false</code>
1176 * otherwise.
1177 */
1178 public static boolean isDecodingSupported(String charsetName) {
1179 return decodingSupported.contains(charsetName.toLowerCase());
1180 }
1181
1182 /**
1183 * Gets the preferred MIME character set name for the specified
1184 * character set or <code>null</code> if not known.
1185 *
1186 * @param charsetName the character set name to look for.
1187 * @return the MIME preferred name or <code>null</code> if not known.
1188 */
1189 public static String toMimeCharset(String charsetName) {
1190 Charset c = charsetMap.get(charsetName.toLowerCase());
1191 if (c != null) {
1192 return c.mime;
1193 }
1194 return null;
1195 }
1196
1197 /**
1198 * Gets the canonical Java character set name for the specified
1199 * character set or <code>null</code> if not known. This should be
1200 * called before doing any conversions using the Java API. NOTE:
1201 * you must use {@link #isEncodingSupported(String)} or
1202 * {@link #isDecodingSupported(String)} to make sure the returned
1203 * Java character set is supported by the current VM.
1204 *
1205 * @param charsetName the character set name to look for.
1206 * @return the canonical Java name or <code>null</code> if not known.
1207 */
1208 public static String toJavaCharset(String charsetName) {
1209 Charset c = charsetMap.get(charsetName.toLowerCase());
1210 if (c != null) {
1211 return c.canonical;
1212 }
1213 return null;
1214 }
1215
1216 public static java.nio.charset.Charset getCharset(String charsetName) {
1217 String defaultCharset = "ISO-8859-1";
1218
1219 // Use the default chareset if given charset is null
1220 if(charsetName == null) charsetName = defaultCharset;
1221
1222 try {
1223 return java.nio.charset.Charset.forName(charsetName);
1224 } catch (IllegalCharsetNameException e) {
1225 log.info("Illegal charset " + charsetName + ", fallback to " + defaultCharset + ": " + e);
1226 // Use default charset on exception
1227 return java.nio.charset.Charset.forName(defaultCharset);
1228 } catch (UnsupportedCharsetException ex) {
1229 log.info("Unsupported charset " + charsetName + ", fallback to " + defaultCharset + ": " + ex);
1230 // Use default charset on exception
1231 return java.nio.charset.Charset.forName(defaultCharset);
1232 }
1233
1234 }
1235 /*
1236 * Uncomment the code below and run the main method to regenerate the
1237 * Javadoc table above when the known charsets change.
1238 */
1239
1240 /*
1241 private static String dumpHtmlTable() {
1242 List<Charset> l = new LinkedList<Charset>(Arrays.asList(JAVA_CHARSETS));
1243 Collections.sort(l);
1244 StringBuilder sb = new StringBuilder();
1245 sb.append(" * <table>\n");
1246 sb.append(" * <tr>\n");
1247 sb.append(" * <td>Canonical (Java) name</td>\n");
1248 sb.append(" * <td>MIME preferred</td>\n");
1249 sb.append(" * <td>Aliases</td>\n");
1250 sb.append(" * </tr>\n");
1251
1252 for (Charset c : l) {
1253 sb.append(" * <tr>\n");
1254 sb.append(" * <td>" + c.canonical + "</td>\n");
1255 sb.append(" * <td>" + (c.mime == null ? "?" : c.mime)+ "</td>\n");
1256 sb.append(" * <td>");
1257 for (int i = 0; c.aliases != null && i < c.aliases.length; i++) {
1258 sb.append(c.aliases[i] + " ");
1259 }
1260 sb.append("</td>\n");
1261 sb.append(" * </tr>\n");
1262 }
1263 sb.append(" * </table>\n");
1264 return sb.toString();
1265 }
1266
1267 public static void main(String[] args) {
1268 System.out.println(dumpHtmlTable());
1269 }
1270 */
1271 }