View Javadoc

1   /****************************************************************
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   ****************************************************************/
19  
20  package org.apache.james.transport.mailets;
21  
22  import org.apache.mailet.base.GenericMailet;
23  import org.apache.mailet.Mail;
24  import org.apache.mailet.MailetException;
25  
26  import javax.mail.Message;
27  import javax.mail.MessagingException;
28  import javax.mail.Multipart;
29  import javax.mail.Part;
30  import javax.mail.internet.ContentType;
31  
32  import java.io.IOException;
33  import java.util.HashMap;
34  
35  /**
36   * Keep only the text part of a message.
37   * <p>If the message is text only then it doesn't touch it, if it is a multipart it
38   * transform it a in plain text message with the first text part found.<br>
39   * - text/plain<br>
40   * - text/html => with a conversion to text only<br>
41   * - text/* as is.</p>
42   */
43  public class OnlyText extends GenericMailet {
44      private static final String PARAMETER_NAME_NOTEXT_PROCESSOR = "NoTextProcessor";
45      
46      private String optionsNotextProcessor = null;
47      private final HashMap charMap = new HashMap();
48      
49      /**
50       * returns a String describing this mailet.
51       * 
52       * @return A desciption of this mailet
53       */
54      public String getMailetInfo() {
55          return "OnlyText";
56      }
57  
58      public void init() throws MailetException {
59          optionsNotextProcessor = getInitParameter(PARAMETER_NAME_NOTEXT_PROCESSOR);
60          initEntityTable();
61      }
62  
63      private int[] process(Mail mail, Multipart mp, int found, int htmlPart, int stringPart)  throws MessagingException, IOException {
64          for (int i = 0; found < 0 && i < mp.getCount(); i++) {
65              Object content = null;
66              try {
67                  content = mp.getBodyPart(i).getContent();
68              } catch (java.io.UnsupportedEncodingException e) {
69                  log("Caught error [" + e.getMessage() + "] in a text/plain part, skipping...");
70              }
71              if (content != null) {
72                  if (mp.getBodyPart(i).isMimeType("text/plain")) {
73                      setContentFromPart(mail.getMessage(), mp.getBodyPart(i), null, false);
74                      found = 1;
75                  } 
76                  else if (htmlPart == -1 && mp.getBodyPart(i).isMimeType("text/html"))
77                      htmlPart = i;
78                      
79                  else if (stringPart == -1 && content instanceof String)
80                      stringPart = i;
81              
82                  else if (content instanceof Multipart) {
83                      int[] res = process(mail, (Multipart) content, found, htmlPart, stringPart);
84                      found = res[0];
85                      htmlPart = res[1];
86                      stringPart = res[2];
87                  }
88              }
89          }
90          
91          return new int[] {found, htmlPart, stringPart};
92          
93      }
94      
95      public void service(Mail mail) throws MailetException {
96          try {
97              Object content = mail.getMessage().getContent();
98              if (content instanceof Multipart) {
99                  Multipart mp = (Multipart) content;
100                 
101                 int found = -1;
102                 int htmlPart = -1;
103                 int stringPart = -1;
104                 int[] res = process(mail, (Multipart) content, found, htmlPart, stringPart);
105                 found = res[0];
106                 htmlPart = res[1];
107                 stringPart = res[2];
108                 
109                 if (found < 0 && htmlPart != -1) {
110                     setContentFromPart(mail.getMessage(), mp.getBodyPart(htmlPart), html2Text((String) mp.getBodyPart(htmlPart).getContent()), true);
111                     found = 1;
112                 }
113                 
114                 if (found < 0 && stringPart != -1) {
115                     setContentFromPart(mail.getMessage(), mp.getBodyPart(htmlPart), null, false);
116                     found = 1;
117                 }
118                 
119 
120                 if (found < 0 && optionsNotextProcessor != null) mail.setState(optionsNotextProcessor);
121                 
122             } 
123             
124             else if (!(content instanceof String) && optionsNotextProcessor != null) mail.setState(optionsNotextProcessor);
125             
126             else if (mail.getMessage().isMimeType("text/html")) {
127                 setContentFromPart(mail.getMessage(), mail.getMessage(), html2Text((String) mail.getMessage().getContent()), true);
128             }
129             
130         } catch (IOException e) {
131             throw new MailetException("Failed fetching text part", e);
132             
133         } catch (MessagingException e) {
134             throw new MailetException("Failed fetching text part", e);
135         }
136     }
137     
138     private static void setContentFromPart(Message m, Part p, String newText, boolean setTextPlain) throws MessagingException, IOException {
139         String contentType = p.getContentType();
140         if (setTextPlain) {
141             ContentType ct = new ContentType(contentType);
142             ct.setPrimaryType("text");
143             ct.setSubType("plain");
144             contentType = ct.toString();
145         }
146         m.setContent(newText != null ? newText : p.getContent(), contentType);
147         String[] h = p.getHeader("Content-Transfer-Encoding");
148         if (h != null && h.length > 0) m.setHeader("Content-Transfer-Encoding", h[0]);
149         m.saveChanges();
150     }
151     
152     public String html2Text(String html) {
153         return decodeEntities(html
154             .replaceAll("\\<([bB][rR]|[dD][lL])[ ]*[/]*[ ]*\\>", "\n")
155             .replaceAll("\\</([pP]|[hH]5|[dD][tT]|[dD][dD]|[dD][iI][vV])[ ]*\\>", "\n")
156             .replaceAll("\\<[lL][iI][ ]*[/]*[ ]*\\>", "\n* ")
157             .replaceAll("\\<[dD][dD][ ]*[/]*[ ]*\\>", " - ")
158             .replaceAll("\\<.*?\\>", ""));
159     }
160     
161     public String decodeEntities(String data) {
162         StringBuffer buffer = new StringBuffer();
163         StringBuffer res = new StringBuffer();
164         int lastAmp = -1;
165         for (int i = 0; i < data.length(); i++) {
166             char c = data.charAt(i);
167 
168             if (c == '&' && lastAmp == -1) lastAmp = buffer.length();
169             else if (c == ';' && (lastAmp > -1)) { // && (lastAmp > (buffer.length() - 7))) { // max: &#xxxx;
170                 if (charMap.containsKey(buffer.toString())) res.append((String) charMap.get(buffer.toString()));
171                 else res.append("&" + buffer.toString() + ";");
172                 lastAmp = -1;
173                 buffer = new StringBuffer();
174             } 
175             else if (lastAmp == -1) res.append(c);
176             else buffer.append(c);
177         }
178         return res.toString();
179     }
180 
181     private final void initEntityTable() {
182         for (int index = 11; index < 32; index++) charMap.put("#0" + index, String.valueOf((char) index));
183         for (int index = 32; index < 128; index++) charMap.put("#" + index, String.valueOf((char) index));
184         for (int index = 128; index < 256; index++) charMap.put("#" + index, String.valueOf((char) index));
185         
186         // A complete reference is here:
187         // http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
188         
189         charMap.put("#09", "\t");
190         charMap.put("#10", "\n");
191         charMap.put("#13", "\r");
192         charMap.put("#60", "<");
193         charMap.put("#62", ">");
194 
195         charMap.put("lt", "<");
196         charMap.put("gt", ">");
197         charMap.put("amp", "&");
198         charMap.put("nbsp", " ");
199         charMap.put("quot", "\"");
200 
201         charMap.put("iexcl", "\u00A1");
202         charMap.put("cent", "\u00A2");
203         charMap.put("pound", "\u00A3");
204         charMap.put("curren", "\u00A4");
205         charMap.put("yen", "\u00A5");
206         charMap.put("brvbar", "\u00A6");
207         charMap.put("sect", "\u00A7");
208         charMap.put("uml", "\u00A8");
209         charMap.put("copy", "\u00A9");
210         charMap.put("ordf", "\u00AA");
211         charMap.put("laquo", "\u00AB");
212         charMap.put("not", "\u00AC");
213         charMap.put("shy", "\u00AD");
214         charMap.put("reg", "\u00AE");
215         charMap.put("macr", "\u00AF");
216         charMap.put("deg", "\u00B0");
217         charMap.put("plusmn", "\u00B1");
218         charMap.put("sup2", "\u00B2");
219         charMap.put("sup3", "\u00B3");
220 
221         charMap.put("acute", "\u00B4");
222         charMap.put("micro", "\u00B5");
223         charMap.put("para", "\u00B6");
224         charMap.put("middot", "\u00B7");
225         charMap.put("cedil", "\u00B8");
226         charMap.put("sup1", "\u00B9");
227         charMap.put("ordm", "\u00BA");
228         charMap.put("raquo", "\u00BB");
229         charMap.put("frac14", "\u00BC");
230         charMap.put("frac12", "\u00BD");
231         charMap.put("frac34", "\u00BE");
232         charMap.put("iquest", "\u00BF");
233 
234         charMap.put("Agrave", "\u00C0");
235         charMap.put("Aacute", "\u00C1");
236         charMap.put("Acirc", "\u00C2");
237         charMap.put("Atilde", "\u00C3");
238         charMap.put("Auml", "\u00C4");
239         charMap.put("Aring", "\u00C5");
240         charMap.put("AElig", "\u00C6");
241         charMap.put("Ccedil", "\u00C7");
242         charMap.put("Egrave", "\u00C8");
243         charMap.put("Eacute", "\u00C9");
244         charMap.put("Ecirc", "\u00CA");
245         charMap.put("Euml", "\u00CB");
246         charMap.put("Igrave", "\u00CC");
247         charMap.put("Iacute", "\u00CD");
248         charMap.put("Icirc", "\u00CE");
249         charMap.put("Iuml", "\u00CF");
250 
251         charMap.put("ETH", "\u00D0");
252         charMap.put("Ntilde", "\u00D1");
253         charMap.put("Ograve", "\u00D2");
254         charMap.put("Oacute", "\u00D3");
255         charMap.put("Ocirc", "\u00D4");
256         charMap.put("Otilde", "\u00D5");
257         charMap.put("Ouml", "\u00D6");
258         charMap.put("times", "\u00D7");
259         charMap.put("Oslash", "\u00D8");
260         charMap.put("Ugrave", "\u00D9");
261         charMap.put("Uacute", "\u00DA");
262         charMap.put("Ucirc", "\u00DB");
263         charMap.put("Uuml", "\u00DC");
264         charMap.put("Yacute", "\u00DD");
265         charMap.put("THORN", "\u00DE");
266         charMap.put("szlig", "\u00DF");
267 
268         charMap.put("agrave", "\u00E0");
269         charMap.put("aacute", "\u00E1");
270         charMap.put("acirc", "\u00E2");
271         charMap.put("atilde", "\u00E3");
272         charMap.put("auml", "\u00E4");
273         charMap.put("aring", "\u00E5");
274         charMap.put("aelig", "\u00E6");
275         charMap.put("ccedil", "\u00E7");
276         charMap.put("egrave", "\u00E8");
277         charMap.put("eacute", "\u00E9");
278         charMap.put("ecirc", "\u00EA");
279         charMap.put("euml", "\u00EB");
280         charMap.put("igrave", "\u00EC");
281         charMap.put("iacute", "\u00ED");
282         charMap.put("icirc", "\u00EE");
283         charMap.put("iuml", "\u00EF");
284 
285         charMap.put("eth", "\u00F0");
286         charMap.put("ntilde", "\u00F1");
287         charMap.put("ograve", "\u00F2");
288         charMap.put("oacute", "\u00F3");
289         charMap.put("ocirc", "\u00F4");
290         charMap.put("otilde", "\u00F5");
291         charMap.put("ouml", "\u00F6");
292         charMap.put("divid", "\u00F7");
293         charMap.put("oslash", "\u00F8");
294         charMap.put("ugrave", "\u00F9");
295         charMap.put("uacute", "\u00FA");
296         charMap.put("ucirc", "\u00FB");
297         charMap.put("uuml", "\u00FC");
298         charMap.put("yacute", "\u00FD");
299         charMap.put("thorn", "\u00FE");
300         charMap.put("yuml", "\u00FF");
301         charMap.put("euro", "\u0080");
302     }
303 }