View Javadoc

1   /****************************************************************
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   ****************************************************************/
19  
20  package org.apache.james.mime4j.parser;
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.nio.charset.Charset;
27  import java.nio.charset.IllegalCharsetNameException;
28  import java.nio.charset.UnsupportedCharsetException;
29  import java.util.LinkedList;
30  
31  import org.apache.james.mime4j.MimeException;
32  import org.apache.james.mime4j.codec.Base64InputStream;
33  import org.apache.james.mime4j.codec.QuotedPrintableInputStream;
34  import org.apache.james.mime4j.descriptor.BodyDescriptor;
35  import org.apache.james.mime4j.io.BufferedLineReaderInputStream;
36  import org.apache.james.mime4j.io.LineNumberInputStream;
37  import org.apache.james.mime4j.io.LineNumberSource;
38  import org.apache.james.mime4j.util.CharsetUtil;
39  import org.apache.james.mime4j.util.MimeUtil;
40  
41  /**
42   * <p>
43   * Parses MIME (or RFC822) message streams of bytes or characters.
44   * The stream is converted into an event stream.
45   * <p>
46   * <p>
47   * Typical usage:
48   * </p>
49   * <pre>
50   *      MimeTokenStream stream = new MimeTokenStream();
51   *      stream.parse(new FileInputStream("mime.msg"));
52   *      for (int state = stream.getState();
53   *           state != MimeTokenStream.T_END_OF_STREAM;
54   *           state = stream.next()) {
55   *          switch (state) {
56   *            case MimeTokenStream.T_BODY:
57   *              System.out.println("Body detected, contents = "
58   *                + stream.getInputStream() + ", header data = "
59   *                + stream.getBodyDescriptor());
60   *              break;
61   *            case MimeTokenStream.T_FIELD:
62   *              System.out.println("Header field detected: "
63   *                + stream.getField());
64   *              break;
65   *            case MimeTokenStream.T_START_MULTIPART:
66   *              System.out.println("Multipart message detexted,"
67   *                + " header data = "
68   *                + stream.getBodyDescriptor());
69   *            ...
70   *          }
71   *      }
72   * </pre>
73   * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
74   * method {@link #parse(InputStream)} resets the token streams internal
75   * state. However, they are definitely <em>not</em> thread safe. If you
76   * have a multi threaded application, then the suggested use is to have
77   * one instance per thread.</p>
78   */
79  public class MimeTokenStream implements EntityStates, RecursionMode {
80      
81      /**
82       * Creates a stream that creates a more detailed body descriptor.
83       * @return <code>MimeTokenStream</code>, not null
84       */
85      public static final MimeTokenStream createMaximalDescriptorStream() {
86          MimeEntityConfig config = new MimeEntityConfig();
87          config.setMaximalBodyDescriptor(true);
88          return new MimeTokenStream(config);
89      }
90      
91      /**
92       * Creates a stream that strictly validates the input.
93       * @return <code>MimeTokenStream</code> which throws a 
94       * <code>MimeException</code> whenever possible issues 
95       * are dedicated in the input
96       */
97      public static final MimeTokenStream createStrictValidationStream() {
98          MimeEntityConfig config = new MimeEntityConfig();
99          config.setStrictParsing(true);
100         return new MimeTokenStream(config);
101     }
102     
103     private final MimeEntityConfig config;
104     private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>();
105     
106     private int state = T_END_OF_STREAM;
107     private EntityStateMachine currentStateMachine;
108     private int recursionMode = M_RECURSE;
109     private BufferedLineReaderInputStream inbuffer;
110     
111     /**
112      * Constructs a standard (lax) stream.
113      * Optional validation events will be logged only.
114      * Use {@link #createStrictValidationStream()} to create
115      * a stream that strictly validates the input.
116      */
117     public MimeTokenStream() {
118         this(new MimeEntityConfig());
119     }
120     
121     protected MimeTokenStream(final MimeEntityConfig config) {
122         super();
123         this.config = config;
124     }
125     
126     /** Instructs the {@code MimeTokenStream} to parse the given streams contents.
127      * If the {@code MimeTokenStream} has already been in use, resets the streams
128      * internal state.
129      */
130     public void parse(InputStream stream) {
131         doParse(stream, null);
132     }
133 
134     /** Instructs the {@code MimeTokenStream} to parse the given content with 
135      * the content type. The message stream is assumed to have no message header
136      * and is expected to begin with a message body. This can be the case when 
137      * the message content is transmitted using a different transport protocol 
138      * such as HTTP.
139      * <p/>
140      * If the {@code MimeTokenStream} has already been in use, resets the streams
141      * internal state.
142      */    
143     public void parseHeadless(InputStream stream, String contentType) {
144         if (contentType == null) {
145             throw new IllegalArgumentException("Content type may not be null");
146         }
147         doParse(stream, contentType);
148     }
149 
150     private void doParse(InputStream stream, String contentType) {
151         entities.clear();
152 
153         LineNumberSource lineSource = null;
154         if (config.isCountLineNumbers()) {
155             LineNumberInputStream lineInput = new LineNumberInputStream(stream);
156             lineSource = lineInput;
157             stream = lineInput;
158         }
159 
160         inbuffer = new BufferedLineReaderInputStream(
161                 stream, 
162                 4 * 1024,
163                 config.getMaxLineLen());
164         switch (recursionMode) {
165         case M_RAW:
166             RawEntity rawentity = new RawEntity(inbuffer);
167             currentStateMachine = rawentity;
168             break;
169         case M_NO_RECURSE:
170         case M_FLAT:
171             // expected to be called only at start of paring
172         case M_RECURSE:
173             MimeEntity mimeentity = new MimeEntity(
174                     lineSource,
175                     inbuffer,
176                     null, 
177                     T_START_MESSAGE, 
178                     T_END_MESSAGE,
179                     config);
180             mimeentity.setRecursionMode(recursionMode);
181             if (contentType != null) {
182                 mimeentity.skipHeader(contentType);
183             }
184             currentStateMachine = mimeentity;
185             break;
186         }
187         entities.add(currentStateMachine);
188         state = currentStateMachine.getState();
189     }
190 
191     /**
192      * Determines if this parser is currently in raw mode.
193      * 
194      * @return <code>true</code> if in raw mode, <code>false</code>
195      *         otherwise.
196      * @see #setRecursionMode(int)
197      */
198     public boolean isRaw() {
199         return recursionMode == M_RAW;
200     }
201     
202     /**
203      * Gets the current recursion mode.
204      * The recursion mode specifies the approach taken to parsing parts.
205      * {@link #M_RAW}  mode does not parse the part at all.
206      * {@link #M_RECURSE} mode recursively parses each mail
207      * when an <code>message/rfc822</code> part is encounted;
208      * {@link #M_NO_RECURSE} does not.
209      * @return {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
210      */
211     public int getRecursionMode() {
212         return recursionMode;
213     }
214     
215     /**
216      * Sets the current recursion.
217      * The recursion mode specifies the approach taken to parsing parts.
218      * {@link #M_RAW}  mode does not parse the part at all.
219      * {@link #M_RECURSE} mode recursively parses each mail
220      * when an <code>message/rfc822</code> part is encounted;
221      * {@link #M_NO_RECURSE} does not.
222      * @param mode {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
223      */
224     public void setRecursionMode(int mode) {
225         recursionMode = mode;
226         if (currentStateMachine != null) {
227             currentStateMachine.setRecursionMode(mode);
228         }
229     }
230 
231     /**
232      * Finishes the parsing and stops reading lines.
233      * NOTE: No more lines will be parsed but the parser
234      * will still call 
235      * {@link ContentHandler#endMultipart()},
236      * {@link ContentHandler#endBodyPart()},
237      * {@link ContentHandler#endMessage()}, etc to match previous calls
238      * to 
239      * {@link ContentHandler#startMultipart(BodyDescriptor)},
240      * {@link ContentHandler#startBodyPart()},
241      * {@link ContentHandler#startMessage()}, etc.
242      */
243     public void stop() {
244         inbuffer.truncate();
245     }
246 
247     /**
248      * Returns the current state.
249      */
250     public int getState() {
251         return state;
252     }
253 
254     /**
255      * This method returns the raw entity, preamble, or epilogue contents.
256      * <p/>
257      * This method is valid, if {@link #getState()} returns either of
258      * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
259      * 
260      * @return Data stream, depending on the current state.
261      * @throws IllegalStateException {@link #getState()} returns an
262      *   invalid value.
263      */
264     public InputStream getInputStream() {
265         return currentStateMachine.getContentStream();
266     }
267     
268     /**
269      * This method returns a transfer decoded stream based on the MIME 
270      * fields with the standard defaults.
271      * <p/>
272      * This method is valid, if {@link #getState()} returns either of
273      * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
274      * 
275      * @return Data stream, depending on the current state.
276      * @throws IllegalStateException {@link #getState()} returns an
277      *   invalid value.
278      */
279     public InputStream getDecodedInputStream() {
280         BodyDescriptor bodyDescriptor = getBodyDescriptor();
281         String transferEncoding = bodyDescriptor.getTransferEncoding();
282         InputStream dataStream = currentStateMachine.getContentStream();
283         if (MimeUtil.isBase64Encoding(transferEncoding)) {
284             dataStream = new Base64InputStream(dataStream);
285         } else if (MimeUtil.isQuotedPrintableEncoded(transferEncoding)) {
286             dataStream = new QuotedPrintableInputStream(dataStream);
287         }
288         return dataStream;
289     }
290 
291     /**
292      * Gets a reader configured for the current body or body part.
293      * The reader will return a transfer and charset decoded 
294      * stream of characters based on the MIME fields with the standard
295      * defaults.
296      * This is a conveniance method and relies on {@link #getInputStream()}.
297      * Consult the javadoc for that method for known limitations.
298      * 
299      * @return <code>Reader</code>, not null
300      * @see #getInputStream 
301      * @throws IllegalStateException {@link #getState()} returns an
302      *   invalid value 
303      * @throws UnsupportedCharsetException if there is no JVM support 
304      * for decoding the charset
305      * @throws IllegalCharsetNameException if the charset name specified
306      * in the mime type is illegal
307      */
308     public Reader getReader() {
309         final BodyDescriptor bodyDescriptor = getBodyDescriptor();
310         final String mimeCharset = bodyDescriptor.getCharset();
311         final Charset charset;
312         if (mimeCharset == null || "".equals(mimeCharset)) {
313             charset = CharsetUtil.US_ASCII;
314         } else {
315             charset = Charset.forName(mimeCharset);
316         }
317         final InputStream instream = getDecodedInputStream();
318         return new InputStreamReader(instream, charset);
319     }
320     
321     /**
322      * <p>Gets a descriptor for the current entity.
323      * This method is valid if {@link #getState()} returns:</p>
324      * <ul>
325      * <li>{@link #T_BODY}</li>
326      * <li>{@link #T_START_MULTIPART}</li>
327      * <li>{@link #T_EPILOGUE}</li>
328      * <li>{@link #T_PREAMBLE}</li>
329      * </ul>
330      * @return <code>BodyDescriptor</code>, not nulls
331      */
332     public BodyDescriptor getBodyDescriptor() {
333         return currentStateMachine.getBodyDescriptor();
334     }
335 
336     /**
337      * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
338      * @return String with the fields raw contents.
339      * @throws IllegalStateException {@link #getState()} returns another
340      *   value than {@link #T_FIELD}.
341      */
342     public Field getField() {
343         return currentStateMachine.getField();
344     }
345     
346     /**
347      * This method advances the token stream to the next token.
348      * @throws IllegalStateException The method has been called, although
349      *   {@link #getState()} was already {@link #T_END_OF_STREAM}.
350      */
351     public int next() throws IOException, MimeException {
352         if (state == T_END_OF_STREAM  ||  currentStateMachine == null) {
353             throw new IllegalStateException("No more tokens are available.");
354         }
355         while (currentStateMachine != null) {
356             EntityStateMachine next = currentStateMachine.advance();
357             if (next != null) {
358                 entities.add(next);
359                 currentStateMachine = next;
360             }
361             state = currentStateMachine.getState();
362             if (state != T_END_OF_STREAM) {
363                 return state;
364             }
365             entities.removeLast();
366             if (entities.isEmpty()) {
367                 currentStateMachine = null;
368             } else {
369                 currentStateMachine = entities.getLast();
370                 currentStateMachine.setRecursionMode(recursionMode);
371             }
372         }
373         state = T_END_OF_STREAM;
374         return state;
375     }
376 
377     /**
378      * Renders a state as a string suitable for logging.
379      * @param state 
380      * @return rendered as string, not null
381      */
382     public static final String stateToString(int state) {
383         return AbstractEntity.stateToString(state);
384     }
385 }