1 /****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one *
3 * or more contributor license agreements. See the NOTICE file *
4 * distributed with this work for additional information *
5 * regarding copyright ownership. The ASF licenses this file *
6 * to you under the Apache License, Version 2.0 (the *
7 * "License"); you may not use this file except in compliance *
8 * with the License. You may obtain a copy of the License at *
9 * *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, *
13 * software distributed under the License is distributed on an *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15 * KIND, either express or implied. See the License for the *
16 * specific language governing permissions and limitations *
17 * under the License. *
18 ****************************************************************/
19
20 package org.apache.james.mime4j.parser;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.nio.charset.Charset;
27 import java.nio.charset.IllegalCharsetNameException;
28 import java.nio.charset.UnsupportedCharsetException;
29 import java.util.LinkedList;
30
31 import org.apache.james.mime4j.MimeException;
32 import org.apache.james.mime4j.codec.Base64InputStream;
33 import org.apache.james.mime4j.codec.QuotedPrintableInputStream;
34 import org.apache.james.mime4j.descriptor.BodyDescriptor;
35 import org.apache.james.mime4j.io.BufferedLineReaderInputStream;
36 import org.apache.james.mime4j.io.LineNumberInputStream;
37 import org.apache.james.mime4j.io.LineNumberSource;
38 import org.apache.james.mime4j.util.CharsetUtil;
39 import org.apache.james.mime4j.util.MimeUtil;
40
41 /**
42 * <p>
43 * Parses MIME (or RFC822) message streams of bytes or characters.
44 * The stream is converted into an event stream.
45 * <p>
46 * <p>
47 * Typical usage:
48 * </p>
49 * <pre>
50 * MimeTokenStream stream = new MimeTokenStream();
51 * stream.parse(new FileInputStream("mime.msg"));
52 * for (int state = stream.getState();
53 * state != MimeTokenStream.T_END_OF_STREAM;
54 * state = stream.next()) {
55 * switch (state) {
56 * case MimeTokenStream.T_BODY:
57 * System.out.println("Body detected, contents = "
58 * + stream.getInputStream() + ", header data = "
59 * + stream.getBodyDescriptor());
60 * break;
61 * case MimeTokenStream.T_FIELD:
62 * System.out.println("Header field detected: "
63 * + stream.getField());
64 * break;
65 * case MimeTokenStream.T_START_MULTIPART:
66 * System.out.println("Multipart message detexted,"
67 * + " header data = "
68 * + stream.getBodyDescriptor());
69 * ...
70 * }
71 * }
72 * </pre>
73 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
74 * method {@link #parse(InputStream)} resets the token streams internal
75 * state. However, they are definitely <em>not</em> thread safe. If you
76 * have a multi threaded application, then the suggested use is to have
77 * one instance per thread.</p>
78 */
79 public class MimeTokenStream implements EntityStates, RecursionMode {
80
81 /**
82 * Creates a stream that creates a more detailed body descriptor.
83 * @return <code>MimeTokenStream</code>, not null
84 */
85 public static final MimeTokenStream createMaximalDescriptorStream() {
86 MimeEntityConfig config = new MimeEntityConfig();
87 config.setMaximalBodyDescriptor(true);
88 return new MimeTokenStream(config);
89 }
90
91 /**
92 * Creates a stream that strictly validates the input.
93 * @return <code>MimeTokenStream</code> which throws a
94 * <code>MimeException</code> whenever possible issues
95 * are dedicated in the input
96 */
97 public static final MimeTokenStream createStrictValidationStream() {
98 MimeEntityConfig config = new MimeEntityConfig();
99 config.setStrictParsing(true);
100 return new MimeTokenStream(config);
101 }
102
103 private final MimeEntityConfig config;
104 private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>();
105
106 private int state = T_END_OF_STREAM;
107 private EntityStateMachine currentStateMachine;
108 private int recursionMode = M_RECURSE;
109 private BufferedLineReaderInputStream inbuffer;
110
111 /**
112 * Constructs a standard (lax) stream.
113 * Optional validation events will be logged only.
114 * Use {@link #createStrictValidationStream()} to create
115 * a stream that strictly validates the input.
116 */
117 public MimeTokenStream() {
118 this(new MimeEntityConfig());
119 }
120
121 protected MimeTokenStream(final MimeEntityConfig config) {
122 super();
123 this.config = config;
124 }
125
126 /** Instructs the {@code MimeTokenStream} to parse the given streams contents.
127 * If the {@code MimeTokenStream} has already been in use, resets the streams
128 * internal state.
129 */
130 public void parse(InputStream stream) {
131 doParse(stream, null);
132 }
133
134 /** Instructs the {@code MimeTokenStream} to parse the given content with
135 * the content type. The message stream is assumed to have no message header
136 * and is expected to begin with a message body. This can be the case when
137 * the message content is transmitted using a different transport protocol
138 * such as HTTP.
139 * <p/>
140 * If the {@code MimeTokenStream} has already been in use, resets the streams
141 * internal state.
142 */
143 public void parseHeadless(InputStream stream, String contentType) {
144 if (contentType == null) {
145 throw new IllegalArgumentException("Content type may not be null");
146 }
147 doParse(stream, contentType);
148 }
149
150 private void doParse(InputStream stream, String contentType) {
151 entities.clear();
152
153 LineNumberSource lineSource = null;
154 if (config.isCountLineNumbers()) {
155 LineNumberInputStream lineInput = new LineNumberInputStream(stream);
156 lineSource = lineInput;
157 stream = lineInput;
158 }
159
160 inbuffer = new BufferedLineReaderInputStream(
161 stream,
162 4 * 1024,
163 config.getMaxLineLen());
164 switch (recursionMode) {
165 case M_RAW:
166 RawEntity rawentity = new RawEntity(inbuffer);
167 currentStateMachine = rawentity;
168 break;
169 case M_NO_RECURSE:
170 case M_FLAT:
171 // expected to be called only at start of paring
172 case M_RECURSE:
173 MimeEntity mimeentity = new MimeEntity(
174 lineSource,
175 inbuffer,
176 null,
177 T_START_MESSAGE,
178 T_END_MESSAGE,
179 config);
180 mimeentity.setRecursionMode(recursionMode);
181 if (contentType != null) {
182 mimeentity.skipHeader(contentType);
183 }
184 currentStateMachine = mimeentity;
185 break;
186 }
187 entities.add(currentStateMachine);
188 state = currentStateMachine.getState();
189 }
190
191 /**
192 * Determines if this parser is currently in raw mode.
193 *
194 * @return <code>true</code> if in raw mode, <code>false</code>
195 * otherwise.
196 * @see #setRecursionMode(int)
197 */
198 public boolean isRaw() {
199 return recursionMode == M_RAW;
200 }
201
202 /**
203 * Gets the current recursion mode.
204 * The recursion mode specifies the approach taken to parsing parts.
205 * {@link #M_RAW} mode does not parse the part at all.
206 * {@link #M_RECURSE} mode recursively parses each mail
207 * when an <code>message/rfc822</code> part is encounted;
208 * {@link #M_NO_RECURSE} does not.
209 * @return {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
210 */
211 public int getRecursionMode() {
212 return recursionMode;
213 }
214
215 /**
216 * Sets the current recursion.
217 * The recursion mode specifies the approach taken to parsing parts.
218 * {@link #M_RAW} mode does not parse the part at all.
219 * {@link #M_RECURSE} mode recursively parses each mail
220 * when an <code>message/rfc822</code> part is encounted;
221 * {@link #M_NO_RECURSE} does not.
222 * @param mode {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
223 */
224 public void setRecursionMode(int mode) {
225 recursionMode = mode;
226 if (currentStateMachine != null) {
227 currentStateMachine.setRecursionMode(mode);
228 }
229 }
230
231 /**
232 * Finishes the parsing and stops reading lines.
233 * NOTE: No more lines will be parsed but the parser
234 * will still call
235 * {@link ContentHandler#endMultipart()},
236 * {@link ContentHandler#endBodyPart()},
237 * {@link ContentHandler#endMessage()}, etc to match previous calls
238 * to
239 * {@link ContentHandler#startMultipart(BodyDescriptor)},
240 * {@link ContentHandler#startBodyPart()},
241 * {@link ContentHandler#startMessage()}, etc.
242 */
243 public void stop() {
244 inbuffer.truncate();
245 }
246
247 /**
248 * Returns the current state.
249 */
250 public int getState() {
251 return state;
252 }
253
254 /**
255 * This method returns the raw entity, preamble, or epilogue contents.
256 * <p/>
257 * This method is valid, if {@link #getState()} returns either of
258 * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
259 *
260 * @return Data stream, depending on the current state.
261 * @throws IllegalStateException {@link #getState()} returns an
262 * invalid value.
263 */
264 public InputStream getInputStream() {
265 return currentStateMachine.getContentStream();
266 }
267
268 /**
269 * This method returns a transfer decoded stream based on the MIME
270 * fields with the standard defaults.
271 * <p/>
272 * This method is valid, if {@link #getState()} returns either of
273 * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
274 *
275 * @return Data stream, depending on the current state.
276 * @throws IllegalStateException {@link #getState()} returns an
277 * invalid value.
278 */
279 public InputStream getDecodedInputStream() {
280 BodyDescriptor bodyDescriptor = getBodyDescriptor();
281 String transferEncoding = bodyDescriptor.getTransferEncoding();
282 InputStream dataStream = currentStateMachine.getContentStream();
283 if (MimeUtil.isBase64Encoding(transferEncoding)) {
284 dataStream = new Base64InputStream(dataStream);
285 } else if (MimeUtil.isQuotedPrintableEncoded(transferEncoding)) {
286 dataStream = new QuotedPrintableInputStream(dataStream);
287 }
288 return dataStream;
289 }
290
291 /**
292 * Gets a reader configured for the current body or body part.
293 * The reader will return a transfer and charset decoded
294 * stream of characters based on the MIME fields with the standard
295 * defaults.
296 * This is a conveniance method and relies on {@link #getInputStream()}.
297 * Consult the javadoc for that method for known limitations.
298 *
299 * @return <code>Reader</code>, not null
300 * @see #getInputStream
301 * @throws IllegalStateException {@link #getState()} returns an
302 * invalid value
303 * @throws UnsupportedCharsetException if there is no JVM support
304 * for decoding the charset
305 * @throws IllegalCharsetNameException if the charset name specified
306 * in the mime type is illegal
307 */
308 public Reader getReader() {
309 final BodyDescriptor bodyDescriptor = getBodyDescriptor();
310 final String mimeCharset = bodyDescriptor.getCharset();
311 final Charset charset;
312 if (mimeCharset == null || "".equals(mimeCharset)) {
313 charset = CharsetUtil.US_ASCII;
314 } else {
315 charset = Charset.forName(mimeCharset);
316 }
317 final InputStream instream = getDecodedInputStream();
318 return new InputStreamReader(instream, charset);
319 }
320
321 /**
322 * <p>Gets a descriptor for the current entity.
323 * This method is valid if {@link #getState()} returns:</p>
324 * <ul>
325 * <li>{@link #T_BODY}</li>
326 * <li>{@link #T_START_MULTIPART}</li>
327 * <li>{@link #T_EPILOGUE}</li>
328 * <li>{@link #T_PREAMBLE}</li>
329 * </ul>
330 * @return <code>BodyDescriptor</code>, not nulls
331 */
332 public BodyDescriptor getBodyDescriptor() {
333 return currentStateMachine.getBodyDescriptor();
334 }
335
336 /**
337 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
338 * @return String with the fields raw contents.
339 * @throws IllegalStateException {@link #getState()} returns another
340 * value than {@link #T_FIELD}.
341 */
342 public Field getField() {
343 return currentStateMachine.getField();
344 }
345
346 /**
347 * This method advances the token stream to the next token.
348 * @throws IllegalStateException The method has been called, although
349 * {@link #getState()} was already {@link #T_END_OF_STREAM}.
350 */
351 public int next() throws IOException, MimeException {
352 if (state == T_END_OF_STREAM || currentStateMachine == null) {
353 throw new IllegalStateException("No more tokens are available.");
354 }
355 while (currentStateMachine != null) {
356 EntityStateMachine next = currentStateMachine.advance();
357 if (next != null) {
358 entities.add(next);
359 currentStateMachine = next;
360 }
361 state = currentStateMachine.getState();
362 if (state != T_END_OF_STREAM) {
363 return state;
364 }
365 entities.removeLast();
366 if (entities.isEmpty()) {
367 currentStateMachine = null;
368 } else {
369 currentStateMachine = entities.getLast();
370 currentStateMachine.setRecursionMode(recursionMode);
371 }
372 }
373 state = T_END_OF_STREAM;
374 return state;
375 }
376
377 /**
378 * Renders a state as a string suitable for logging.
379 * @param state
380 * @return rendered as string, not null
381 */
382 public static final String stateToString(int state) {
383 return AbstractEntity.stateToString(state);
384 }
385 }