View Javadoc

1   /****************************************************************
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   ****************************************************************/
19  
20  
21  
22  
23  package org.apache.james.management.impl;
24  
25  import java.io.BufferedReader;
26  import java.io.File;
27  import java.io.FileNotFoundException;
28  import java.io.FileOutputStream;
29  import java.io.FileReader;
30  import java.io.IOException;
31  import java.io.InputStreamReader;
32  import java.io.PrintWriter;
33  import java.io.InputStream;
34  import java.sql.SQLException;
35  import java.sql.Connection;
36  import java.util.Map;
37  
38  import net.fortuna.mstor.data.MboxFile;
39  
40  import org.apache.avalon.cornerstone.services.datasources.DataSourceSelector;
41  import org.apache.avalon.excalibur.datasource.DataSourceComponent;
42  import org.apache.avalon.framework.activity.Initializable;
43  import org.apache.avalon.framework.configuration.Configurable;
44  import org.apache.avalon.framework.configuration.Configuration;
45  import org.apache.avalon.framework.configuration.ConfigurationException;
46  import org.apache.avalon.framework.service.ServiceException;
47  import org.apache.avalon.framework.service.ServiceManager;
48  import org.apache.avalon.framework.service.Serviceable;
49  import org.apache.james.management.BayesianAnalyzerManagementException;
50  import org.apache.james.management.BayesianAnalyzerManagementMBean;
51  import org.apache.james.management.BayesianAnalyzerManagementService;
52  import org.apache.james.services.FileSystem;
53  import org.apache.james.util.bayesian.JDBCBayesianAnalyzer;
54  
55  import com.thoughtworks.xstream.XStream;
56  import com.thoughtworks.xstream.io.xml.DomDriver;
57  
58  /**
59   * Management for BayesianAnalyzer
60   */
61  public class BayesianAnalyzerManagement implements BayesianAnalyzerManagementService, Serviceable, Initializable, Configurable, BayesianAnalyzerManagementMBean {
62  
63      private final static String HAM = "HAM";
64      private final static String SPAM = "SPAM";
65      private DataSourceSelector selector;
66      private DataSourceComponent component;
67      private String repos;
68      private String sqlFileUrl;
69      private FileSystem fileSystem;
70      
71      /**
72       * @see org.apache.avalon.framework.service.Serviceable#service(ServiceManager)
73       */
74      public void service(ServiceManager arg0) throws ServiceException {
75          DataSourceSelector selector = (DataSourceSelector) arg0.lookup(DataSourceSelector.ROLE);
76          setDataSourceSelector(selector);
77          setFileSystem((FileSystem) arg0.lookup(FileSystem.ROLE));
78      }
79  
80      /**
81       * Sets the file system service
82       * 
83       * @param system new service
84       */
85      private void setFileSystem(FileSystem system) {
86          this.fileSystem = system;
87      }
88  
89      /**
90       * @see org.apache.avalon.framework.activity.Initializable#initialize()
91       */
92      public void initialize() throws Exception {
93          if (repos != null) {
94              setDataSourceComponent((DataSourceComponent) selector.select(repos));
95              File sqlFile = fileSystem.getFile(sqlFileUrl);
96              analyzer.initSqlQueries(component.getConnection(), sqlFile.getAbsolutePath());
97          }
98      }
99  
100     /**
101      * @see org.apache.avalon.framework.configuration.Configurable#configure(Configuration)
102      */
103     public void configure(Configuration arg0) throws ConfigurationException {
104         Configuration reposPath = arg0.getChild("repositoryPath",false);
105         if (reposPath != null) {
106             setRepositoryPath(reposPath.getValue());
107         }
108         sqlFileUrl = arg0.getChild("sqlFile").getValue();
109         if (sqlFileUrl == null) sqlFileUrl = "file://conf/sqlResources.xml";
110     }
111     
112     /**
113      * Set the repository path 
114      * 
115      * @param repositoryPath Thre repositoryPath
116      */
117     public void setRepositoryPath(String repositoryPath) {
118         repos = repositoryPath.substring(5);
119     }
120     
121     /**
122      * Set the DatasourceSekector
123      * 
124      * @param selector The DataSourceSelector
125      */
126     public void setDataSourceSelector (DataSourceSelector selector) {
127         this.selector = selector;
128     }
129     
130     /**
131      * Set the DataSourceComponent
132      * 
133      * @param component The DataSourceComponent
134      */
135     public void setDataSourceComponent(DataSourceComponent component) {
136         this.component = component;
137     }
138     
139     /**
140      * @see org.apache.james.management.BayesianAnalyzerManagementService#addHamFromDir(String)
141      */
142     public int addHamFromDir(String dir) throws BayesianAnalyzerManagementException {
143         if (repos == null) throw new BayesianAnalyzerManagementException("RepositoryPath not configured");
144         
145         return feedBayesianAnalyzerFromDir(dir,HAM);
146     }
147 
148     /**
149      * @see org.apache.james.management.BayesianAnalyzerManagementService#addSpamFromDir(String)
150      */
151     public int addSpamFromDir(String dir) throws BayesianAnalyzerManagementException {
152         if (repos == null) throw new BayesianAnalyzerManagementException("RepositoryPath not configured");
153         
154         return feedBayesianAnalyzerFromDir(dir,SPAM);
155     }
156     
157     /**
158      * @see org.apache.james.management.BayesianAnalyzerManagementService#addHamFromMbox(String)
159      */
160     public int addHamFromMbox(String file) throws BayesianAnalyzerManagementException {
161         if (repos == null) throw new BayesianAnalyzerManagementException("RepositoryPath not configured");
162         return feedBayesianAnalyzerFromMbox(file,HAM);
163     }
164 
165     /**
166      * @see org.apache.james.management.BayesianAnalyzerManagementService#addSpamFromMbox(String)
167      */
168     public int addSpamFromMbox(String file) throws BayesianAnalyzerManagementException {
169         if (repos == null) throw new BayesianAnalyzerManagementException("RepositoryPath not configured");
170         return feedBayesianAnalyzerFromMbox(file,SPAM);
171     }
172 
173     /**
174      * Helper method to train the BayesianAnalysis from directory which contain mails
175      *
176      * @param dir The directory which contains the emails which should be used to feed the BayesianAnalysis
177      * @param type The type to train. HAM or SPAM
178      * @return count The count of trained messages
179      * @throws BayesianAnalyzerManagementException
180      * @throws IllegalArgumentException Get thrown if the directory is not valid
181      */
182     private int feedBayesianAnalyzerFromDir(String dir, String type) throws BayesianAnalyzerManagementException {
183 
184         //Clear out any existing word/counts etc..
185         analyzer.clear();
186 
187         File tmpFile = new File(dir);
188         int count = 0;
189 
190         synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) {
191 
192             // check if the provided dir is really a directory
193             if (tmpFile.isDirectory()) {
194                 File[] files = tmpFile.listFiles();
195 
196                 for (int i = 0; i < files.length; i++) {
197                     BufferedReader stream = null;
198                     try {
199                         stream = new BufferedReader(new FileReader(files[i]));
200                     } catch (FileNotFoundException e) {
201                         throw new BayesianAnalyzerManagementException("acessing mail file failed.", e);
202                     }
203                     addMailToCorpus(type, stream);
204                     count++;
205                 }
206 
207                 updateTokens(type);
208 
209             } else {
210                throw new IllegalArgumentException("Please provide an valid directory");
211             }
212         }
213 
214         return count;
215     }
216 
217     /**
218      * Update the tokens 
219      * 
220      * @param type The type whichs tokens should be updated. Valid types are HAM or SPAM
221      * @throws BayesianAnalyzerManagementException
222      */
223     private void updateTokens(String type) throws BayesianAnalyzerManagementException {
224         //Update storage statistics.
225         try {
226             Connection connection = component.getConnection();
227             if (type.equalsIgnoreCase(HAM)) {
228                 analyzer.updateHamTokens(connection);
229             } else if (type.equalsIgnoreCase(SPAM)) {
230                 analyzer.updateSpamTokens(connection);
231             }
232         } catch (SQLException e) {
233             throw new BayesianAnalyzerManagementException("updating tokens failed.", e);
234         }
235     }
236 
237     /**
238      * Add mail to corpus 
239      * 
240      * @param type The type to add to corpus. Valid types are HAM or SPAM
241      * @param stream The stream which is used to transfer the data
242      * @throws BayesianAnalyzerManagementException
243      */
244     private void addMailToCorpus(String type, BufferedReader stream) throws BayesianAnalyzerManagementException {
245         try {
246             if (type.equalsIgnoreCase(HAM)) {
247                 analyzer.addHam(stream);
248             } else if (type.equalsIgnoreCase(SPAM)) {
249                 analyzer.addSpam(stream);
250             }
251         } catch (IOException e) {
252             throw new BayesianAnalyzerManagementException("adding to corpus failed.", e);
253         }
254     }
255 
256 
257     /**
258      * Helper method to train the BayesianAnalysis from mbox file
259      *
260      * @param mboxFile The mbox file
261      * @param type The type to train. HAM or SPAM
262      * @return count The count of trained messages
263      * @throws BayesianAnalyzerManagementException
264      */
265     private int feedBayesianAnalyzerFromMbox(String mboxFile, String type) throws BayesianAnalyzerManagementException {
266         int count = 0;
267 
268         //Clear out any existing word/counts etc..
269         analyzer.clear();
270 
271         File tmpFile = new File(mboxFile);
272 
273         if (MboxFile.isValid(tmpFile)) {
274             MboxFile mbox = new MboxFile(tmpFile,MboxFile.READ_ONLY);
275 
276             synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) {
277                 int messageCount = 0;
278                 try {
279                     messageCount = mbox.getMessageCount();
280                 } catch (IOException e) {
281                     throw new BayesianAnalyzerManagementException(e);
282                 }
283                 for (int i = 0; i < messageCount; i++) {
284                     InputStream message = null;
285                     try {
286                         message = mbox.getMessageAsStream(i);
287                     } catch (IOException e) {
288                         throw new BayesianAnalyzerManagementException("could not access mail from mbox streanm", e);
289                     }
290                     BufferedReader stream = new BufferedReader(new InputStreamReader(message));
291                     addMailToCorpus(type, stream);
292                     count++;
293                 }
294 
295                 //Update storage statistics.
296                 updateTokens(type);
297             }
298         } else {
299             throw new IllegalArgumentException("Please provide an valid mbox file");
300         }
301 
302         return count;
303     }
304     
305     /**
306      * @see org.apache.james.management.BayesianAnalyzerManagementService#exportData(String)
307      */
308     public void exportData(String file) throws BayesianAnalyzerManagementException {
309         if (repos == null) throw new BayesianAnalyzerManagementException("RepositoryPath not configured");
310 
311         synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) {
312             try {
313                 analyzer.loadHamNSpam(component.getConnection());
314             } catch (SQLException e) {
315                 throw new BayesianAnalyzerManagementException("loading ham and spam failed.", e);
316             }
317 
318             int hamMessageCount = analyzer.getHamMessageCount();
319             int spamMessageCount = analyzer.getSpamMessageCount();
320             Map hamTokenCounts = analyzer.getHamTokenCounts();
321             Map spamTokenCounts = analyzer.getSpamTokenCounts();
322 
323             XStream xstream = new XStream(new DomDriver());
324             xstream.alias("bayesianAnalyzer", BayesianAnalyzerXml.class);
325             FileOutputStream fileOutputStream = null;
326             try {
327                 fileOutputStream = new FileOutputStream(file);
328             } catch (FileNotFoundException e) {
329                 throw new BayesianAnalyzerManagementException("opening export file failed", e);
330             }
331             PrintWriter printwriter = new PrintWriter(fileOutputStream);
332             printwriter.println(xstream.toXML(new BayesianAnalyzerXml(hamMessageCount,spamMessageCount,hamTokenCounts,spamTokenCounts)));
333             printwriter.close();
334         }
335     }
336     
337     /**
338      * @see org.apache.james.management.BayesianAnalyzerManagementService#importData(String)
339      */
340     public void importData(String file) throws BayesianAnalyzerManagementException {
341         if (repos == null) throw new BayesianAnalyzerManagementException("RepositoryPath not configured");
342 
343         synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK){
344             XStream xstream = new XStream(new DomDriver());
345 
346             FileReader fileReader = null;
347             try {
348                 fileReader = new FileReader(file);
349             } catch (FileNotFoundException e) {
350                 throw new BayesianAnalyzerManagementException("opening input file failed", e);
351             }
352             BayesianAnalyzerXml bAnalyzerXml = (BayesianAnalyzerXml) xstream.fromXML(fileReader);
353 
354             // clear old data
355             analyzer.clear();
356             analyzer.tokenCountsClear();
357 
358             //TODO: Drop old corpus in database;
359 
360             // add the new data
361             analyzer.setHamMessageCount(bAnalyzerXml.getHamMessageCount());
362             analyzer.setSpamMessageCount(bAnalyzerXml.getSpamMessageCount());
363             analyzer.setHamTokenCounts(bAnalyzerXml.getHamTokenCounts());
364             analyzer.setSpamTokenCounts(bAnalyzerXml.getSpamTokenCounts());
365             updateTokens(HAM);
366             updateTokens(SPAM);
367         }
368 
369     }
370     
371     private JDBCBayesianAnalyzer analyzer = new JDBCBayesianAnalyzer() {
372         protected void delegatedLog(String logString) {
373             // no logging
374         }
375     };
376     
377 
378     /**
379      * @see org.apache.james.management.BayesianAnalyzerManagementService#resetData()
380      */
381     public void resetData() throws BayesianAnalyzerManagementException {
382         synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) {
383             try {
384                 analyzer.resetData(component.getConnection());
385             } catch (SQLException e) {
386                 throw new BayesianAnalyzerManagementException(e.getMessage());
387             }
388         }
389     
390     }
391     
392     /**
393      * Inner class to represent the data in an xml file
394      */
395     private static class BayesianAnalyzerXml {
396         private int hamMessageCount = 0;
397         private int spamMessageCount = 0;
398         private Map hamTokenCounts;
399         private Map spamTokenCounts;
400     
401         /**
402          * Default Constructer
403          * 
404          * @param hamMessageCount the count of trained ham messages
405          * @param spamMessageCount the count of trained spam messages
406          * @param hamTokenCounts the count and tokens of trained ham  
407          * @param spamTokenCounts the count and tokens of trained spam
408          */
409         public BayesianAnalyzerXml(int hamMessageCount, int spamMessageCount, Map hamTokenCounts, Map spamTokenCounts) {
410             this.hamMessageCount = hamMessageCount;
411             this.spamMessageCount = spamMessageCount;
412             this.hamTokenCounts = hamTokenCounts;
413             this.spamTokenCounts = spamTokenCounts;
414         }
415     
416         /**
417          * Return the count of trained ham messages
418          * 
419          * @return hamMessageCount the count of trained ham messages 
420          */
421         public int getHamMessageCount() {
422             return hamMessageCount;
423         }
424     
425         /**
426          * Return the count of trained spam messages
427          * 
428          * @return spamMessageCount the count of trained spam messages
429          */
430         public int getSpamMessageCount() {
431             return spamMessageCount;
432         }
433     
434         /**
435          * Return a Map which contains the token as key and the count as value of trained ham messages
436          * 
437          * @return hamTokenCounts a Map which contains the tokens and counts
438          */
439         public Map getHamTokenCounts() {
440             return hamTokenCounts;
441         }
442     
443         /**
444          * Return a Map which contains the token as key and the count as value of trained spam messages
445          * 
446          * @return spamTokenCounts a Map which countains the tokens and counts
447          */
448         public Map getSpamTokenCounts() {
449             return spamTokenCounts;
450         }
451     
452     }
453 
454 }