BayesianAnalyzer xref

View Javadoc

1   /****************************************************************
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   ****************************************************************/
19  
20  
21  
22  package org.apache.james.util.bayesian;
23  
24  import java.util.Map;
25  import java.util.Set;
26  import java.util.SortedSet;
27  import java.util.TreeSet;
28  import java.util.HashMap;
29  import java.util.HashSet;
30  import java.util.Iterator;
31  import java.util.Collection;
32  import java.util.ArrayList;
33  
34  import java.io.IOException;
35  import java.io.Reader;
36  
37  /**
38   * <P>Determines probability that text contains Spam.</P>
39   *
40   * <P>Based upon Paul Grahams' <a href="http://www.paulgraham.com/spam.html">A Plan for Spam</a>.
41   * Extended to Paul Grahams' <a href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.</P>
42   *
43   * <P>Sample method usage:</P>
44   *
45   * <P>Use:
46   *   void addHam(Reader)
47   *   and
48   *   void addSpam(Reader)
49   *
50   *   methods to build up the Maps of ham & spam tokens/occurrences.
51   *   Both addHam and addSpam assume they're reading one message at a time,
52   *   if you feed more than one message per call, be sure to adjust the
53   *   appropriate message counter:  hamMessageCount or spamMessageCount.
54   *
55   *   Then...</P>
56   *
57   * <P>Use:
58   *   void buildCorpus()
59   *
60   *   to build the final token/probabilities Map.
61   *
62   *   Use your own methods for persistent storage of either the individual
63   *   ham/spam corpus & message counts, and/or the final corpus.
64   *
65   *   Then you can...</P>
66   *
67   * <P>Use:
68   *   double computeSpamProbability(Reader)
69   *
70   *   to determine the probability that a particular text contains spam.
71   *   A returned result of 0.9 or above is an indicator that the text was
72   *   spam.</P>
73   *
74   * <P>If you use persistent storage, use:
75   *   void setCorpus(Map)
76   *
77   * before calling computeSpamProbability.</P>
78   *
79   * @version CVS $Revision: 684527 $ $Date: 2008-08-10 16:53:29 +0100 (Sun, 10 Aug 2008) $
80   * @since 2.3.0
81   */
82  
83  public class BayesianAnalyzer {
84      
85      /**
86       * Number of "interesting" tokens to use to compute overall
87       * spamminess probability.
88       */
89      private final static int MAX_INTERESTING_TOKENS = 15;
90      
91      /**
92       * Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall
93       * spamminess probability.
94       */
95      private final static double INTERESTINGNESS_THRESHOLD = 0.46;
96      
97      /**
98       * Default token probability to use when a token has not been
99       * encountered before.
100      */
101     private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;
102     
103     /**
104      * Map of ham tokens and their occurrences.
105      *
106      * String key
107      * Integer value
108      */
109     private Map hamTokenCounts = new HashMap();
110     
111     /**
112      * Map of spam tokens and their occurrences.
113      *
114      * String key
115      * Integer value
116      */
117     private Map spamTokenCounts = new HashMap();
118     
119     /**
120      * Number of ham messages analyzed.
121      */
122     private int hamMessageCount = 0;
123     
124     /**
125      * Number of spam messages analyzed.
126      */
127     private int spamMessageCount = 0;
128     
129     /**
130      * Final token/probability corpus.
131      *
132      * String key
133      * Double value
134      */
135     private Map corpus = new HashMap();
136     
137     /**
138      * Inner class for managing Token Probability Strengths during the
139      * computeSpamProbability phase.
140      *
141      * By probability <i>strength</i> we mean the absolute distance of a
142      * probability from the middle value 0.5.
143      *
144      * It implements Comparable so that it's sorting is automatic.
145      */
146     private class TokenProbabilityStrength
147     implements Comparable {
148         /**
149          * Message token.
150          */
151         String token = null;
152         
153         /**
154          * Token's computed probability strength.
155          */
156         double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
157         
158         /**
159          * Force the natural sort order for this object to be high-to-low.
160          *
161          * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare
162          *                                this instance with.
163          *
164          * @return The result of the comparison (before, equal, after).
165          */
166         public final int compareTo(Object anotherTokenProbabilityStrength) {
167             int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);
168             if (result == 0) {
169                 return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);
170             } else {
171                 return result;
172             }
173         }
174         
175         /**
176          * Simple toString () implementation mostly for debugging purposes.
177          *
178          * @return String representation of this object.
179          */
180         public String toString() {
181             StringBuffer sb = new StringBuffer(30);
182             
183             sb.append(token)
184             .append("=")
185             .append(strength);
186             
187             return sb.toString();
188         }
189     }
190     
191     /**
192      * Basic class constructor.
193      */
194     public BayesianAnalyzer() {
195     }
196     
197     /**
198      * Public setter for the hamTokenCounts Map.
199      *
200      * @param hamTokenCounts The new ham Token counts Map.
201      */
202     public void setHamTokenCounts(Map hamTokenCounts) {
203         this.hamTokenCounts = hamTokenCounts;
204     }
205     
206     /**
207      * Public getter for the hamTokenCounts Map.
208      */
209     public Map getHamTokenCounts() {
210         return this.hamTokenCounts;
211     }
212     
213     /**
214      * Public setter for the spamTokenCounts Map.
215      *
216      * @param spamTokenCounts The new spam Token counts Map.
217      */
218     public void setSpamTokenCounts(Map spamTokenCounts) {
219         this.spamTokenCounts = spamTokenCounts;
220     }
221     
222     /**
223      * Public getter for the spamTokenCounts Map.
224      */
225     public Map getSpamTokenCounts() {
226         return this.spamTokenCounts;
227     }
228     
229     /**
230      * Public setter for spamMessageCount.
231      *
232      * @param spamMessageCount The new spam message count.
233      */
234     public void setSpamMessageCount(int spamMessageCount) {
235         this.spamMessageCount = spamMessageCount;
236     }
237     
238     /**
239      * Public getter for spamMessageCount.
240      */
241     public int getSpamMessageCount() {
242         return this.spamMessageCount;
243     }
244     
245     /**
246      * Public setter for hamMessageCount.
247      *
248      * @param hamMessageCount The new ham message count.
249      */
250     public void setHamMessageCount(int hamMessageCount) {
251         this.hamMessageCount = hamMessageCount;
252     }
253     
254     /**
255      * Public getter for hamMessageCount.
256      */
257     public int getHamMessageCount() {
258         return this.hamMessageCount;
259     }
260     
261     /**
262      * Clears all analysis repositories and counters.
263      */
264     public void clear() {
265         corpus.clear();
266         
267         tokenCountsClear();
268         
269         hamMessageCount = 0;
270         spamMessageCount = 0;
271     }
272     
273     /**
274      * Clears token counters.
275      */
276     public void tokenCountsClear() {
277         hamTokenCounts.clear();
278         spamTokenCounts.clear();
279     }
280     
281     /**
282      * Public setter for corpus.
283      *
284      * @param corpus The new corpus.
285      */
286     public void setCorpus(Map corpus) {
287         this.corpus = corpus;
288     }
289     
290     /**
291      * Public getter for corpus.
292      */
293     public Map getCorpus() {
294         return this.corpus;
295     }
296     
297     /**
298      * Builds the corpus from the existing ham & spam counts.
299      */
300     public void buildCorpus() {
301         //Combine the known ham & spam tokens.
302         Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
303         set.addAll(hamTokenCounts.keySet());
304         set.addAll(spamTokenCounts.keySet());
305         Map tempCorpus = new HashMap(set.size());
306         
307         //Iterate through all the tokens and compute their new
308         //individual probabilities.
309         Iterator i = set.iterator();
310         while (i.hasNext()) {
311             String token = (String) i.next();
312             tempCorpus.put(token, new Double(computeProbability(token)));
313         }
314         setCorpus(tempCorpus);
315     }
316     
317     /**
318      * Adds a message to the ham list.
319      * @param stream A reader stream on the ham message to analyze
320      * @throws IOException If any error occurs
321      */
322     public void addHam(Reader stream)
323     throws java.io.IOException {
324         addTokenOccurrences(stream, hamTokenCounts);
325         hamMessageCount++;
326     }
327     
328     /**
329      * Adds a message to the spam list.
330      * @param stream A reader stream on the spam message to analyze
331      * @throws IOException If any error occurs
332      */
333     public void addSpam(Reader stream)
334     throws java.io.IOException {
335         addTokenOccurrences(stream, spamTokenCounts);
336         spamMessageCount++;
337     }
338     
339     /**
340      * Computes the probability that the stream contains SPAM.
341      * @param stream The text to be analyzed for Spamminess.
342      * @return A 0.0 - 1.0 probability
343      * @throws IOException If any error occurs
344      */
345     public double computeSpamProbability(Reader stream)
346     throws java.io.IOException {
347         //Build a set of the tokens in the Stream.
348         Set tokens = parse(stream);
349         
350         // Get the corpus to use in this run
351         // A new corpus may be being built in the meantime
352         Map workCorpus = getCorpus();
353         
354         //Assign their probabilities from the Corpus (using an additional
355         //calculation to determine spamminess).
356         SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
357         
358         //Compute and return the overall probability that the
359         //stream is SPAM.
360         return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
361     }
362     
363     /**
364      * Parses a stream into tokens, and updates the target Map
365      * with the token/counts.
366      *
367      * @param stream
368      * @param target
369      */
370     private void addTokenOccurrences(Reader stream, Map target)
371     throws java.io.IOException {
372         String token;
373         String header = "";
374         
375         //Update target with the tokens/count encountered.
376         while ((token = nextToken(stream)) != null) {
377             boolean endingLine = false;
378             if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
379                 endingLine = true;
380                 token = token.substring(0, token.length() - 1);
381             }
382             
383             if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
384                 if (token.equals("From:")
385                 || token.equals("Return-Path:")
386                 || token.equals("Subject:")
387                 || token.equals("To:")
388                 ) {
389                     header = token;
390                     if (!endingLine) {
391                         continue;
392                     }
393                 }
394                 
395                 token = header + token;
396                 
397                 Integer value = null;
398                 
399                 if (target.containsKey(token)) {
400                     value = new Integer(((Integer) target.get(token)).intValue() + 1);
401                 } else {
402                     value = new Integer(1);
403                 }
404                 
405                 target.put(token, value);
406             }
407             
408             if (endingLine) {
409                 header = "";
410             }
411         }
412     }
413     
414     /**
415      * Parses a stream into tokens, and returns a Set of
416      * the unique tokens encountered.
417      *
418      * @param stream
419      * @return Set
420      */
421     private Set parse(Reader stream)
422     throws java.io.IOException {
423         Set tokens = new HashSet();
424         String token;
425         String header = "";
426         
427         //Build a Map of tokens encountered.
428         while ((token = nextToken(stream)) != null) {
429             boolean endingLine = false;
430             if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
431                 endingLine = true;
432                 token = token.substring(0, token.length() - 1);
433             }
434             
435             if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
436                 if (token.equals("From:")
437                 || token.equals("Return-Path:")
438                 || token.equals("Subject:")
439                 || token.equals("To:")
440                 ) {
441                     header = token;
442                     if (!endingLine) {
443                         continue;
444                     }
445                 }
446                 
447                 token = header + token;
448                 
449                 tokens.add(token);
450             }
451             
452             if (endingLine) {
453                 header = "";
454             }
455         }
456         
457         //Return the unique set of tokens encountered.
458         return tokens;
459     }
460     
461     private String nextToken(Reader reader) throws java.io.IOException {
462         StringBuffer token = new StringBuffer();
463         int i;
464         char ch, ch2;
465         boolean previousWasDigit = false;
466         boolean tokenCharFound = false;
467         
468         if (!reader.ready()) {
469             return null;
470         }
471         
472         while ((i = reader.read()) != -1) {
473             
474             ch = (char) i;
475             
476             if (ch == ':') {
477                 String tokenString = token.toString() + ':';
478                 if (tokenString.equals("From:")
479                 || tokenString.equals("Return-Path:")
480                 || tokenString.equals("Subject:")
481                 || tokenString.equals("To:")
482                 ) {
483                     return tokenString;
484                 }
485             }
486             
487             if (Character.isLetter(ch)
488             || ch == '-'
489             || ch == '$'
490             || ch == '\u20AC' // the EURO symbol
491             || ch == '!'
492             || ch == '\''
493             ) {
494                 tokenCharFound = true;
495                 previousWasDigit = false;
496                 token.append(ch);
497             } else if (Character.isDigit(ch)) {
498                 tokenCharFound = true;
499                 previousWasDigit = true;
500                 token.append(ch);
501             } else if (previousWasDigit && (ch == '.' || ch == ',')) {
502                 reader.mark(1);
503                 previousWasDigit = false;
504                 i = reader.read();
505                 if (i == -1) {
506                     break;
507                 }
508                 ch2 = (char) i;
509                 if (Character.isDigit(ch2)) {
510                     tokenCharFound = true;
511                     previousWasDigit = true;
512                     token.append(ch);
513                     token.append(ch2);
514                 } else {
515                     reader.reset();
516                     break;
517                 }
518             } else if (ch == '\r') {
519                 // cr found, ignore
520             } else if (ch == '\n') {
521                 // eol found
522                 tokenCharFound = true;
523                 previousWasDigit = false;
524                 token.append(ch);
525                 break;
526             } else if (tokenCharFound) {
527                 break;
528             }
529         }
530         
531         if (tokenCharFound) {
532             //          System.out.println("Token read: " + token);
533             return token.toString();
534         } else {
535             return null;
536         }
537     }
538     
539     /**
540      * Compute the probability that "token" is SPAM.
541      *
542      * @param token
543      * @return  The probability that the token occurs within spam.
544      */
545     private double computeProbability(String token) {
546         double hamFactor  = 0;
547         double spamFactor = 0;
548         
549         boolean foundInHam = false;
550         boolean foundInSpam = false;
551         
552         double minThreshold = 0.01;
553         double maxThreshold = 0.99;
554         
555         if (hamTokenCounts.containsKey(token)) {
556             foundInHam = true;
557         }
558         
559         if (spamTokenCounts.containsKey(token)) {
560             foundInSpam = true;
561         }
562         
563         if (foundInHam) {
564             hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();
565             if (!foundInSpam) {
566                 minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
567             }
568         }
569         
570         if (foundInSpam) {
571             spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();
572             if (!foundInHam) {
573                 maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
574             }
575         }
576         
577         if ((hamFactor + spamFactor) < 5) {
578             //This token hasn't been seen enough.
579             return 0.4;
580         }
581         
582         double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
583         double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
584         
585         return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
586     }
587     
588     /**
589      * Returns a SortedSet of TokenProbabilityStrength built from the
590      * Corpus and the tokens passed in the "tokens" Set.
591      * The ordering is from the highest strength to the lowest strength.
592      *
593      * @param tokens
594      * @param workCorpus
595      * @return  SortedSet of TokenProbabilityStrength objects.
596      */
597     private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
598         //Convert to a SortedSet of token probability strengths.
599         SortedSet tokenProbabilityStrengths = new TreeSet();
600         
601         Iterator i = tokens.iterator();
602         while (i.hasNext()) {
603             TokenProbabilityStrength tps = new TokenProbabilityStrength();
604             
605             tps.token = (String) i.next();
606             
607             if (workCorpus.containsKey(tps.token)) {
608                 tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
609             }
610             else {
611                 //This token has never been seen before,
612                 //we'll give it initially the default probability.
613                 Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
614                 tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
615                 boolean isTokenDegeneratedFound = false;
616                 
617                 Collection degeneratedTokens = buildDegenerated(tps.token);
618                 Iterator iDegenerated = degeneratedTokens.iterator();
619                 String tokenDegenerated = null;
620                 double strengthDegenerated;
621                 while (iDegenerated.hasNext()) {
622                     tokenDegenerated = (String) iDegenerated.next();
623                     if (workCorpus.containsKey(tokenDegenerated)) {
624                         Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
625                         strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
626                         if (strengthDegenerated > tps.strength) {
627                             isTokenDegeneratedFound = true;
628                             tps.strength = strengthDegenerated;
629                             corpusProbability = probabilityTemp;
630                         }
631                     }
632                 }
633                 // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
634                 if (isTokenDegeneratedFound) {
635                     synchronized(workCorpus) {
636                         workCorpus.put(tps.token, corpusProbability);
637                     }
638                 }
639             }
640             
641             tokenProbabilityStrengths.add(tps);
642         }
643         
644         return tokenProbabilityStrengths;
645     }
646     
647     private Collection buildDegenerated(String fullToken) {
648         ArrayList tokens = new ArrayList();
649         String header;
650         String token;
651         String tokenLower;
652         
653         // look for a header string termination
654         int headerEnd = fullToken.indexOf(':');
655         if (headerEnd >= 0) {
656             header = fullToken.substring(0, headerEnd);
657             token = fullToken.substring(headerEnd);
658         } else {
659             header = "";
660             token = fullToken;
661         }
662         
663         // prepare a version of the token containing all lower case (for performance reasons)
664         tokenLower = token.toLowerCase();
665         
666         int end = token.length();
667         do {
668             if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
669                 tokens.add(header + tokenLower.substring(0, end));
670                 if (header.length() > 0) {
671                     tokens.add(tokenLower.substring(0, end));
672                 }
673             }
674             if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
675                 tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
676                 if (header.length() > 0) {
677                     tokens.add(token.charAt(0) + tokenLower.substring(1, end));
678                 }
679             }
680             
681             if (token.charAt(end - 1) != '!') {
682                 break;
683             }
684             
685             end--;
686             
687             tokens.add(header + token.substring(0, end));
688             if (header.length() > 0) {
689                 tokens.add(token.substring(0, end));
690             }
691         } while (end > 0);
692         
693         return tokens;
694     }
695     
696     /**
697      * Compute the spamminess probability of the interesting tokens in
698      * the tokenProbabilities SortedSet.
699      *
700      * @param tokenProbabilityStrengths
701      * @param workCorpus
702      * @return  Computed spamminess.
703      */
704     private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
705         double p = 1.0;
706         double np = 1.0;
707         double tempStrength = 0.5;
708         int count = MAX_INTERESTING_TOKENS;
709         Iterator iterator = tokenProbabilityStrengths.iterator();
710         while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
711             TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next();
712             tempStrength = tps.strength;
713             
714             //      System.out.println(tps);
715             
716             double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
717             Double theDoubleObject = (Double) workCorpus.get(tps.token);
718             // if either the original token or a degeneration was found use the double value, otherwise use the default
719             if (theDoubleObject != null) {
720                 theDoubleValue = theDoubleObject.doubleValue();
721             }
722             p *= theDoubleValue;
723             np *= (1.0 - theDoubleValue);
724             //System.out.println("Token " + tps + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
725         }
726         
727         return (p / (p + np));
728     }
729     
730     private boolean allSameChar(String s) {
731         if (s.length() < 2) {
732             return false;
733         }
734         
735         char c = s.charAt(0);
736         
737         for (int i = 1; i < s.length(); i++) {
738             if (s.charAt(i) != c) {
739                 return false;
740             }
741         }
742         return true;
743     }
744     
745     private boolean allDigits(String s) {
746         for (int i = 0; i < s.length(); i++) {
747             if (!Character.isDigit(s.charAt(i))) {
748                 return false;
749             }
750         }
751         return true;
752     }
753 }
754