BayesianAnalyzer xref

View Javadoc

1   /************************************************************************
2    * Copyright (c) 2000-2006 The Apache Software Foundation.             *
3    * All rights reserved.                                                *
4    * ------------------------------------------------------------------- *
5    * Licensed under the Apache License, Version 2.0 (the "License"); you *
6    * may not use this file except in compliance with the License. You    *
7    * may obtain a copy of the License at:                                *
8    *                                                                     *
9    *     http://www.apache.org/licenses/LICENSE-2.0                      *
10   *                                                                     *
11   * Unless required by applicable law or agreed to in writing, software *
12   * distributed under the License is distributed on an "AS IS" BASIS,   *
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or     *
14   * implied.  See the License for the specific language governing       *
15   * permissions and limitations under the License.                      *
16   ***********************************************************************/
17  
18  package org.apache.james.util;
19  
20  import java.util.Map;
21  import java.util.Set;
22  import java.util.SortedSet;
23  import java.util.TreeSet;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.Collection;
28  import java.util.ArrayList;
29  
30  import java.io.Reader;
31  import java.io.StreamTokenizer;
32  import java.io.StringReader;
33  
34  /***
35   * <P>Determines probability that text contains Spam.</P>
36   *
37   * <P>Based upon Paul Grahams' <a href="http://www.paulgraham.com/spam.html">A Plan for Spam</a>.
38   * Extended to Paul Grahams' <a href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.</P>
39   *
40   * <P>Sample method usage:</P>
41   *
42   * <P>Use:
43   *   void addHam(Reader)
44   *   and
45   *   void addSpam(Reader)
46   *
47   *   methods to build up the Maps of ham & spam tokens/occurrences.
48   *   Both addHam and addSpam assume they're reading one message at a time,
49   *   if you feed more than one message per call, be sure to adjust the
50   *   appropriate message counter:  hamMessageCount or spamMessageCount.
51   *
52   *   Then...</P>
53   *
54   * <P>Use:
55   *   void buildCorpus()
56   *
57   *   to build the final token/probabilities Map.
58   *
59   *   Use your own methods for persistent storage of either the individual
60   *   ham/spam corpus & message counts, and/or the final corpus.
61   *
62   *   Then you can...</P>
63   *
64   * <P>Use:
65   *   double computeSpamProbability(Reader)
66   *
67   *   to determine the probability that a particular text contains spam.
68   *   A returned result of 0.9 or above is an indicator that the text was
69   *   spam.</P>
70   *
71   * <P>If you use persistent storage, use:
72   *   void setCorpus(Map)
73   *
74   * before calling computeSpamProbability.</P>
75   *
76   * @version CVS $Revision: $ $Date: $
77   * @since 2.3.0
78   */
79  
80  public class BayesianAnalyzer {
81      
82      /***
83       * Number of "interesting" tokens to use to compute overall
84       * spamminess probability.
85       */
86      private final static int MAX_INTERESTING_TOKENS = 15;
87      
88      /***
89       * Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall
90       * spamminess probability.
91       */
92      private final static double INTERESTINGNESS_THRESHOLD = 0.46;
93      
94      /***
95       * Default token probability to use when a token has not been
96       * encountered before.
97       */
98      private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;
99      
100     /***
101      * Map of ham tokens and their occurrences.
102      *
103      * String key
104      * Integer value
105      */
106     private Map hamTokenCounts = new HashMap();
107     
108     /***
109      * Map of spam tokens and their occurrences.
110      *
111      * String key
112      * Integer value
113      */
114     private Map spamTokenCounts = new HashMap();
115     
116     /***
117      * Number of ham messages analyzed.
118      */
119     private int hamMessageCount = 0;
120     
121     /***
122      * Number of spam messages analyzed.
123      */
124     private int spamMessageCount = 0;
125     
126     /***
127      * Final token/probability corpus.
128      *
129      * String key
130      * Double value
131      */
132     private Map corpus = new HashMap();
133     
134     /***
135      * Inner class for managing Token Probability Strengths during the
136      * computeSpamProbability phase.
137      *
138      * By probability <i>strength</i> we mean the absolute distance of a
139      * probability from the middle value 0.5.
140      *
141      * It implements Comparable so that it's sorting is automatic.
142      */
143     private class TokenProbabilityStrength
144     implements Comparable {
145         /***
146          * Message token.
147          */
148         String token = null;
149         
150         /***
151          * Token's computed probability strength.
152          */
153         double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
154         
155         /***
156          * Force the natural sort order for this object to be high-to-low.
157          *
158          * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare
159          *                                this instance with.
160          *
161          * @return The result of the comparison (before, equal, after).
162          */
163         public final int compareTo(Object anotherTokenProbabilityStrength) {
164             int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);
165             if (result == 0) {
166                 return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);
167             } else {
168                 return result;
169             }
170         }
171         
172         /***
173          * Simple toString () implementation mostly for debugging purposes.
174          *
175          * @return String representation of this object.
176          */
177         public String toString() {
178             StringBuffer sb = new StringBuffer(30);
179             
180             sb.append(token)
181             .append("=")
182             .append(strength);
183             
184             return sb.toString();
185         }
186     }
187     
188     /***
189      * Basic class constructor.
190      */
191     public BayesianAnalyzer() {
192     }
193     
194     /***
195      * Public setter for the hamTokenCounts Map.
196      *
197      * @param hamTokenCounts The new ham Token counts Map.
198      */
199     public void setHamTokenCounts(Map hamTokenCounts) {
200         this.hamTokenCounts = hamTokenCounts;
201     }
202     
203     /***
204      * Public getter for the hamTokenCounts Map.
205      */
206     public Map getHamTokenCounts() {
207         return this.hamTokenCounts;
208     }
209     
210     /***
211      * Public setter for the spamTokenCounts Map.
212      *
213      * @param spamTokenCounts The new spam Token counts Map.
214      */
215     public void setSpamTokenCounts(Map spamTokenCounts) {
216         this.spamTokenCounts = spamTokenCounts;
217     }
218     
219     /***
220      * Public getter for the spamTokenCounts Map.
221      */
222     public Map getSpamTokenCounts() {
223         return this.spamTokenCounts;
224     }
225     
226     /***
227      * Public setter for spamMessageCount.
228      *
229      * @param spamMessageCount The new spam message count.
230      */
231     public void setSpamMessageCount(int spamMessageCount) {
232         this.spamMessageCount = spamMessageCount;
233     }
234     
235     /***
236      * Public getter for spamMessageCount.
237      */
238     public int getSpamMessageCount() {
239         return this.spamMessageCount;
240     }
241     
242     /***
243      * Public setter for hamMessageCount.
244      *
245      * @param hamMessageCount The new ham message count.
246      */
247     public void setHamMessageCount(int hamMessageCount) {
248         this.hamMessageCount = hamMessageCount;
249     }
250     
251     /***
252      * Public getter for hamMessageCount.
253      */
254     public int getHamMessageCount() {
255         return this.hamMessageCount;
256     }
257     
258     /***
259      * Clears all analysis repositories and counters.
260      */
261     public void clear() {
262         corpus.clear();
263         
264         tokenCountsClear();
265         
266         hamMessageCount = 0;
267         spamMessageCount = 0;
268     }
269     
270     /***
271      * Clears token counters.
272      */
273     public void tokenCountsClear() {
274         hamTokenCounts.clear();
275         spamTokenCounts.clear();
276     }
277     
278     /***
279      * Public setter for corpus.
280      *
281      * @param corpus The new corpus.
282      */
283     public void setCorpus(Map corpus) {
284         this.corpus = corpus;
285     }
286     
287     /***
288      * Public getter for corpus.
289      */
290     public Map getCorpus() {
291         return this.corpus;
292     }
293     
294     /***
295      * Builds the corpus from the existing ham & spam counts.
296      */
297     public void buildCorpus() {
298         //Combine the known ham & spam tokens.
299         Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
300         set.addAll(hamTokenCounts.keySet());
301         set.addAll(spamTokenCounts.keySet());
302         Map tempCorpus = new HashMap(set.size());
303         
304         //Iterate through all the tokens and compute their new
305         //individual probabilities.
306         Iterator i = set.iterator();
307         while (i.hasNext()) {
308             String token = (String) i.next();
309             tempCorpus.put(token, new Double(computeProbability(token)));
310         }
311         setCorpus(tempCorpus);
312     }
313     
314     /***
315      * Adds a message to the ham list.
316      * @param stream A reader stream on the ham message to analyze
317      * @throws IOException If any error occurs
318      */
319     public void addHam(Reader stream)
320     throws java.io.IOException {
321         addTokenOccurrences(stream, hamTokenCounts);
322         hamMessageCount++;
323     }
324     
325     /***
326      * Adds a message to the spam list.
327      * @param stream A reader stream on the spam message to analyze
328      * @throws IOException If any error occurs
329      */
330     public void addSpam(Reader stream)
331     throws java.io.IOException {
332         addTokenOccurrences(stream, spamTokenCounts);
333         spamMessageCount++;
334     }
335     
336     /***
337      * Computes the probability that the stream contains SPAM.
338      * @param stream The text to be analyzed for Spamminess.
339      * @return A 0.0 - 1.0 probability
340      * @throws IOException If any error occurs
341      */
342     public double computeSpamProbability(Reader stream)
343     throws java.io.IOException {
344         //Build a set of the tokens in the Stream.
345         Set tokens = parse(stream);
346         
347         // Get the corpus to use in this run
348         // A new corpus may be being built in the meantime
349         Map workCorpus = getCorpus();
350         
351         //Assign their probabilities from the Corpus (using an additional
352         //calculation to determine spamminess).
353         SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
354         
355         //Compute and return the overall probability that the
356         //stream is SPAM.
357         return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
358     }
359     
360     /***
361      * Parses a stream into tokens, and updates the target Map
362      * with the token/counts.
363      *
364      * @param stream
365      * @param target
366      */
367     private void addTokenOccurrences(Reader stream, Map target)
368     throws java.io.IOException {
369         String token;
370         String header = "";
371         
372         //Update target with the tokens/count encountered.
373         while ((token = nextToken(stream)) != null) {
374             boolean endingLine = false;
375             if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
376                 endingLine = true;
377                 token = token.substring(0, token.length() - 1);
378             }
379             
380             if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
381                 if (token.equals("From:")
382                 || token.equals("Return-Path:")
383                 || token.equals("Subject:")
384                 || token.equals("To:")
385                 ) {
386                     header = token;
387                     if (!endingLine) {
388                         continue;
389                     }
390                 }
391                 
392                 token = header + token;
393                 
394                 Integer value = null;
395                 
396                 if (target.containsKey(token)) {
397                     value = new Integer(((Integer) target.get(token)).intValue() + 1);
398                 } else {
399                     value = new Integer(1);
400                 }
401                 
402                 target.put(token, value);
403             }
404             
405             if (endingLine) {
406                 header = "";
407             }
408         }
409     }
410     
411     /***
412      * Parses a stream into tokens, and returns a Set of
413      * the unique tokens encountered.
414      *
415      * @param stream
416      * @return Set
417      */
418     private Set parse(Reader stream)
419     throws java.io.IOException {
420         Set tokens = new HashSet();
421         String token;
422         String header = "";
423         
424         //Build a Map of tokens encountered.
425         while ((token = nextToken(stream)) != null) {
426             boolean endingLine = false;
427             if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
428                 endingLine = true;
429                 token = token.substring(0, token.length() - 1);
430             }
431             
432             if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
433                 if (token.equals("From:")
434                 || token.equals("Return-Path:")
435                 || token.equals("Subject:")
436                 || token.equals("To:")
437                 ) {
438                     header = token;
439                     if (!endingLine) {
440                         continue;
441                     }
442                 }
443                 
444                 token = header + token;
445                 
446                 tokens.add(token);
447             }
448             
449             if (endingLine) {
450                 header = "";
451             }
452         }
453         
454         //Return the unique set of tokens encountered.
455         return tokens;
456     }
457     
458     private String nextToken(Reader reader) throws java.io.IOException {
459         StringBuffer token = new StringBuffer();
460         int i;
461         char ch, ch2;
462         boolean previousWasDigit = false;
463         boolean tokenCharFound = false;
464         
465         if (!reader.ready()) {
466             return null;
467         }
468         
469         while ((i = reader.read()) != -1) {
470             
471             ch = (char) i;
472             
473             if (ch == ':') {
474                 String tokenString = token.toString() + ':';
475                 if (tokenString.equals("From:")
476                 || tokenString.equals("Return-Path:")
477                 || tokenString.equals("Subject:")
478                 || tokenString.equals("To:")
479                 ) {
480                     return tokenString;
481                 }
482             }
483             
484             if (Character.isLetter(ch)
485             || ch == '-'
486             || ch == '$'
487             || ch == '\u20AC' // the EURO symbol
488             || ch == '!'
489             || ch == '\''
490             ) {
491                 tokenCharFound = true;
492                 previousWasDigit = false;
493                 token.append(ch);
494             } else if (Character.isDigit(ch)) {
495                 tokenCharFound = true;
496                 previousWasDigit = true;
497                 token.append(ch);
498             } else if (previousWasDigit && (ch == '.' || ch == ',')) {
499                 reader.mark(1);
500                 previousWasDigit = false;
501                 i = reader.read();
502                 if (i == -1) {
503                     break;
504                 }
505                 ch2 = (char) i;
506                 if (Character.isDigit(ch2)) {
507                     tokenCharFound = true;
508                     previousWasDigit = true;
509                     token.append(ch);
510                     token.append(ch2);
511                 } else {
512                     reader.reset();
513                     break;
514                 }
515             } else if (ch == '\r') {
516                 // cr found, ignore
517             } else if (ch == '\n') {
518                 // eol found
519                 tokenCharFound = true;
520                 previousWasDigit = false;
521                 token.append(ch);
522                 break;
523             } else if (tokenCharFound) {
524                 break;
525             }
526         }
527         
528         if (tokenCharFound) {
529             //          System.out.println("Token read: " + token);
530             return token.toString();
531         } else {
532             return null;
533         }
534     }
535     
536     /***
537      * Compute the probability that "token" is SPAM.
538      *
539      * @param token
540      * @return  The probability that the token occurs within spam.
541      */
542     private double computeProbability(String token) {
543         double hamFactor  = 0;
544         double spamFactor = 0;
545         
546         boolean foundInHam = false;
547         boolean foundInSpam = false;
548         
549         double minThreshold = 0.01;
550         double maxThreshold = 0.99;
551         
552         if (hamTokenCounts.containsKey(token)) {
553             foundInHam = true;
554         }
555         
556         if (spamTokenCounts.containsKey(token)) {
557             foundInSpam = true;
558         }
559         
560         if (foundInHam) {
561             hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();
562             if (!foundInSpam) {
563                 minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
564             }
565         }
566         
567         if (foundInSpam) {
568             spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();
569             if (!foundInHam) {
570                 maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
571             }
572         }
573         
574         if ((hamFactor + spamFactor) < 5) {
575             //This token hasn't been seen enough.
576             return 0.4;
577         }
578         
579         double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
580         double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
581         
582         return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
583     }
584     
585     /***
586      * Returns a SortedSet of TokenProbabilityStrength built from the
587      * Corpus and the tokens passed in the "tokens" Set.
588      * The ordering is from the highest strength to the lowest strength.
589      *
590      * @param tokens
591      * @param workCorpus
592      * @return  SortedSet of TokenProbabilityStrength objects.
593      */
594     private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
595         //Convert to a SortedSet of token probability strengths.
596         SortedSet tokenProbabilityStrengths = new TreeSet();
597         
598         Iterator i = tokens.iterator();
599         while (i.hasNext()) {
600             TokenProbabilityStrength tps = new TokenProbabilityStrength();
601             
602             tps.token = (String) i.next();
603             
604             if (workCorpus.containsKey(tps.token)) {
605                 tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
606             }
607             else {
608                 //This token has never been seen before,
609                 //we'll give it initially the default probability.
610                 Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
611                 tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
612                 boolean isTokenDegeneratedFound = false;
613                 
614                 Collection degeneratedTokens = buildDegenerated(tps.token);
615                 Iterator iDegenerated = degeneratedTokens.iterator();
616                 String tokenDegenerated = null;
617                 double strengthDegenerated;
618                 while (iDegenerated.hasNext()) {
619                     tokenDegenerated = (String) iDegenerated.next();
620                     if (workCorpus.containsKey(tokenDegenerated)) {
621                         Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
622                         strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
623                         if (strengthDegenerated > tps.strength) {
624                             isTokenDegeneratedFound = true;
625                             tps.strength = strengthDegenerated;
626                             corpusProbability = probabilityTemp;
627                         }
628                     }
629                 }
630                 // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
631                 if (isTokenDegeneratedFound) {
632                     synchronized(workCorpus) {
633                         workCorpus.put(tps.token, corpusProbability);
634                     }
635                 }
636             }
637             
638             tokenProbabilityStrengths.add(tps);
639         }
640         
641         return tokenProbabilityStrengths;
642     }
643     
644     private Collection buildDegenerated(String fullToken) {
645         ArrayList tokens = new ArrayList();
646         String header;
647         String token;
648         
649         // look for a header string termination
650         int headerEnd = fullToken.indexOf(':');
651         if (headerEnd >= 0) {
652             header = fullToken.substring(0, headerEnd);
653             token = fullToken.substring(headerEnd);
654         } else {
655             header = "";
656             token = fullToken;
657         }
658         
659         int end = token.length();
660         do {
661             if (!token.substring(0, end).equals(token.substring(0, end).toLowerCase())) {
662                 tokens.add(header + token.substring(0, end).toLowerCase());
663                 if (header.length() > 0) {
664                     tokens.add(token.substring(0, end).toLowerCase());
665                 }
666             }
667             if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
668                 tokens.add(header + token.charAt(0) + token.substring(1, end).toLowerCase());
669                 if (header.length() > 0) {
670                     tokens.add(token.charAt(0) + token.substring(1, end).toLowerCase());
671                 }
672             }
673             
674             if (token.charAt(end - 1) != '!') {
675                 break;
676             }
677             
678             end--;
679             
680             tokens.add(header + token.substring(0, end));
681             if (header.length() > 0) {
682                 tokens.add(token.substring(0, end));
683             }
684         } while (end > 0);
685         
686         return tokens;
687     }
688     
689     /***
690      * Compute the spamminess probability of the interesting tokens in
691      * the tokenProbabilities SortedSet.
692      *
693      * @param tokenProbabilities
694      * @param workCorpus
695      * @return  Computed spamminess.
696      */
697     private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
698         double p = 1.0;
699         double np = 1.0;
700         double tempStrength = 0.5;
701         int count = MAX_INTERESTING_TOKENS;
702         Iterator iterator = tokenProbabilityStrengths.iterator();
703         while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
704             TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next();
705             tempStrength = tps.strength;
706             
707             //      System.out.println(tps);
708             
709             double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
710             Double theDoubleObject = (Double) workCorpus.get(tps.token);
711             // if either the original token or a degeneration was found use the double value, otherwise use the default
712             if (theDoubleObject != null) {
713                 theDoubleValue = theDoubleObject.doubleValue();
714             }
715             p *= theDoubleValue;
716             np *= (1.0 - theDoubleValue);
717             // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
718         }
719         
720         return (p / (p + np));
721     }
722     
723     private boolean allSameChar(String s) {
724         if (s.length() < 2) {
725             return false;
726         }
727         
728         char c = s.charAt(0);
729         
730         for (int i = 1; i < s.length(); i++) {
731             if (s.charAt(i) != c) {
732                 return false;
733             }
734         }
735         return true;
736     }
737     
738     private boolean allDigits(String s) {
739         for (int i = 0; i < s.length(); i++) {
740             if (!Character.isDigit(s.charAt(i))) {
741                 return false;
742             }
743         }
744         return true;
745     }
746 }
747