BayesianAnalyzer xref

View Javadoc

1   /*****************************************************************
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   ****************************************************************/
19  
20  package org.apache.james.util;
21  
22  import java.util.Map;
23  import java.util.Set;
24  import java.util.SortedSet;
25  import java.util.TreeSet;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.Iterator;
29  import java.util.Collection;
30  import java.util.ArrayList;
31  
32  import java.io.Reader;
33  import java.io.StreamTokenizer;
34  import java.io.StringReader;
35  
36  /***
37   * <P>Determines probability that text contains Spam.</P>
38   *
39   * <P>Based upon Paul Grahams' <a href="http://www.paulgraham.com/spam.html">A Plan for Spam</a>.
40   * Extended to Paul Grahams' <a href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.</P>
41   *
42   * <P>Sample method usage:</P>
43   *
44   * <P>Use:
45   *   void addHam(Reader)
46   *   and
47   *   void addSpam(Reader)
48   *
49   *   methods to build up the Maps of ham & spam tokens/occurrences.
50   *   Both addHam and addSpam assume they're reading one message at a time,
51   *   if you feed more than one message per call, be sure to adjust the
52   *   appropriate message counter:  hamMessageCount or spamMessageCount.
53   *
54   *   Then...</P>
55   *
56   * <P>Use:
57   *   void buildCorpus()
58   *
59   *   to build the final token/probabilities Map.
60   *
61   *   Use your own methods for persistent storage of either the individual
62   *   ham/spam corpus & message counts, and/or the final corpus.
63   *
64   *   Then you can...</P>
65   *
66   * <P>Use:
67   *   double computeSpamProbability(Reader)
68   *
69   *   to determine the probability that a particular text contains spam.
70   *   A returned result of 0.9 or above is an indicator that the text was
71   *   spam.</P>
72   *
73   * <P>If you use persistent storage, use:
74   *   void setCorpus(Map)
75   *
76   * before calling computeSpamProbability.</P>
77   *
78   * @version CVS $Revision: $ $Date: $
79   * @since 2.3.0
80   */
81  
82  public class BayesianAnalyzer {
83      
84      /***
85       * Number of "interesting" tokens to use to compute overall
86       * spamminess probability.
87       */
88      private final static int MAX_INTERESTING_TOKENS = 15;
89      
90      /***
91       * Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall
92       * spamminess probability.
93       */
94      private final static double INTERESTINGNESS_THRESHOLD = 0.46;
95      
96      /***
97       * Default token probability to use when a token has not been
98       * encountered before.
99       */
100     private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;
101     
102     /***
103      * Map of ham tokens and their occurrences.
104      *
105      * String key
106      * Integer value
107      */
108     private Map hamTokenCounts = new HashMap();
109     
110     /***
111      * Map of spam tokens and their occurrences.
112      *
113      * String key
114      * Integer value
115      */
116     private Map spamTokenCounts = new HashMap();
117     
118     /***
119      * Number of ham messages analyzed.
120      */
121     private int hamMessageCount = 0;
122     
123     /***
124      * Number of spam messages analyzed.
125      */
126     private int spamMessageCount = 0;
127     
128     /***
129      * Final token/probability corpus.
130      *
131      * String key
132      * Double value
133      */
134     private Map corpus = new HashMap();
135     
136     /***
137      * Inner class for managing Token Probability Strengths during the
138      * computeSpamProbability phase.
139      *
140      * By probability <i>strength</i> we mean the absolute distance of a
141      * probability from the middle value 0.5.
142      *
143      * It implements Comparable so that it's sorting is automatic.
144      */
145     private class TokenProbabilityStrength
146     implements Comparable {
147         /***
148          * Message token.
149          */
150         String token = null;
151         
152         /***
153          * Token's computed probability strength.
154          */
155         double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
156         
157         /***
158          * Force the natural sort order for this object to be high-to-low.
159          *
160          * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare
161          *                                this instance with.
162          *
163          * @return The result of the comparison (before, equal, after).
164          */
165         public final int compareTo(Object anotherTokenProbabilityStrength) {
166             int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);
167             if (result == 0) {
168                 return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);
169             } else {
170                 return result;
171             }
172         }
173         
174         /***
175          * Simple toString () implementation mostly for debugging purposes.
176          *
177          * @return String representation of this object.
178          */
179         public String toString() {
180             StringBuffer sb = new StringBuffer(30);
181             
182             sb.append(token)
183             .append("=")
184             .append(strength);
185             
186             return sb.toString();
187         }
188     }
189     
190     /***
191      * Basic class constructor.
192      */
193     public BayesianAnalyzer() {
194     }
195     
196     /***
197      * Public setter for the hamTokenCounts Map.
198      *
199      * @param hamTokenCounts The new ham Token counts Map.
200      */
201     public void setHamTokenCounts(Map hamTokenCounts) {
202         this.hamTokenCounts = hamTokenCounts;
203     }
204     
205     /***
206      * Public getter for the hamTokenCounts Map.
207      */
208     public Map getHamTokenCounts() {
209         return this.hamTokenCounts;
210     }
211     
212     /***
213      * Public setter for the spamTokenCounts Map.
214      *
215      * @param spamTokenCounts The new spam Token counts Map.
216      */
217     public void setSpamTokenCounts(Map spamTokenCounts) {
218         this.spamTokenCounts = spamTokenCounts;
219     }
220     
221     /***
222      * Public getter for the spamTokenCounts Map.
223      */
224     public Map getSpamTokenCounts() {
225         return this.spamTokenCounts;
226     }
227     
228     /***
229      * Public setter for spamMessageCount.
230      *
231      * @param spamMessageCount The new spam message count.
232      */
233     public void setSpamMessageCount(int spamMessageCount) {
234         this.spamMessageCount = spamMessageCount;
235     }
236     
237     /***
238      * Public getter for spamMessageCount.
239      */
240     public int getSpamMessageCount() {
241         return this.spamMessageCount;
242     }
243     
244     /***
245      * Public setter for hamMessageCount.
246      *
247      * @param hamMessageCount The new ham message count.
248      */
249     public void setHamMessageCount(int hamMessageCount) {
250         this.hamMessageCount = hamMessageCount;
251     }
252     
253     /***
254      * Public getter for hamMessageCount.
255      */
256     public int getHamMessageCount() {
257         return this.hamMessageCount;
258     }
259     
260     /***
261      * Clears all analysis repositories and counters.
262      */
263     public void clear() {
264         corpus.clear();
265         
266         tokenCountsClear();
267         
268         hamMessageCount = 0;
269         spamMessageCount = 0;
270     }
271     
272     /***
273      * Clears token counters.
274      */
275     public void tokenCountsClear() {
276         hamTokenCounts.clear();
277         spamTokenCounts.clear();
278     }
279     
280     /***
281      * Public setter for corpus.
282      *
283      * @param corpus The new corpus.
284      */
285     public void setCorpus(Map corpus) {
286         this.corpus = corpus;
287     }
288     
289     /***
290      * Public getter for corpus.
291      */
292     public Map getCorpus() {
293         return this.corpus;
294     }
295     
296     /***
297      * Builds the corpus from the existing ham & spam counts.
298      */
299     public void buildCorpus() {
300         //Combine the known ham & spam tokens.
301         Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
302         set.addAll(hamTokenCounts.keySet());
303         set.addAll(spamTokenCounts.keySet());
304         Map tempCorpus = new HashMap(set.size());
305         
306         //Iterate through all the tokens and compute their new
307         //individual probabilities.
308         Iterator i = set.iterator();
309         while (i.hasNext()) {
310             String token = (String) i.next();
311             tempCorpus.put(token, new Double(computeProbability(token)));
312         }
313         setCorpus(tempCorpus);
314     }
315     
316     /***
317      * Adds a message to the ham list.
318      * @param stream A reader stream on the ham message to analyze
319      * @throws IOException If any error occurs
320      */
321     public void addHam(Reader stream)
322     throws java.io.IOException {
323         addTokenOccurrences(stream, hamTokenCounts);
324         hamMessageCount++;
325     }
326     
327     /***
328      * Adds a message to the spam list.
329      * @param stream A reader stream on the spam message to analyze
330      * @throws IOException If any error occurs
331      */
332     public void addSpam(Reader stream)
333     throws java.io.IOException {
334         addTokenOccurrences(stream, spamTokenCounts);
335         spamMessageCount++;
336     }
337     
338     /***
339      * Computes the probability that the stream contains SPAM.
340      * @param stream The text to be analyzed for Spamminess.
341      * @return A 0.0 - 1.0 probability
342      * @throws IOException If any error occurs
343      */
344     public double computeSpamProbability(Reader stream)
345     throws java.io.IOException {
346         //Build a set of the tokens in the Stream.
347         Set tokens = parse(stream);
348         
349         // Get the corpus to use in this run
350         // A new corpus may be being built in the meantime
351         Map workCorpus = getCorpus();
352         
353         //Assign their probabilities from the Corpus (using an additional
354         //calculation to determine spamminess).
355         SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
356         
357         //Compute and return the overall probability that the
358         //stream is SPAM.
359         return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
360     }
361     
362     /***
363      * Parses a stream into tokens, and updates the target Map
364      * with the token/counts.
365      *
366      * @param stream
367      * @param target
368      */
369     private void addTokenOccurrences(Reader stream, Map target)
370     throws java.io.IOException {
371         String token;
372         String header = "";
373         
374         //Update target with the tokens/count encountered.
375         while ((token = nextToken(stream)) != null) {
376             boolean endingLine = false;
377             if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
378                 endingLine = true;
379                 token = token.substring(0, token.length() - 1);
380             }
381             
382             if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
383                 if (token.equals("From:")
384                 || token.equals("Return-Path:")
385                 || token.equals("Subject:")
386                 || token.equals("To:")
387                 ) {
388                     header = token;
389                     if (!endingLine) {
390                         continue;
391                     }
392                 }
393                 
394                 token = header + token;
395                 
396                 Integer value = null;
397                 
398                 if (target.containsKey(token)) {
399                     value = new Integer(((Integer) target.get(token)).intValue() + 1);
400                 } else {
401                     value = new Integer(1);
402                 }
403                 
404                 target.put(token, value);
405             }
406             
407             if (endingLine) {
408                 header = "";
409             }
410         }
411     }
412     
413     /***
414      * Parses a stream into tokens, and returns a Set of
415      * the unique tokens encountered.
416      *
417      * @param stream
418      * @return Set
419      */
420     private Set parse(Reader stream)
421     throws java.io.IOException {
422         Set tokens = new HashSet();
423         String token;
424         String header = "";
425         
426         //Build a Map of tokens encountered.
427         while ((token = nextToken(stream)) != null) {
428             boolean endingLine = false;
429             if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
430                 endingLine = true;
431                 token = token.substring(0, token.length() - 1);
432             }
433             
434             if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
435                 if (token.equals("From:")
436                 || token.equals("Return-Path:")
437                 || token.equals("Subject:")
438                 || token.equals("To:")
439                 ) {
440                     header = token;
441                     if (!endingLine) {
442                         continue;
443                     }
444                 }
445                 
446                 token = header + token;
447                 
448                 tokens.add(token);
449             }
450             
451             if (endingLine) {
452                 header = "";
453             }
454         }
455         
456         //Return the unique set of tokens encountered.
457         return tokens;
458     }
459     
460     private String nextToken(Reader reader) throws java.io.IOException {
461         StringBuffer token = new StringBuffer();
462         int i;
463         char ch, ch2;
464         boolean previousWasDigit = false;
465         boolean tokenCharFound = false;
466         
467         if (!reader.ready()) {
468             return null;
469         }
470         
471         while ((i = reader.read()) != -1) {
472             
473             ch = (char) i;
474             
475             if (ch == ':') {
476                 String tokenString = token.toString() + ':';
477                 if (tokenString.equals("From:")
478                 || tokenString.equals("Return-Path:")
479                 || tokenString.equals("Subject:")
480                 || tokenString.equals("To:")
481                 ) {
482                     return tokenString;
483                 }
484             }
485             
486             if (Character.isLetter(ch)
487             || ch == '-'
488             || ch == '$'
489             || ch == '\u20AC' // the EURO symbol
490             || ch == '!'
491             || ch == '\''
492             ) {
493                 tokenCharFound = true;
494                 previousWasDigit = false;
495                 token.append(ch);
496             } else if (Character.isDigit(ch)) {
497                 tokenCharFound = true;
498                 previousWasDigit = true;
499                 token.append(ch);
500             } else if (previousWasDigit && (ch == '.' || ch == ',')) {
501                 reader.mark(1);
502                 previousWasDigit = false;
503                 i = reader.read();
504                 if (i == -1) {
505                     break;
506                 }
507                 ch2 = (char) i;
508                 if (Character.isDigit(ch2)) {
509                     tokenCharFound = true;
510                     previousWasDigit = true;
511                     token.append(ch);
512                     token.append(ch2);
513                 } else {
514                     reader.reset();
515                     break;
516                 }
517             } else if (ch == '\r') {
518                 // cr found, ignore
519             } else if (ch == '\n') {
520                 // eol found
521                 tokenCharFound = true;
522                 previousWasDigit = false;
523                 token.append(ch);
524                 break;
525             } else if (tokenCharFound) {
526                 break;
527             }
528         }
529         
530         if (tokenCharFound) {
531             //          System.out.println("Token read: " + token);
532             return token.toString();
533         } else {
534             return null;
535         }
536     }
537     
538     /***
539      * Compute the probability that "token" is SPAM.
540      *
541      * @param token
542      * @return  The probability that the token occurs within spam.
543      */
544     private double computeProbability(String token) {
545         double hamFactor  = 0;
546         double spamFactor = 0;
547         
548         boolean foundInHam = false;
549         boolean foundInSpam = false;
550         
551         double minThreshold = 0.01;
552         double maxThreshold = 0.99;
553         
554         if (hamTokenCounts.containsKey(token)) {
555             foundInHam = true;
556         }
557         
558         if (spamTokenCounts.containsKey(token)) {
559             foundInSpam = true;
560         }
561         
562         if (foundInHam) {
563             hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();
564             if (!foundInSpam) {
565                 minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
566             }
567         }
568         
569         if (foundInSpam) {
570             spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();
571             if (!foundInHam) {
572                 maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
573             }
574         }
575         
576         if ((hamFactor + spamFactor) < 5) {
577             //This token hasn't been seen enough.
578             return 0.4;
579         }
580         
581         double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
582         double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
583         
584         return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
585     }
586     
587     /***
588      * Returns a SortedSet of TokenProbabilityStrength built from the
589      * Corpus and the tokens passed in the "tokens" Set.
590      * The ordering is from the highest strength to the lowest strength.
591      *
592      * @param tokens
593      * @param workCorpus
594      * @return  SortedSet of TokenProbabilityStrength objects.
595      */
596     private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
597         //Convert to a SortedSet of token probability strengths.
598         SortedSet tokenProbabilityStrengths = new TreeSet();
599         
600         Iterator i = tokens.iterator();
601         while (i.hasNext()) {
602             TokenProbabilityStrength tps = new TokenProbabilityStrength();
603             
604             tps.token = (String) i.next();
605             
606             if (workCorpus.containsKey(tps.token)) {
607                 tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
608             }
609             else {
610                 //This token has never been seen before,
611                 //we'll give it initially the default probability.
612                 Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
613                 tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
614                 boolean isTokenDegeneratedFound = false;
615                 
616                 Collection degeneratedTokens = buildDegenerated(tps.token);
617                 Iterator iDegenerated = degeneratedTokens.iterator();
618                 String tokenDegenerated = null;
619                 double strengthDegenerated;
620                 while (iDegenerated.hasNext()) {
621                     tokenDegenerated = (String) iDegenerated.next();
622                     if (workCorpus.containsKey(tokenDegenerated)) {
623                         Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
624                         strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
625                         if (strengthDegenerated > tps.strength) {
626                             isTokenDegeneratedFound = true;
627                             tps.strength = strengthDegenerated;
628                             corpusProbability = probabilityTemp;
629                         }
630                     }
631                 }
632                 // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
633                 if (isTokenDegeneratedFound) {
634                     synchronized(workCorpus) {
635                         workCorpus.put(tps.token, corpusProbability);
636                     }
637                 }
638             }
639             
640             tokenProbabilityStrengths.add(tps);
641         }
642         
643         return tokenProbabilityStrengths;
644     }
645     
646     private Collection buildDegenerated(String fullToken) {
647         ArrayList tokens = new ArrayList();
648         String header;
649         String token;
650         
651         // look for a header string termination
652         int headerEnd = fullToken.indexOf(':');
653         if (headerEnd >= 0) {
654             header = fullToken.substring(0, headerEnd);
655             token = fullToken.substring(headerEnd);
656         } else {
657             header = "";
658             token = fullToken;
659         }
660         
661         int end = token.length();
662         do {
663             if (!token.substring(0, end).equals(token.substring(0, end).toLowerCase())) {
664                 tokens.add(header + token.substring(0, end).toLowerCase());
665                 if (header.length() > 0) {
666                     tokens.add(token.substring(0, end).toLowerCase());
667                 }
668             }
669             if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
670                 tokens.add(header + token.charAt(0) + token.substring(1, end).toLowerCase());
671                 if (header.length() > 0) {
672                     tokens.add(token.charAt(0) + token.substring(1, end).toLowerCase());
673                 }
674             }
675             
676             if (token.charAt(end - 1) != '!') {
677                 break;
678             }
679             
680             end--;
681             
682             tokens.add(header + token.substring(0, end));
683             if (header.length() > 0) {
684                 tokens.add(token.substring(0, end));
685             }
686         } while (end > 0);
687         
688         return tokens;
689     }
690     
691     /***
692      * Compute the spamminess probability of the interesting tokens in
693      * the tokenProbabilities SortedSet.
694      *
695      * @param tokenProbabilities
696      * @param workCorpus
697      * @return  Computed spamminess.
698      */
699     private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
700         double p = 1.0;
701         double np = 1.0;
702         double tempStrength = 0.5;
703         int count = MAX_INTERESTING_TOKENS;
704         Iterator iterator = tokenProbabilityStrengths.iterator();
705         while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
706             TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next();
707             tempStrength = tps.strength;
708             
709             //      System.out.println(tps);
710             
711             double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
712             Double theDoubleObject = (Double) workCorpus.get(tps.token);
713             // if either the original token or a degeneration was found use the double value, otherwise use the default
714             if (theDoubleObject != null) {
715                 theDoubleValue = theDoubleObject.doubleValue();
716             }
717             p *= theDoubleValue;
718             np *= (1.0 - theDoubleValue);
719             // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
720         }
721         
722         return (p / (p + np));
723     }
724     
725     private boolean allSameChar(String s) {
726         if (s.length() < 2) {
727             return false;
728         }
729         
730         char c = s.charAt(0);
731         
732         for (int i = 1; i < s.length(); i++) {
733             if (s.charAt(i) != c) {
734                 return false;
735             }
736         }
737         return true;
738     }
739     
740     private boolean allDigits(String s) {
741         for (int i = 0; i < s.length(); i++) {
742             if (!Character.isDigit(s.charAt(i))) {
743                 return false;
744             }
745         }
746         return true;
747     }
748 }
749