1 /************************************************************************
2 * Copyright (c) 2000-2006 The Apache Software Foundation. *
3 * All rights reserved. *
4 * ------------------------------------------------------------------- *
5 * Licensed under the Apache License, Version 2.0 (the "License"); you *
6 * may not use this file except in compliance with the License. You *
7 * may obtain a copy of the License at: *
8 * *
9 * http://www.apache.org/licenses/LICENSE-2.0 *
10 * *
11 * Unless required by applicable law or agreed to in writing, software *
12 * distributed under the License is distributed on an "AS IS" BASIS, *
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or *
14 * implied. See the License for the specific language governing *
15 * permissions and limitations under the License. *
16 ***********************************************************************/
17
18 package org.apache.james.util;
19
20 import java.util.Map;
21 import java.util.Set;
22 import java.util.SortedSet;
23 import java.util.TreeSet;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.Iterator;
27 import java.util.Collection;
28 import java.util.ArrayList;
29
30 import java.io.Reader;
31 import java.io.StreamTokenizer;
32 import java.io.StringReader;
33
34 /***
35 * <P>Determines probability that text contains Spam.</P>
36 *
37 * <P>Based upon Paul Grahams' <a href="http://www.paulgraham.com/spam.html">A Plan for Spam</a>.
38 * Extended to Paul Grahams' <a href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.</P>
39 *
40 * <P>Sample method usage:</P>
41 *
42 * <P>Use:
43 * void addHam(Reader)
44 * and
45 * void addSpam(Reader)
46 *
47 * methods to build up the Maps of ham & spam tokens/occurrences.
48 * Both addHam and addSpam assume they're reading one message at a time,
49 * if you feed more than one message per call, be sure to adjust the
50 * appropriate message counter: hamMessageCount or spamMessageCount.
51 *
52 * Then...</P>
53 *
54 * <P>Use:
55 * void buildCorpus()
56 *
57 * to build the final token/probabilities Map.
58 *
59 * Use your own methods for persistent storage of either the individual
60 * ham/spam corpus & message counts, and/or the final corpus.
61 *
62 * Then you can...</P>
63 *
64 * <P>Use:
65 * double computeSpamProbability(Reader)
66 *
67 * to determine the probability that a particular text contains spam.
68 * A returned result of 0.9 or above is an indicator that the text was
69 * spam.</P>
70 *
71 * <P>If you use persistent storage, use:
72 * void setCorpus(Map)
73 *
74 * before calling computeSpamProbability.</P>
75 *
76 * @version CVS $Revision: $ $Date: $
77 * @since 2.3.0
78 */
79
80 public class BayesianAnalyzer {
81
82 /***
83 * Number of "interesting" tokens to use to compute overall
84 * spamminess probability.
85 */
86 private final static int MAX_INTERESTING_TOKENS = 15;
87
88 /***
89 * Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall
90 * spamminess probability.
91 */
92 private final static double INTERESTINGNESS_THRESHOLD = 0.46;
93
94 /***
95 * Default token probability to use when a token has not been
96 * encountered before.
97 */
98 private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;
99
100 /***
101 * Map of ham tokens and their occurrences.
102 *
103 * String key
104 * Integer value
105 */
106 private Map hamTokenCounts = new HashMap();
107
108 /***
109 * Map of spam tokens and their occurrences.
110 *
111 * String key
112 * Integer value
113 */
114 private Map spamTokenCounts = new HashMap();
115
116 /***
117 * Number of ham messages analyzed.
118 */
119 private int hamMessageCount = 0;
120
121 /***
122 * Number of spam messages analyzed.
123 */
124 private int spamMessageCount = 0;
125
126 /***
127 * Final token/probability corpus.
128 *
129 * String key
130 * Double value
131 */
132 private Map corpus = new HashMap();
133
134 /***
135 * Inner class for managing Token Probability Strengths during the
136 * computeSpamProbability phase.
137 *
138 * By probability <i>strength</i> we mean the absolute distance of a
139 * probability from the middle value 0.5.
140 *
141 * It implements Comparable so that it's sorting is automatic.
142 */
143 private class TokenProbabilityStrength
144 implements Comparable {
145 /***
146 * Message token.
147 */
148 String token = null;
149
150 /***
151 * Token's computed probability strength.
152 */
153 double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
154
155 /***
156 * Force the natural sort order for this object to be high-to-low.
157 *
158 * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare
159 * this instance with.
160 *
161 * @return The result of the comparison (before, equal, after).
162 */
163 public final int compareTo(Object anotherTokenProbabilityStrength) {
164 int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);
165 if (result == 0) {
166 return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);
167 } else {
168 return result;
169 }
170 }
171
172 /***
173 * Simple toString () implementation mostly for debugging purposes.
174 *
175 * @return String representation of this object.
176 */
177 public String toString() {
178 StringBuffer sb = new StringBuffer(30);
179
180 sb.append(token)
181 .append("=")
182 .append(strength);
183
184 return sb.toString();
185 }
186 }
187
188 /***
189 * Basic class constructor.
190 */
191 public BayesianAnalyzer() {
192 }
193
194 /***
195 * Public setter for the hamTokenCounts Map.
196 *
197 * @param hamTokenCounts The new ham Token counts Map.
198 */
199 public void setHamTokenCounts(Map hamTokenCounts) {
200 this.hamTokenCounts = hamTokenCounts;
201 }
202
203 /***
204 * Public getter for the hamTokenCounts Map.
205 */
206 public Map getHamTokenCounts() {
207 return this.hamTokenCounts;
208 }
209
210 /***
211 * Public setter for the spamTokenCounts Map.
212 *
213 * @param spamTokenCounts The new spam Token counts Map.
214 */
215 public void setSpamTokenCounts(Map spamTokenCounts) {
216 this.spamTokenCounts = spamTokenCounts;
217 }
218
219 /***
220 * Public getter for the spamTokenCounts Map.
221 */
222 public Map getSpamTokenCounts() {
223 return this.spamTokenCounts;
224 }
225
226 /***
227 * Public setter for spamMessageCount.
228 *
229 * @param spamMessageCount The new spam message count.
230 */
231 public void setSpamMessageCount(int spamMessageCount) {
232 this.spamMessageCount = spamMessageCount;
233 }
234
235 /***
236 * Public getter for spamMessageCount.
237 */
238 public int getSpamMessageCount() {
239 return this.spamMessageCount;
240 }
241
242 /***
243 * Public setter for hamMessageCount.
244 *
245 * @param hamMessageCount The new ham message count.
246 */
247 public void setHamMessageCount(int hamMessageCount) {
248 this.hamMessageCount = hamMessageCount;
249 }
250
251 /***
252 * Public getter for hamMessageCount.
253 */
254 public int getHamMessageCount() {
255 return this.hamMessageCount;
256 }
257
258 /***
259 * Clears all analysis repositories and counters.
260 */
261 public void clear() {
262 corpus.clear();
263
264 tokenCountsClear();
265
266 hamMessageCount = 0;
267 spamMessageCount = 0;
268 }
269
270 /***
271 * Clears token counters.
272 */
273 public void tokenCountsClear() {
274 hamTokenCounts.clear();
275 spamTokenCounts.clear();
276 }
277
278 /***
279 * Public setter for corpus.
280 *
281 * @param corpus The new corpus.
282 */
283 public void setCorpus(Map corpus) {
284 this.corpus = corpus;
285 }
286
287 /***
288 * Public getter for corpus.
289 */
290 public Map getCorpus() {
291 return this.corpus;
292 }
293
294 /***
295 * Builds the corpus from the existing ham & spam counts.
296 */
297 public void buildCorpus() {
298
299 Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
300 set.addAll(hamTokenCounts.keySet());
301 set.addAll(spamTokenCounts.keySet());
302 Map tempCorpus = new HashMap(set.size());
303
304
305
306 Iterator i = set.iterator();
307 while (i.hasNext()) {
308 String token = (String) i.next();
309 tempCorpus.put(token, new Double(computeProbability(token)));
310 }
311 setCorpus(tempCorpus);
312 }
313
314 /***
315 * Adds a message to the ham list.
316 * @param stream A reader stream on the ham message to analyze
317 * @throws IOException If any error occurs
318 */
319 public void addHam(Reader stream)
320 throws java.io.IOException {
321 addTokenOccurrences(stream, hamTokenCounts);
322 hamMessageCount++;
323 }
324
325 /***
326 * Adds a message to the spam list.
327 * @param stream A reader stream on the spam message to analyze
328 * @throws IOException If any error occurs
329 */
330 public void addSpam(Reader stream)
331 throws java.io.IOException {
332 addTokenOccurrences(stream, spamTokenCounts);
333 spamMessageCount++;
334 }
335
336 /***
337 * Computes the probability that the stream contains SPAM.
338 * @param stream The text to be analyzed for Spamminess.
339 * @return A 0.0 - 1.0 probability
340 * @throws IOException If any error occurs
341 */
342 public double computeSpamProbability(Reader stream)
343 throws java.io.IOException {
344
345 Set tokens = parse(stream);
346
347
348
349 Map workCorpus = getCorpus();
350
351
352
353 SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
354
355
356
357 return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
358 }
359
360 /***
361 * Parses a stream into tokens, and updates the target Map
362 * with the token/counts.
363 *
364 * @param stream
365 * @param target
366 */
367 private void addTokenOccurrences(Reader stream, Map target)
368 throws java.io.IOException {
369 String token;
370 String header = "";
371
372
373 while ((token = nextToken(stream)) != null) {
374 boolean endingLine = false;
375 if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
376 endingLine = true;
377 token = token.substring(0, token.length() - 1);
378 }
379
380 if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
381 if (token.equals("From:")
382 || token.equals("Return-Path:")
383 || token.equals("Subject:")
384 || token.equals("To:")
385 ) {
386 header = token;
387 if (!endingLine) {
388 continue;
389 }
390 }
391
392 token = header + token;
393
394 Integer value = null;
395
396 if (target.containsKey(token)) {
397 value = new Integer(((Integer) target.get(token)).intValue() + 1);
398 } else {
399 value = new Integer(1);
400 }
401
402 target.put(token, value);
403 }
404
405 if (endingLine) {
406 header = "";
407 }
408 }
409 }
410
411 /***
412 * Parses a stream into tokens, and returns a Set of
413 * the unique tokens encountered.
414 *
415 * @param stream
416 * @return Set
417 */
418 private Set parse(Reader stream)
419 throws java.io.IOException {
420 Set tokens = new HashSet();
421 String token;
422 String header = "";
423
424
425 while ((token = nextToken(stream)) != null) {
426 boolean endingLine = false;
427 if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
428 endingLine = true;
429 token = token.substring(0, token.length() - 1);
430 }
431
432 if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
433 if (token.equals("From:")
434 || token.equals("Return-Path:")
435 || token.equals("Subject:")
436 || token.equals("To:")
437 ) {
438 header = token;
439 if (!endingLine) {
440 continue;
441 }
442 }
443
444 token = header + token;
445
446 tokens.add(token);
447 }
448
449 if (endingLine) {
450 header = "";
451 }
452 }
453
454
455 return tokens;
456 }
457
458 private String nextToken(Reader reader) throws java.io.IOException {
459 StringBuffer token = new StringBuffer();
460 int i;
461 char ch, ch2;
462 boolean previousWasDigit = false;
463 boolean tokenCharFound = false;
464
465 if (!reader.ready()) {
466 return null;
467 }
468
469 while ((i = reader.read()) != -1) {
470
471 ch = (char) i;
472
473 if (ch == ':') {
474 String tokenString = token.toString() + ':';
475 if (tokenString.equals("From:")
476 || tokenString.equals("Return-Path:")
477 || tokenString.equals("Subject:")
478 || tokenString.equals("To:")
479 ) {
480 return tokenString;
481 }
482 }
483
484 if (Character.isLetter(ch)
485 || ch == '-'
486 || ch == '$'
487 || ch == '\u20AC'
488 || ch == '!'
489 || ch == '\''
490 ) {
491 tokenCharFound = true;
492 previousWasDigit = false;
493 token.append(ch);
494 } else if (Character.isDigit(ch)) {
495 tokenCharFound = true;
496 previousWasDigit = true;
497 token.append(ch);
498 } else if (previousWasDigit && (ch == '.' || ch == ',')) {
499 reader.mark(1);
500 previousWasDigit = false;
501 i = reader.read();
502 if (i == -1) {
503 break;
504 }
505 ch2 = (char) i;
506 if (Character.isDigit(ch2)) {
507 tokenCharFound = true;
508 previousWasDigit = true;
509 token.append(ch);
510 token.append(ch2);
511 } else {
512 reader.reset();
513 break;
514 }
515 } else if (ch == '\r') {
516
517 } else if (ch == '\n') {
518
519 tokenCharFound = true;
520 previousWasDigit = false;
521 token.append(ch);
522 break;
523 } else if (tokenCharFound) {
524 break;
525 }
526 }
527
528 if (tokenCharFound) {
529
530 return token.toString();
531 } else {
532 return null;
533 }
534 }
535
536 /***
537 * Compute the probability that "token" is SPAM.
538 *
539 * @param token
540 * @return The probability that the token occurs within spam.
541 */
542 private double computeProbability(String token) {
543 double hamFactor = 0;
544 double spamFactor = 0;
545
546 boolean foundInHam = false;
547 boolean foundInSpam = false;
548
549 double minThreshold = 0.01;
550 double maxThreshold = 0.99;
551
552 if (hamTokenCounts.containsKey(token)) {
553 foundInHam = true;
554 }
555
556 if (spamTokenCounts.containsKey(token)) {
557 foundInSpam = true;
558 }
559
560 if (foundInHam) {
561 hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();
562 if (!foundInSpam) {
563 minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
564 }
565 }
566
567 if (foundInSpam) {
568 spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();
569 if (!foundInHam) {
570 maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
571 }
572 }
573
574 if ((hamFactor + spamFactor) < 5) {
575
576 return 0.4;
577 }
578
579 double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
580 double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
581
582 return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
583 }
584
585 /***
586 * Returns a SortedSet of TokenProbabilityStrength built from the
587 * Corpus and the tokens passed in the "tokens" Set.
588 * The ordering is from the highest strength to the lowest strength.
589 *
590 * @param tokens
591 * @param workCorpus
592 * @return SortedSet of TokenProbabilityStrength objects.
593 */
594 private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
595
596 SortedSet tokenProbabilityStrengths = new TreeSet();
597
598 Iterator i = tokens.iterator();
599 while (i.hasNext()) {
600 TokenProbabilityStrength tps = new TokenProbabilityStrength();
601
602 tps.token = (String) i.next();
603
604 if (workCorpus.containsKey(tps.token)) {
605 tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
606 }
607 else {
608
609
610 Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
611 tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
612 boolean isTokenDegeneratedFound = false;
613
614 Collection degeneratedTokens = buildDegenerated(tps.token);
615 Iterator iDegenerated = degeneratedTokens.iterator();
616 String tokenDegenerated = null;
617 double strengthDegenerated;
618 while (iDegenerated.hasNext()) {
619 tokenDegenerated = (String) iDegenerated.next();
620 if (workCorpus.containsKey(tokenDegenerated)) {
621 Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
622 strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
623 if (strengthDegenerated > tps.strength) {
624 isTokenDegeneratedFound = true;
625 tps.strength = strengthDegenerated;
626 corpusProbability = probabilityTemp;
627 }
628 }
629 }
630
631 if (isTokenDegeneratedFound) {
632 synchronized(workCorpus) {
633 workCorpus.put(tps.token, corpusProbability);
634 }
635 }
636 }
637
638 tokenProbabilityStrengths.add(tps);
639 }
640
641 return tokenProbabilityStrengths;
642 }
643
644 private Collection buildDegenerated(String fullToken) {
645 ArrayList tokens = new ArrayList();
646 String header;
647 String token;
648
649
650 int headerEnd = fullToken.indexOf(':');
651 if (headerEnd >= 0) {
652 header = fullToken.substring(0, headerEnd);
653 token = fullToken.substring(headerEnd);
654 } else {
655 header = "";
656 token = fullToken;
657 }
658
659 int end = token.length();
660 do {
661 if (!token.substring(0, end).equals(token.substring(0, end).toLowerCase())) {
662 tokens.add(header + token.substring(0, end).toLowerCase());
663 if (header.length() > 0) {
664 tokens.add(token.substring(0, end).toLowerCase());
665 }
666 }
667 if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
668 tokens.add(header + token.charAt(0) + token.substring(1, end).toLowerCase());
669 if (header.length() > 0) {
670 tokens.add(token.charAt(0) + token.substring(1, end).toLowerCase());
671 }
672 }
673
674 if (token.charAt(end - 1) != '!') {
675 break;
676 }
677
678 end--;
679
680 tokens.add(header + token.substring(0, end));
681 if (header.length() > 0) {
682 tokens.add(token.substring(0, end));
683 }
684 } while (end > 0);
685
686 return tokens;
687 }
688
689 /***
690 * Compute the spamminess probability of the interesting tokens in
691 * the tokenProbabilities SortedSet.
692 *
693 * @param tokenProbabilities
694 * @param workCorpus
695 * @return Computed spamminess.
696 */
697 private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
698 double p = 1.0;
699 double np = 1.0;
700 double tempStrength = 0.5;
701 int count = MAX_INTERESTING_TOKENS;
702 Iterator iterator = tokenProbabilityStrengths.iterator();
703 while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
704 TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next();
705 tempStrength = tps.strength;
706
707
708
709 double theDoubleValue = DEFAULT_TOKEN_PROBABILITY;
710 Double theDoubleObject = (Double) workCorpus.get(tps.token);
711
712 if (theDoubleObject != null) {
713 theDoubleValue = theDoubleObject.doubleValue();
714 }
715 p *= theDoubleValue;
716 np *= (1.0 - theDoubleValue);
717
718 }
719
720 return (p / (p + np));
721 }
722
723 private boolean allSameChar(String s) {
724 if (s.length() < 2) {
725 return false;
726 }
727
728 char c = s.charAt(0);
729
730 for (int i = 1; i < s.length(); i++) {
731 if (s.charAt(i) != c) {
732 return false;
733 }
734 }
735 return true;
736 }
737
738 private boolean allDigits(String s) {
739 for (int i = 0; i < s.length(); i++) {
740 if (!Character.isDigit(s.charAt(i))) {
741 return false;
742 }
743 }
744 return true;
745 }
746 }
747