1 /****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one *
3 * or more contributor license agreements. See the NOTICE file *
4 * distributed with this work for additional information *
5 * regarding copyright ownership. The ASF licenses this file *
6 * to you under the Apache License, Version 2.0 (the *
7 * "License"); you may not use this file except in compliance *
8 * with the License. You may obtain a copy of the License at *
9 * *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, *
13 * software distributed under the License is distributed on an *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15 * KIND, either express or implied. See the License for the *
16 * specific language governing permissions and limitations *
17 * under the License. *
18 ****************************************************************/
19
20
21
22 package org.apache.james.util.bayesian;
23
24 import java.util.Map;
25 import java.util.Set;
26 import java.util.SortedSet;
27 import java.util.TreeSet;
28 import java.util.HashMap;
29 import java.util.HashSet;
30 import java.util.Iterator;
31 import java.util.Collection;
32 import java.util.ArrayList;
33
34 import java.io.IOException;
35 import java.io.Reader;
36
37 /**
38 * <P>Determines probability that text contains Spam.</P>
39 *
40 * <P>Based upon Paul Grahams' <a href="http://www.paulgraham.com/spam.html">A Plan for Spam</a>.
41 * Extended to Paul Grahams' <a href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.</P>
42 *
43 * <P>Sample method usage:</P>
44 *
45 * <P>Use:
46 * void addHam(Reader)
47 * and
48 * void addSpam(Reader)
49 *
50 * methods to build up the Maps of ham & spam tokens/occurrences.
51 * Both addHam and addSpam assume they're reading one message at a time,
52 * if you feed more than one message per call, be sure to adjust the
53 * appropriate message counter: hamMessageCount or spamMessageCount.
54 *
55 * Then...</P>
56 *
57 * <P>Use:
58 * void buildCorpus()
59 *
60 * to build the final token/probabilities Map.
61 *
62 * Use your own methods for persistent storage of either the individual
63 * ham/spam corpus & message counts, and/or the final corpus.
64 *
65 * Then you can...</P>
66 *
67 * <P>Use:
68 * double computeSpamProbability(Reader)
69 *
70 * to determine the probability that a particular text contains spam.
71 * A returned result of 0.9 or above is an indicator that the text was
72 * spam.</P>
73 *
74 * <P>If you use persistent storage, use:
75 * void setCorpus(Map)
76 *
77 * before calling computeSpamProbability.</P>
78 *
79 * @version CVS $Revision: 684527 $ $Date: 2008-08-10 16:53:29 +0100 (Sun, 10 Aug 2008) $
80 * @since 2.3.0
81 */
82
83 public class BayesianAnalyzer {
84
85 /**
86 * Number of "interesting" tokens to use to compute overall
87 * spamminess probability.
88 */
89 private final static int MAX_INTERESTING_TOKENS = 15;
90
91 /**
92 * Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall
93 * spamminess probability.
94 */
95 private final static double INTERESTINGNESS_THRESHOLD = 0.46;
96
97 /**
98 * Default token probability to use when a token has not been
99 * encountered before.
100 */
101 private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;
102
103 /**
104 * Map of ham tokens and their occurrences.
105 *
106 * String key
107 * Integer value
108 */
109 private Map hamTokenCounts = new HashMap();
110
111 /**
112 * Map of spam tokens and their occurrences.
113 *
114 * String key
115 * Integer value
116 */
117 private Map spamTokenCounts = new HashMap();
118
119 /**
120 * Number of ham messages analyzed.
121 */
122 private int hamMessageCount = 0;
123
124 /**
125 * Number of spam messages analyzed.
126 */
127 private int spamMessageCount = 0;
128
129 /**
130 * Final token/probability corpus.
131 *
132 * String key
133 * Double value
134 */
135 private Map corpus = new HashMap();
136
137 /**
138 * Inner class for managing Token Probability Strengths during the
139 * computeSpamProbability phase.
140 *
141 * By probability <i>strength</i> we mean the absolute distance of a
142 * probability from the middle value 0.5.
143 *
144 * It implements Comparable so that it's sorting is automatic.
145 */
146 private class TokenProbabilityStrength
147 implements Comparable {
148 /**
149 * Message token.
150 */
151 String token = null;
152
153 /**
154 * Token's computed probability strength.
155 */
156 double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
157
158 /**
159 * Force the natural sort order for this object to be high-to-low.
160 *
161 * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare
162 * this instance with.
163 *
164 * @return The result of the comparison (before, equal, after).
165 */
166 public final int compareTo(Object anotherTokenProbabilityStrength) {
167 int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);
168 if (result == 0) {
169 return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);
170 } else {
171 return result;
172 }
173 }
174
175 /**
176 * Simple toString () implementation mostly for debugging purposes.
177 *
178 * @return String representation of this object.
179 */
180 public String toString() {
181 StringBuffer sb = new StringBuffer(30);
182
183 sb.append(token)
184 .append("=")
185 .append(strength);
186
187 return sb.toString();
188 }
189 }
190
191 /**
192 * Basic class constructor.
193 */
194 public BayesianAnalyzer() {
195 }
196
197 /**
198 * Public setter for the hamTokenCounts Map.
199 *
200 * @param hamTokenCounts The new ham Token counts Map.
201 */
202 public void setHamTokenCounts(Map hamTokenCounts) {
203 this.hamTokenCounts = hamTokenCounts;
204 }
205
206 /**
207 * Public getter for the hamTokenCounts Map.
208 */
209 public Map getHamTokenCounts() {
210 return this.hamTokenCounts;
211 }
212
213 /**
214 * Public setter for the spamTokenCounts Map.
215 *
216 * @param spamTokenCounts The new spam Token counts Map.
217 */
218 public void setSpamTokenCounts(Map spamTokenCounts) {
219 this.spamTokenCounts = spamTokenCounts;
220 }
221
222 /**
223 * Public getter for the spamTokenCounts Map.
224 */
225 public Map getSpamTokenCounts() {
226 return this.spamTokenCounts;
227 }
228
229 /**
230 * Public setter for spamMessageCount.
231 *
232 * @param spamMessageCount The new spam message count.
233 */
234 public void setSpamMessageCount(int spamMessageCount) {
235 this.spamMessageCount = spamMessageCount;
236 }
237
238 /**
239 * Public getter for spamMessageCount.
240 */
241 public int getSpamMessageCount() {
242 return this.spamMessageCount;
243 }
244
245 /**
246 * Public setter for hamMessageCount.
247 *
248 * @param hamMessageCount The new ham message count.
249 */
250 public void setHamMessageCount(int hamMessageCount) {
251 this.hamMessageCount = hamMessageCount;
252 }
253
254 /**
255 * Public getter for hamMessageCount.
256 */
257 public int getHamMessageCount() {
258 return this.hamMessageCount;
259 }
260
261 /**
262 * Clears all analysis repositories and counters.
263 */
264 public void clear() {
265 corpus.clear();
266
267 tokenCountsClear();
268
269 hamMessageCount = 0;
270 spamMessageCount = 0;
271 }
272
273 /**
274 * Clears token counters.
275 */
276 public void tokenCountsClear() {
277 hamTokenCounts.clear();
278 spamTokenCounts.clear();
279 }
280
281 /**
282 * Public setter for corpus.
283 *
284 * @param corpus The new corpus.
285 */
286 public void setCorpus(Map corpus) {
287 this.corpus = corpus;
288 }
289
290 /**
291 * Public getter for corpus.
292 */
293 public Map getCorpus() {
294 return this.corpus;
295 }
296
297 /**
298 * Builds the corpus from the existing ham & spam counts.
299 */
300 public void buildCorpus() {
301 //Combine the known ham & spam tokens.
302 Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
303 set.addAll(hamTokenCounts.keySet());
304 set.addAll(spamTokenCounts.keySet());
305 Map tempCorpus = new HashMap(set.size());
306
307 //Iterate through all the tokens and compute their new
308 //individual probabilities.
309 Iterator i = set.iterator();
310 while (i.hasNext()) {
311 String token = (String) i.next();
312 tempCorpus.put(token, new Double(computeProbability(token)));
313 }
314 setCorpus(tempCorpus);
315 }
316
317 /**
318 * Adds a message to the ham list.
319 * @param stream A reader stream on the ham message to analyze
320 * @throws IOException If any error occurs
321 */
322 public void addHam(Reader stream)
323 throws java.io.IOException {
324 addTokenOccurrences(stream, hamTokenCounts);
325 hamMessageCount++;
326 }
327
328 /**
329 * Adds a message to the spam list.
330 * @param stream A reader stream on the spam message to analyze
331 * @throws IOException If any error occurs
332 */
333 public void addSpam(Reader stream)
334 throws java.io.IOException {
335 addTokenOccurrences(stream, spamTokenCounts);
336 spamMessageCount++;
337 }
338
339 /**
340 * Computes the probability that the stream contains SPAM.
341 * @param stream The text to be analyzed for Spamminess.
342 * @return A 0.0 - 1.0 probability
343 * @throws IOException If any error occurs
344 */
345 public double computeSpamProbability(Reader stream)
346 throws java.io.IOException {
347 //Build a set of the tokens in the Stream.
348 Set tokens = parse(stream);
349
350 // Get the corpus to use in this run
351 // A new corpus may be being built in the meantime
352 Map workCorpus = getCorpus();
353
354 //Assign their probabilities from the Corpus (using an additional
355 //calculation to determine spamminess).
356 SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
357
358 //Compute and return the overall probability that the
359 //stream is SPAM.
360 return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
361 }
362
363 /**
364 * Parses a stream into tokens, and updates the target Map
365 * with the token/counts.
366 *
367 * @param stream
368 * @param target
369 */
370 private void addTokenOccurrences(Reader stream, Map target)
371 throws java.io.IOException {
372 String token;
373 String header = "";
374
375 //Update target with the tokens/count encountered.
376 while ((token = nextToken(stream)) != null) {
377 boolean endingLine = false;
378 if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
379 endingLine = true;
380 token = token.substring(0, token.length() - 1);
381 }
382
383 if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
384 if (token.equals("From:")
385 || token.equals("Return-Path:")
386 || token.equals("Subject:")
387 || token.equals("To:")
388 ) {
389 header = token;
390 if (!endingLine) {
391 continue;
392 }
393 }
394
395 token = header + token;
396
397 Integer value = null;
398
399 if (target.containsKey(token)) {
400 value = new Integer(((Integer) target.get(token)).intValue() + 1);
401 } else {
402 value = new Integer(1);
403 }
404
405 target.put(token, value);
406 }
407
408 if (endingLine) {
409 header = "";
410 }
411 }
412 }
413
414 /**
415 * Parses a stream into tokens, and returns a Set of
416 * the unique tokens encountered.
417 *
418 * @param stream
419 * @return Set
420 */
421 private Set parse(Reader stream)
422 throws java.io.IOException {
423 Set tokens = new HashSet();
424 String token;
425 String header = "";
426
427 //Build a Map of tokens encountered.
428 while ((token = nextToken(stream)) != null) {
429 boolean endingLine = false;
430 if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
431 endingLine = true;
432 token = token.substring(0, token.length() - 1);
433 }
434
435 if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
436 if (token.equals("From:")
437 || token.equals("Return-Path:")
438 || token.equals("Subject:")
439 || token.equals("To:")
440 ) {
441 header = token;
442 if (!endingLine) {
443 continue;
444 }
445 }
446
447 token = header + token;
448
449 tokens.add(token);
450 }
451
452 if (endingLine) {
453 header = "";
454 }
455 }
456
457 //Return the unique set of tokens encountered.
458 return tokens;
459 }
460
461 private String nextToken(Reader reader) throws java.io.IOException {
462 StringBuffer token = new StringBuffer();
463 int i;
464 char ch, ch2;
465 boolean previousWasDigit = false;
466 boolean tokenCharFound = false;
467
468 if (!reader.ready()) {
469 return null;
470 }
471
472 while ((i = reader.read()) != -1) {
473
474 ch = (char) i;
475
476 if (ch == ':') {
477 String tokenString = token.toString() + ':';
478 if (tokenString.equals("From:")
479 || tokenString.equals("Return-Path:")
480 || tokenString.equals("Subject:")
481 || tokenString.equals("To:")
482 ) {
483 return tokenString;
484 }
485 }
486
487 if (Character.isLetter(ch)
488 || ch == '-'
489 || ch == '$'
490 || ch == '\u20AC' // the EURO symbol
491 || ch == '!'
492 || ch == '\''
493 ) {
494 tokenCharFound = true;
495 previousWasDigit = false;
496 token.append(ch);
497 } else if (Character.isDigit(ch)) {
498 tokenCharFound = true;
499 previousWasDigit = true;
500 token.append(ch);
501 } else if (previousWasDigit && (ch == '.' || ch == ',')) {
502 reader.mark(1);
503 previousWasDigit = false;
504 i = reader.read();
505 if (i == -1) {
506 break;
507 }
508 ch2 = (char) i;
509 if (Character.isDigit(ch2)) {
510 tokenCharFound = true;
511 previousWasDigit = true;
512 token.append(ch);
513 token.append(ch2);
514 } else {
515 reader.reset();
516 break;
517 }
518 } else if (ch == '\r') {
519 // cr found, ignore
520 } else if (ch == '\n') {
521 // eol found
522 tokenCharFound = true;
523 previousWasDigit = false;
524 token.append(ch);
525 break;
526 } else if (tokenCharFound) {
527 break;
528 }
529 }
530
531 if (tokenCharFound) {
532 // System.out.println("Token read: " + token);
533 return token.toString();
534 } else {
535 return null;
536 }
537 }
538
539 /**
540 * Compute the probability that "token" is SPAM.
541 *
542 * @param token
543 * @return The probability that the token occurs within spam.
544 */
545 private double computeProbability(String token) {
546 double hamFactor = 0;
547 double spamFactor = 0;
548
549 boolean foundInHam = false;
550 boolean foundInSpam = false;
551
552 double minThreshold = 0.01;
553 double maxThreshold = 0.99;
554
555 if (hamTokenCounts.containsKey(token)) {
556 foundInHam = true;
557 }
558
559 if (spamTokenCounts.containsKey(token)) {
560 foundInSpam = true;
561 }
562
563 if (foundInHam) {
564 hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();
565 if (!foundInSpam) {
566 minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
567 }
568 }
569
570 if (foundInSpam) {
571 spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();
572 if (!foundInHam) {
573 maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
574 }
575 }
576
577 if ((hamFactor + spamFactor) < 5) {
578 //This token hasn't been seen enough.
579 return 0.4;
580 }
581
582 double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
583 double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
584
585 return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
586 }
587
588 /**
589 * Returns a SortedSet of TokenProbabilityStrength built from the
590 * Corpus and the tokens passed in the "tokens" Set.
591 * The ordering is from the highest strength to the lowest strength.
592 *
593 * @param tokens
594 * @param workCorpus
595 * @return SortedSet of TokenProbabilityStrength objects.
596 */
597 private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
598 //Convert to a SortedSet of token probability strengths.
599 SortedSet tokenProbabilityStrengths = new TreeSet();
600
601 Iterator i = tokens.iterator();
602 while (i.hasNext()) {
603 TokenProbabilityStrength tps = new TokenProbabilityStrength();
604
605 tps.token = (String) i.next();
606
607 if (workCorpus.containsKey(tps.token)) {
608 tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
609 }
610 else {
611 //This token has never been seen before,
612 //we'll give it initially the default probability.
613 Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
614 tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
615 boolean isTokenDegeneratedFound = false;
616
617 Collection degeneratedTokens = buildDegenerated(tps.token);
618 Iterator iDegenerated = degeneratedTokens.iterator();
619 String tokenDegenerated = null;
620 double strengthDegenerated;
621 while (iDegenerated.hasNext()) {
622 tokenDegenerated = (String) iDegenerated.next();
623 if (workCorpus.containsKey(tokenDegenerated)) {
624 Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
625 strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
626 if (strengthDegenerated > tps.strength) {
627 isTokenDegeneratedFound = true;
628 tps.strength = strengthDegenerated;
629 corpusProbability = probabilityTemp;
630 }
631 }
632 }
633 // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
634 if (isTokenDegeneratedFound) {
635 synchronized(workCorpus) {
636 workCorpus.put(tps.token, corpusProbability);
637 }
638 }
639 }
640
641 tokenProbabilityStrengths.add(tps);
642 }
643
644 return tokenProbabilityStrengths;
645 }
646
647 private Collection buildDegenerated(String fullToken) {
648 ArrayList tokens = new ArrayList();
649 String header;
650 String token;
651 String tokenLower;
652
653 // look for a header string termination
654 int headerEnd = fullToken.indexOf(':');
655 if (headerEnd >= 0) {
656 header = fullToken.substring(0, headerEnd);
657 token = fullToken.substring(headerEnd);
658 } else {
659 header = "";
660 token = fullToken;
661 }
662
663 // prepare a version of the token containing all lower case (for performance reasons)
664 tokenLower = token.toLowerCase();
665
666 int end = token.length();
667 do {
668 if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
669 tokens.add(header + tokenLower.substring(0, end));
670 if (header.length() > 0) {
671 tokens.add(tokenLower.substring(0, end));
672 }
673 }
674 if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
675 tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
676 if (header.length() > 0) {
677 tokens.add(token.charAt(0) + tokenLower.substring(1, end));
678 }
679 }
680
681 if (token.charAt(end - 1) != '!') {
682 break;
683 }
684
685 end--;
686
687 tokens.add(header + token.substring(0, end));
688 if (header.length() > 0) {
689 tokens.add(token.substring(0, end));
690 }
691 } while (end > 0);
692
693 return tokens;
694 }
695
696 /**
697 * Compute the spamminess probability of the interesting tokens in
698 * the tokenProbabilities SortedSet.
699 *
700 * @param tokenProbabilityStrengths
701 * @param workCorpus
702 * @return Computed spamminess.
703 */
704 private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
705 double p = 1.0;
706 double np = 1.0;
707 double tempStrength = 0.5;
708 int count = MAX_INTERESTING_TOKENS;
709 Iterator iterator = tokenProbabilityStrengths.iterator();
710 while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
711 TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next();
712 tempStrength = tps.strength;
713
714 // System.out.println(tps);
715
716 double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
717 Double theDoubleObject = (Double) workCorpus.get(tps.token);
718 // if either the original token or a degeneration was found use the double value, otherwise use the default
719 if (theDoubleObject != null) {
720 theDoubleValue = theDoubleObject.doubleValue();
721 }
722 p *= theDoubleValue;
723 np *= (1.0 - theDoubleValue);
724 //System.out.println("Token " + tps + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
725 }
726
727 return (p / (p + np));
728 }
729
730 private boolean allSameChar(String s) {
731 if (s.length() < 2) {
732 return false;
733 }
734
735 char c = s.charAt(0);
736
737 for (int i = 1; i < s.length(); i++) {
738 if (s.charAt(i) != c) {
739 return false;
740 }
741 }
742 return true;
743 }
744
745 private boolean allDigits(String s) {
746 for (int i = 0; i < s.length(); i++) {
747 if (!Character.isDigit(s.charAt(i))) {
748 return false;
749 }
750 }
751 return true;
752 }
753 }
754