1 /*****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one *
3 * or more contributor license agreements. See the NOTICE file *
4 * distributed with this work for additional information *
5 * regarding copyright ownership. The ASF licenses this file *
6 * to you under the Apache License, Version 2.0 (the *
7 * "License"); you may not use this file except in compliance *
8 * with the License. You may obtain a copy of the License at *
9 * *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, *
13 * software distributed under the License is distributed on an *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15 * KIND, either express or implied. See the License for the *
16 * specific language governing permissions and limitations *
17 * under the License. *
18 ****************************************************************/
19
20 package org.apache.james.util;
21
22 import java.util.Map;
23 import java.util.Set;
24 import java.util.SortedSet;
25 import java.util.TreeSet;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.Iterator;
29 import java.util.Collection;
30 import java.util.ArrayList;
31
32 import java.io.Reader;
33 import java.io.StreamTokenizer;
34 import java.io.StringReader;
35
36 /***
37 * <P>Determines probability that text contains Spam.</P>
38 *
39 * <P>Based upon Paul Grahams' <a href="http://www.paulgraham.com/spam.html">A Plan for Spam</a>.
40 * Extended to Paul Grahams' <a href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.</P>
41 *
42 * <P>Sample method usage:</P>
43 *
44 * <P>Use:
45 * void addHam(Reader)
46 * and
47 * void addSpam(Reader)
48 *
49 * methods to build up the Maps of ham & spam tokens/occurrences.
50 * Both addHam and addSpam assume they're reading one message at a time,
51 * if you feed more than one message per call, be sure to adjust the
52 * appropriate message counter: hamMessageCount or spamMessageCount.
53 *
54 * Then...</P>
55 *
56 * <P>Use:
57 * void buildCorpus()
58 *
59 * to build the final token/probabilities Map.
60 *
61 * Use your own methods for persistent storage of either the individual
62 * ham/spam corpus & message counts, and/or the final corpus.
63 *
64 * Then you can...</P>
65 *
66 * <P>Use:
67 * double computeSpamProbability(Reader)
68 *
69 * to determine the probability that a particular text contains spam.
70 * A returned result of 0.9 or above is an indicator that the text was
71 * spam.</P>
72 *
73 * <P>If you use persistent storage, use:
74 * void setCorpus(Map)
75 *
76 * before calling computeSpamProbability.</P>
77 *
78 * @version CVS $Revision: $ $Date: $
79 * @since 2.3.0
80 */
81
82 public class BayesianAnalyzer {
83
84 /***
85 * Number of "interesting" tokens to use to compute overall
86 * spamminess probability.
87 */
88 private final static int MAX_INTERESTING_TOKENS = 15;
89
90 /***
91 * Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall
92 * spamminess probability.
93 */
94 private final static double INTERESTINGNESS_THRESHOLD = 0.46;
95
96 /***
97 * Default token probability to use when a token has not been
98 * encountered before.
99 */
100 private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;
101
102 /***
103 * Map of ham tokens and their occurrences.
104 *
105 * String key
106 * Integer value
107 */
108 private Map hamTokenCounts = new HashMap();
109
110 /***
111 * Map of spam tokens and their occurrences.
112 *
113 * String key
114 * Integer value
115 */
116 private Map spamTokenCounts = new HashMap();
117
118 /***
119 * Number of ham messages analyzed.
120 */
121 private int hamMessageCount = 0;
122
123 /***
124 * Number of spam messages analyzed.
125 */
126 private int spamMessageCount = 0;
127
128 /***
129 * Final token/probability corpus.
130 *
131 * String key
132 * Double value
133 */
134 private Map corpus = new HashMap();
135
136 /***
137 * Inner class for managing Token Probability Strengths during the
138 * computeSpamProbability phase.
139 *
140 * By probability <i>strength</i> we mean the absolute distance of a
141 * probability from the middle value 0.5.
142 *
143 * It implements Comparable so that it's sorting is automatic.
144 */
145 private class TokenProbabilityStrength
146 implements Comparable {
147 /***
148 * Message token.
149 */
150 String token = null;
151
152 /***
153 * Token's computed probability strength.
154 */
155 double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
156
157 /***
158 * Force the natural sort order for this object to be high-to-low.
159 *
160 * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare
161 * this instance with.
162 *
163 * @return The result of the comparison (before, equal, after).
164 */
165 public final int compareTo(Object anotherTokenProbabilityStrength) {
166 int result = (int) ((((TokenProbabilityStrength) anotherTokenProbabilityStrength).strength - strength) * 1000000);
167 if (result == 0) {
168 return this.token.compareTo(((TokenProbabilityStrength) anotherTokenProbabilityStrength).token);
169 } else {
170 return result;
171 }
172 }
173
174 /***
175 * Simple toString () implementation mostly for debugging purposes.
176 *
177 * @return String representation of this object.
178 */
179 public String toString() {
180 StringBuffer sb = new StringBuffer(30);
181
182 sb.append(token)
183 .append("=")
184 .append(strength);
185
186 return sb.toString();
187 }
188 }
189
190 /***
191 * Basic class constructor.
192 */
193 public BayesianAnalyzer() {
194 }
195
196 /***
197 * Public setter for the hamTokenCounts Map.
198 *
199 * @param hamTokenCounts The new ham Token counts Map.
200 */
201 public void setHamTokenCounts(Map hamTokenCounts) {
202 this.hamTokenCounts = hamTokenCounts;
203 }
204
205 /***
206 * Public getter for the hamTokenCounts Map.
207 */
208 public Map getHamTokenCounts() {
209 return this.hamTokenCounts;
210 }
211
212 /***
213 * Public setter for the spamTokenCounts Map.
214 *
215 * @param spamTokenCounts The new spam Token counts Map.
216 */
217 public void setSpamTokenCounts(Map spamTokenCounts) {
218 this.spamTokenCounts = spamTokenCounts;
219 }
220
221 /***
222 * Public getter for the spamTokenCounts Map.
223 */
224 public Map getSpamTokenCounts() {
225 return this.spamTokenCounts;
226 }
227
228 /***
229 * Public setter for spamMessageCount.
230 *
231 * @param spamMessageCount The new spam message count.
232 */
233 public void setSpamMessageCount(int spamMessageCount) {
234 this.spamMessageCount = spamMessageCount;
235 }
236
237 /***
238 * Public getter for spamMessageCount.
239 */
240 public int getSpamMessageCount() {
241 return this.spamMessageCount;
242 }
243
244 /***
245 * Public setter for hamMessageCount.
246 *
247 * @param hamMessageCount The new ham message count.
248 */
249 public void setHamMessageCount(int hamMessageCount) {
250 this.hamMessageCount = hamMessageCount;
251 }
252
253 /***
254 * Public getter for hamMessageCount.
255 */
256 public int getHamMessageCount() {
257 return this.hamMessageCount;
258 }
259
260 /***
261 * Clears all analysis repositories and counters.
262 */
263 public void clear() {
264 corpus.clear();
265
266 tokenCountsClear();
267
268 hamMessageCount = 0;
269 spamMessageCount = 0;
270 }
271
272 /***
273 * Clears token counters.
274 */
275 public void tokenCountsClear() {
276 hamTokenCounts.clear();
277 spamTokenCounts.clear();
278 }
279
280 /***
281 * Public setter for corpus.
282 *
283 * @param corpus The new corpus.
284 */
285 public void setCorpus(Map corpus) {
286 this.corpus = corpus;
287 }
288
289 /***
290 * Public getter for corpus.
291 */
292 public Map getCorpus() {
293 return this.corpus;
294 }
295
296 /***
297 * Builds the corpus from the existing ham & spam counts.
298 */
299 public void buildCorpus() {
300
301 Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
302 set.addAll(hamTokenCounts.keySet());
303 set.addAll(spamTokenCounts.keySet());
304 Map tempCorpus = new HashMap(set.size());
305
306
307
308 Iterator i = set.iterator();
309 while (i.hasNext()) {
310 String token = (String) i.next();
311 tempCorpus.put(token, new Double(computeProbability(token)));
312 }
313 setCorpus(tempCorpus);
314 }
315
316 /***
317 * Adds a message to the ham list.
318 * @param stream A reader stream on the ham message to analyze
319 * @throws IOException If any error occurs
320 */
321 public void addHam(Reader stream)
322 throws java.io.IOException {
323 addTokenOccurrences(stream, hamTokenCounts);
324 hamMessageCount++;
325 }
326
327 /***
328 * Adds a message to the spam list.
329 * @param stream A reader stream on the spam message to analyze
330 * @throws IOException If any error occurs
331 */
332 public void addSpam(Reader stream)
333 throws java.io.IOException {
334 addTokenOccurrences(stream, spamTokenCounts);
335 spamMessageCount++;
336 }
337
338 /***
339 * Computes the probability that the stream contains SPAM.
340 * @param stream The text to be analyzed for Spamminess.
341 * @return A 0.0 - 1.0 probability
342 * @throws IOException If any error occurs
343 */
344 public double computeSpamProbability(Reader stream)
345 throws java.io.IOException {
346
347 Set tokens = parse(stream);
348
349
350
351 Map workCorpus = getCorpus();
352
353
354
355 SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
356
357
358
359 return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
360 }
361
362 /***
363 * Parses a stream into tokens, and updates the target Map
364 * with the token/counts.
365 *
366 * @param stream
367 * @param target
368 */
369 private void addTokenOccurrences(Reader stream, Map target)
370 throws java.io.IOException {
371 String token;
372 String header = "";
373
374
375 while ((token = nextToken(stream)) != null) {
376 boolean endingLine = false;
377 if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
378 endingLine = true;
379 token = token.substring(0, token.length() - 1);
380 }
381
382 if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
383 if (token.equals("From:")
384 || token.equals("Return-Path:")
385 || token.equals("Subject:")
386 || token.equals("To:")
387 ) {
388 header = token;
389 if (!endingLine) {
390 continue;
391 }
392 }
393
394 token = header + token;
395
396 Integer value = null;
397
398 if (target.containsKey(token)) {
399 value = new Integer(((Integer) target.get(token)).intValue() + 1);
400 } else {
401 value = new Integer(1);
402 }
403
404 target.put(token, value);
405 }
406
407 if (endingLine) {
408 header = "";
409 }
410 }
411 }
412
413 /***
414 * Parses a stream into tokens, and returns a Set of
415 * the unique tokens encountered.
416 *
417 * @param stream
418 * @return Set
419 */
420 private Set parse(Reader stream)
421 throws java.io.IOException {
422 Set tokens = new HashSet();
423 String token;
424 String header = "";
425
426
427 while ((token = nextToken(stream)) != null) {
428 boolean endingLine = false;
429 if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
430 endingLine = true;
431 token = token.substring(0, token.length() - 1);
432 }
433
434 if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
435 if (token.equals("From:")
436 || token.equals("Return-Path:")
437 || token.equals("Subject:")
438 || token.equals("To:")
439 ) {
440 header = token;
441 if (!endingLine) {
442 continue;
443 }
444 }
445
446 token = header + token;
447
448 tokens.add(token);
449 }
450
451 if (endingLine) {
452 header = "";
453 }
454 }
455
456
457 return tokens;
458 }
459
460 private String nextToken(Reader reader) throws java.io.IOException {
461 StringBuffer token = new StringBuffer();
462 int i;
463 char ch, ch2;
464 boolean previousWasDigit = false;
465 boolean tokenCharFound = false;
466
467 if (!reader.ready()) {
468 return null;
469 }
470
471 while ((i = reader.read()) != -1) {
472
473 ch = (char) i;
474
475 if (ch == ':') {
476 String tokenString = token.toString() + ':';
477 if (tokenString.equals("From:")
478 || tokenString.equals("Return-Path:")
479 || tokenString.equals("Subject:")
480 || tokenString.equals("To:")
481 ) {
482 return tokenString;
483 }
484 }
485
486 if (Character.isLetter(ch)
487 || ch == '-'
488 || ch == '$'
489 || ch == '\u20AC'
490 || ch == '!'
491 || ch == '\''
492 ) {
493 tokenCharFound = true;
494 previousWasDigit = false;
495 token.append(ch);
496 } else if (Character.isDigit(ch)) {
497 tokenCharFound = true;
498 previousWasDigit = true;
499 token.append(ch);
500 } else if (previousWasDigit && (ch == '.' || ch == ',')) {
501 reader.mark(1);
502 previousWasDigit = false;
503 i = reader.read();
504 if (i == -1) {
505 break;
506 }
507 ch2 = (char) i;
508 if (Character.isDigit(ch2)) {
509 tokenCharFound = true;
510 previousWasDigit = true;
511 token.append(ch);
512 token.append(ch2);
513 } else {
514 reader.reset();
515 break;
516 }
517 } else if (ch == '\r') {
518
519 } else if (ch == '\n') {
520
521 tokenCharFound = true;
522 previousWasDigit = false;
523 token.append(ch);
524 break;
525 } else if (tokenCharFound) {
526 break;
527 }
528 }
529
530 if (tokenCharFound) {
531
532 return token.toString();
533 } else {
534 return null;
535 }
536 }
537
538 /***
539 * Compute the probability that "token" is SPAM.
540 *
541 * @param token
542 * @return The probability that the token occurs within spam.
543 */
544 private double computeProbability(String token) {
545 double hamFactor = 0;
546 double spamFactor = 0;
547
548 boolean foundInHam = false;
549 boolean foundInSpam = false;
550
551 double minThreshold = 0.01;
552 double maxThreshold = 0.99;
553
554 if (hamTokenCounts.containsKey(token)) {
555 foundInHam = true;
556 }
557
558 if (spamTokenCounts.containsKey(token)) {
559 foundInSpam = true;
560 }
561
562 if (foundInHam) {
563 hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();
564 if (!foundInSpam) {
565 minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
566 }
567 }
568
569 if (foundInSpam) {
570 spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();
571 if (!foundInHam) {
572 maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
573 }
574 }
575
576 if ((hamFactor + spamFactor) < 5) {
577
578 return 0.4;
579 }
580
581 double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
582 double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
583
584 return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
585 }
586
587 /***
588 * Returns a SortedSet of TokenProbabilityStrength built from the
589 * Corpus and the tokens passed in the "tokens" Set.
590 * The ordering is from the highest strength to the lowest strength.
591 *
592 * @param tokens
593 * @param workCorpus
594 * @return SortedSet of TokenProbabilityStrength objects.
595 */
596 private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
597
598 SortedSet tokenProbabilityStrengths = new TreeSet();
599
600 Iterator i = tokens.iterator();
601 while (i.hasNext()) {
602 TokenProbabilityStrength tps = new TokenProbabilityStrength();
603
604 tps.token = (String) i.next();
605
606 if (workCorpus.containsKey(tps.token)) {
607 tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
608 }
609 else {
610
611
612 Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
613 tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
614 boolean isTokenDegeneratedFound = false;
615
616 Collection degeneratedTokens = buildDegenerated(tps.token);
617 Iterator iDegenerated = degeneratedTokens.iterator();
618 String tokenDegenerated = null;
619 double strengthDegenerated;
620 while (iDegenerated.hasNext()) {
621 tokenDegenerated = (String) iDegenerated.next();
622 if (workCorpus.containsKey(tokenDegenerated)) {
623 Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
624 strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
625 if (strengthDegenerated > tps.strength) {
626 isTokenDegeneratedFound = true;
627 tps.strength = strengthDegenerated;
628 corpusProbability = probabilityTemp;
629 }
630 }
631 }
632
633 if (isTokenDegeneratedFound) {
634 synchronized(workCorpus) {
635 workCorpus.put(tps.token, corpusProbability);
636 }
637 }
638 }
639
640 tokenProbabilityStrengths.add(tps);
641 }
642
643 return tokenProbabilityStrengths;
644 }
645
646 private Collection buildDegenerated(String fullToken) {
647 ArrayList tokens = new ArrayList();
648 String header;
649 String token;
650
651
652 int headerEnd = fullToken.indexOf(':');
653 if (headerEnd >= 0) {
654 header = fullToken.substring(0, headerEnd);
655 token = fullToken.substring(headerEnd);
656 } else {
657 header = "";
658 token = fullToken;
659 }
660
661 int end = token.length();
662 do {
663 if (!token.substring(0, end).equals(token.substring(0, end).toLowerCase())) {
664 tokens.add(header + token.substring(0, end).toLowerCase());
665 if (header.length() > 0) {
666 tokens.add(token.substring(0, end).toLowerCase());
667 }
668 }
669 if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
670 tokens.add(header + token.charAt(0) + token.substring(1, end).toLowerCase());
671 if (header.length() > 0) {
672 tokens.add(token.charAt(0) + token.substring(1, end).toLowerCase());
673 }
674 }
675
676 if (token.charAt(end - 1) != '!') {
677 break;
678 }
679
680 end--;
681
682 tokens.add(header + token.substring(0, end));
683 if (header.length() > 0) {
684 tokens.add(token.substring(0, end));
685 }
686 } while (end > 0);
687
688 return tokens;
689 }
690
691 /***
692 * Compute the spamminess probability of the interesting tokens in
693 * the tokenProbabilities SortedSet.
694 *
695 * @param tokenProbabilities
696 * @param workCorpus
697 * @return Computed spamminess.
698 */
699 private double computeOverallProbability(SortedSet tokenProbabilityStrengths, Map workCorpus) {
700 double p = 1.0;
701 double np = 1.0;
702 double tempStrength = 0.5;
703 int count = MAX_INTERESTING_TOKENS;
704 Iterator iterator = tokenProbabilityStrengths.iterator();
705 while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
706 TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next();
707 tempStrength = tps.strength;
708
709
710
711 double theDoubleValue = DEFAULT_TOKEN_PROBABILITY;
712 Double theDoubleObject = (Double) workCorpus.get(tps.token);
713
714 if (theDoubleObject != null) {
715 theDoubleValue = theDoubleObject.doubleValue();
716 }
717 p *= theDoubleValue;
718 np *= (1.0 - theDoubleValue);
719
720 }
721
722 return (p / (p + np));
723 }
724
725 private boolean allSameChar(String s) {
726 if (s.length() < 2) {
727 return false;
728 }
729
730 char c = s.charAt(0);
731
732 for (int i = 1; i < s.length(); i++) {
733 if (s.charAt(i) != c) {
734 return false;
735 }
736 }
737 return true;
738 }
739
740 private boolean allDigits(String s) {
741 for (int i = 0; i < s.length(); i++) {
742 if (!Character.isDigit(s.charAt(i))) {
743 return false;
744 }
745 }
746 return true;
747 }
748 }
749