/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements. See the NOTICE file  *
 * distributed with this work for additional information        *
 * regarding copyright ownership. The ASF licenses this file    *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License. You may obtain a copy of the License at    *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied. See the License for the     *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/

package org.apache.james.ai.classic;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * <p>
 * Determines the probability that a text contains spam.
 * </p>
 *
 * <p>
 * Based upon Paul Graham's <a href="http://www.paulgraham.com/spam.html">A
 * Plan for Spam</a>. Extended with Paul Graham's <a
 * href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.
 * </p>
 *
 * <p>
 * Sample method usage:
 * </p>
 *
 * <p>
 * Use the void addHam(Reader) and void addSpam(Reader) methods to build up the
 * Maps of ham and spam tokens/occurrences. Both addHam and addSpam assume they
 * are reading one message at a time; if you feed more than one message per
 * call, be sure to adjust the appropriate message counter: hamMessageCount or
 * spamMessageCount.
 *
 * Then...
 * </p>
 *
 * <p>
 * Use void buildCorpus() to build the final token/probabilities Map.
 *
 * Use your own methods for persistent storage of the individual ham/spam token
 * counts and message counts, and/or of the final corpus.
 *
 * Then you can...
 * </p>
 *
 * <p>
 * Use double computeSpamProbability(Reader) to determine the probability that
 * a particular text contains spam. A result of 0.9 or above indicates that the
 * text is spam.
 * </p>
 *
 * <p>
 * If you use persistent storage, call void setCorpus(Map) before calling
 * computeSpamProbability.
 * </p>
 * @since 2.3.0
 */

public class BayesianAnalyzer {

    /**
     * Number of "interesting" tokens to use to compute overall spamminess
     * probability.
     */
    private static final int MAX_INTERESTING_TOKENS = 15;

    /**
     * Minimum probability distance from 0.5 to consider a token "interesting"
     * to use to compute overall spamminess probability.
     */
    private static final double INTERESTINGNESS_THRESHOLD = 0.46;

    /**
     * Default token probability to use when a token has not been encountered
     * before.
     */
    private static final double DEFAULT_TOKEN_PROBABILITY = 0.4;

    /** Map of ham tokens and their occurrences. */
    private Map<String, Integer> hamTokenCounts = new HashMap<String, Integer>();

    /** Map of spam tokens and their occurrences. */
    private Map<String, Integer> spamTokenCounts = new HashMap<String, Integer>();

    /** Number of ham messages analyzed. */
    private int hamMessageCount = 0;

    /** Number of spam messages analyzed. */
    private int spamMessageCount = 0;

    /** Final token/probability corpus. */
    private Map<String, Double> corpus = new HashMap<String, Double>();

    /**
     * Inner class for managing Token Probability Strengths during the
     * computeSpamProbability phase.
     *
     * By probability <i>strength</i> we mean the absolute distance of a
     * probability from the middle value 0.5.
     *
     * It implements Comparable so that its sorting is automatic.
     */
    private class TokenProbabilityStrength implements Comparable<TokenProbabilityStrength> {
        /**
         * Message token.
         */
        String token = null;

        /**
         * Token's computed probability strength.
         */
        double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);

        /**
         * Force the natural sort order for this object to be high-to-low.
         *
         * @param anotherTokenProbabilityStrength
         *            A TokenProbabilityStrength instance to compare this
         *            instance with.
         *
         * @return The result of the comparison (before, equal, after).
         */
        @Override
        public final int compareTo(TokenProbabilityStrength anotherTokenProbabilityStrength) {
            int result = (int) ((anotherTokenProbabilityStrength.strength - strength) * 1000000);
            if (result == 0) {
                return this.token.compareTo(anotherTokenProbabilityStrength.token);
            } else {
                return result;
            }
        }

        /**
         * Simple toString() implementation, mostly for debugging purposes.
         *
         * @return String representation of this object.
         */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder(30);

            sb.append(token).append("=").append(strength);

            return sb.toString();
        }
    }

    /**
     * Basic class constructor.
     */
    public BayesianAnalyzer() {
    }

    /**
     * Public setter for the hamTokenCounts Map.
     *
     * @param hamTokenCounts
     *            The new ham Token counts Map.
     */
    public void setHamTokenCounts(Map<String, Integer> hamTokenCounts) {
        this.hamTokenCounts = hamTokenCounts;
    }

    /**
     * Public getter for the hamTokenCounts Map.
     *
     * @return The ham Token counts Map.
     */
    public Map<String, Integer> getHamTokenCounts() {
        return this.hamTokenCounts;
    }

    /**
     * Public setter for the spamTokenCounts Map.
     *
     * @param spamTokenCounts
     *            The new spam Token counts Map.
     */
    public void setSpamTokenCounts(Map<String, Integer> spamTokenCounts) {
        this.spamTokenCounts = spamTokenCounts;
    }

    /**
     * Public getter for the spamTokenCounts Map.
     *
     * @return The spam Token counts Map.
     */
    public Map<String, Integer> getSpamTokenCounts() {
        return this.spamTokenCounts;
    }

    /**
     * Public setter for spamMessageCount.
     *
     * @param spamMessageCount
     *            The new spam message count.
     */
    public void setSpamMessageCount(int spamMessageCount) {
        this.spamMessageCount = spamMessageCount;
    }

    /**
     * Public getter for spamMessageCount.
     *
     * @return The number of spam messages analyzed.
     */
    public int getSpamMessageCount() {
        return this.spamMessageCount;
    }

    /**
     * Public setter for hamMessageCount.
     *
     * @param hamMessageCount
     *            The new ham message count.
     */
    public void setHamMessageCount(int hamMessageCount) {
        this.hamMessageCount = hamMessageCount;
    }

    /**
     * Public getter for hamMessageCount.
     *
     * @return The number of ham messages analyzed.
     */
    public int getHamMessageCount() {
        return this.hamMessageCount;
    }

    /**
     * Clears all analysis repositories and counters.
     */
    public void clear() {
        corpus.clear();

        tokenCountsClear();

        hamMessageCount = 0;
        spamMessageCount = 0;
    }

    /**
     * Clears token counters.
     */
    public void tokenCountsClear() {
        hamTokenCounts.clear();
        spamTokenCounts.clear();
    }

    /**
     * Public setter for corpus.
     *
     * @param corpus
     *            The new corpus.
     */
    public void setCorpus(Map<String, Double> corpus) {
        this.corpus = corpus;
    }

    /**
     * Public getter for corpus.
     *
     * @return The token/probability corpus.
     */
    public Map<String, Double> getCorpus() {
        return this.corpus;
    }

    /**
     * Builds the corpus from the existing ham and spam counts.
     */
    public void buildCorpus() {
        // Combine the known ham & spam tokens.
        Set<String> set = new HashSet<String>(hamTokenCounts.size() + spamTokenCounts.size());
        set.addAll(hamTokenCounts.keySet());
        set.addAll(spamTokenCounts.keySet());
        Map<String, Double> tempCorpus = new HashMap<String, Double>(set.size());

        // Iterate through all the tokens and compute their new
        // individual probabilities.
        Iterator<String> i = set.iterator();
        while (i.hasNext()) {
            String token = i.next();
            tempCorpus.put(token, Double.valueOf(computeProbability(token)));
        }
        setCorpus(tempCorpus);
    }

    /**
     * Adds a message to the ham list.
     *
     * @param stream
     *            A reader stream on the ham message to analyze
     * @throws IOException
     *             If any error occurs
     */
    public void addHam(Reader stream) throws IOException {
        addTokenOccurrences(stream, hamTokenCounts);
        hamMessageCount++;
    }

    /**
     * Adds a message to the spam list.
     *
     * @param stream
     *            A reader stream on the spam message to analyze
     * @throws IOException
     *             If any error occurs
     */
    public void addSpam(Reader stream) throws IOException {
        addTokenOccurrences(stream, spamTokenCounts);
        spamMessageCount++;
    }

    /**
     * Computes the probability that the stream contains spam.
     *
     * @param stream
     *            The text to be analyzed for spamminess.
     * @return A 0.0 - 1.0 probability
     * @throws IOException
     *             If any error occurs
     */
    public double computeSpamProbability(Reader stream) throws IOException {
        // Build a set of the tokens in the Stream.
        Set<String> tokens = parse(stream);

        // Get the corpus to use in this run.
        // A new corpus may be being built in the meantime.
        Map<String, Double> workCorpus = getCorpus();

        // Assign their probabilities from the Corpus (using an additional
        // calculation to determine spamminess).
        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);

        // Compute and return the overall probability that the
        // stream is spam.
        return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
    }

    /**
     * Parses a stream into tokens, and updates the target Map with the
     * token/counts.
     *
     * @param stream
     *            A reader stream on the message to analyze
     * @param target
     *            The Map of token/counts to update
     * @throws IOException
     *             If any error occurs
     */
    private void addTokenOccurrences(Reader stream, Map<String, Integer> target) throws IOException {
        String token;
        String header = "";

        // Update target with the tokens/count encountered.
        while ((token = nextToken(stream)) != null) {
            boolean endingLine = false;
            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
                endingLine = true;
                token = token.substring(0, token.length() - 1);
            }

            if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
                if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {
                    header = token;
                    if (!endingLine) {
                        continue;
                    }
                }

                token = header + token;

                Integer value = null;

                if (target.containsKey(token)) {
                    value = Integer.valueOf(target.get(token).intValue() + 1);
                } else {
                    value = Integer.valueOf(1);
                }

                target.put(token, value);
            }

            if (endingLine) {
                header = "";
            }
        }
    }

    /**
     * Parses a stream into tokens, and returns a Set of the unique tokens
     * encountered.
     *
     * @param stream
     *            A reader stream on the message to analyze
     * @return The Set of unique tokens encountered
     * @throws IOException
     *             If any error occurs
     */
    private Set<String> parse(Reader stream) throws IOException {
        Set<String> tokens = new HashSet<String>();
        String token;
        String header = "";

        // Build a Map of tokens encountered.
        while ((token = nextToken(stream)) != null) {
            boolean endingLine = false;
            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
                endingLine = true;
                token = token.substring(0, token.length() - 1);
            }

            if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
                if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {
                    header = token;
                    if (!endingLine) {
                        continue;
                    }
                }

                token = header + token;

                tokens.add(token);
            }

            if (endingLine) {
                header = "";
            }
        }

        // Return the unique set of tokens encountered.
        return tokens;
    }

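    /**
     * Reads the next token from the stream. A token is a run of letters,
     * digits, and the characters '-', '$', the euro sign, '!' and '\'', with
     * '.' and ',' accepted between digits. The header names "From:",
     * "Return-Path:", "Subject:" and "To:" are returned as tokens in their own
     * right, and an end of line is reported by appending '\n' to the returned
     * token.
     *
     * @param reader
     *            The reader to read characters from.
     * @return The next token, or null if no further token is available.
     * @throws IOException
     *             If any error occurs
     */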
    private String nextToken(Reader reader) throws IOException {
        StringBuilder token = new StringBuilder();
        int i;
        char ch, ch2;
        boolean previousWasDigit = false;
        boolean tokenCharFound = false;

        if (!reader.ready()) {
            return null;
        }

        while ((i = reader.read()) != -1) {

            ch = (char) i;

            if (ch == ':') {
                String tokenString = token.toString() + ':';
                if (tokenString.equals("From:") || tokenString.equals("Return-Path:") || tokenString.equals("Subject:") || tokenString.equals("To:")) {
                    return tokenString;
                }
            }

            if (Character.isLetter(ch) || ch == '-' || ch == '$' || ch == '\u20AC' // the EURO symbol
                    || ch == '!' || ch == '\'') {
                tokenCharFound = true;
                previousWasDigit = false;
                token.append(ch);
            } else if (Character.isDigit(ch)) {
                tokenCharFound = true;
                previousWasDigit = true;
                token.append(ch);
            } else if (previousWasDigit && (ch == '.' || ch == ',')) {
                reader.mark(1);
                previousWasDigit = false;
                i = reader.read();
                if (i == -1) {
                    break;
                }
                ch2 = (char) i;
                if (Character.isDigit(ch2)) {
                    tokenCharFound = true;
                    previousWasDigit = true;
                    token.append(ch);
                    token.append(ch2);
                } else {
                    reader.reset();
                    break;
                }
            } else if (ch == '\r') {
                // cr found, ignore
            } else if (ch == '\n') {
                // eol found
                tokenCharFound = true;
                previousWasDigit = false;
                token.append(ch);
                break;
            } else if (tokenCharFound) {
                break;
            }
        }

        if (tokenCharFound) {
            // System.out.println("Token read: " + token);
            return token.toString();
        } else {
            return null;
        }
    }

    /**
     * Computes the probability that the given token indicates spam.
     *
     * @param token
     *            The token to compute the probability for.
     * @return The probability that the token occurs within spam.
     */
    private double computeProbability(String token) {
        double hamFactor = 0;
        double spamFactor = 0;

        boolean foundInHam = false;
        boolean foundInSpam = false;

        double minThreshold = 0.01;
        double maxThreshold = 0.99;

        if (hamTokenCounts.containsKey(token)) {
            foundInHam = true;
        }

        if (spamTokenCounts.containsKey(token)) {
            foundInSpam = true;
        }

        // Ham occurrences are weighted double, biasing the filter against
        // false positives.
        if (foundInHam) {
            hamFactor = 2 * hamTokenCounts.get(token).doubleValue();
            if (!foundInSpam) {
                minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
            }
        }

        if (foundInSpam) {
            spamFactor = spamTokenCounts.get(token).doubleValue();
            if (!foundInHam) {
                maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
            }
        }

        if ((hamFactor + spamFactor) < 5) {
            // This token hasn't been seen enough.
            return DEFAULT_TOKEN_PROBABILITY;
        }

        // The token's spam frequency divided by the sum of its ham and spam
        // frequencies, clamped to [minThreshold, maxThreshold].
        double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
        double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);

        return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
    }

    /**
     * Returns a SortedSet of TokenProbabilityStrength built from the Corpus
     * and the tokens passed in the "tokens" Set. The ordering is from the
     * highest strength to the lowest strength.
     *
     * @param tokens
     *            The tokens to look up
     * @param workCorpus
     *            The corpus to look the tokens up in
     * @return SortedSet of TokenProbabilityStrength objects.
     */
    private SortedSet<TokenProbabilityStrength> getTokenProbabilityStrengths(Set<String> tokens, Map<String, Double> workCorpus) {
        // Convert to a SortedSet of token probability strengths.
        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = new TreeSet<TokenProbabilityStrength>();

        Iterator<String> i = tokens.iterator();
        while (i.hasNext()) {
            TokenProbabilityStrength tps = new TokenProbabilityStrength();

            tps.token = i.next();

            if (workCorpus.containsKey(tps.token)) {
                tps.strength = Math.abs(0.5 - workCorpus.get(tps.token).doubleValue());
            } else {
                // This token has never been seen before;
                // give it the default probability initially.
                Double corpusProbability = Double.valueOf(DEFAULT_TOKEN_PROBABILITY);
                tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
                boolean isTokenDegeneratedFound = false;

                Collection<String> degeneratedTokens = buildDegenerated(tps.token);
                Iterator<String> iDegenerated = degeneratedTokens.iterator();
                String tokenDegenerated = null;
                double strengthDegenerated;
                while (iDegenerated.hasNext()) {
                    tokenDegenerated = iDegenerated.next();
                    if (workCorpus.containsKey(tokenDegenerated)) {
                        Double probabilityTemp = workCorpus.get(tokenDegenerated);
                        strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
                        if (strengthDegenerated > tps.strength) {
                            isTokenDegeneratedFound = true;
                            tps.strength = strengthDegenerated;
                            corpusProbability = probabilityTemp;
                        }
                    }
                }
                // To reduce memory usage, put it in the corpus only if the
                // probability is different from (stronger than) the default.
                if (isTokenDegeneratedFound) {
                    synchronized (workCorpus) {
                        workCorpus.put(tps.token, corpusProbability);
                    }
                }
            }

            tokenProbabilityStrengths.add(tps);
        }

        return tokenProbabilityStrengths;
    }

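    /**
     * Builds the "degenerated" forms of a token, in the spirit of Graham's
     * "Better Bayesian Filtering": lower-cased variants, a variant with only
     * the first letter capitalized, variants with trailing '!' characters
     * stripped, and, for tokens carrying a header prefix such as "Subject:",
     * the same variants without the header name. These are tried when the
     * exact token is not present in the corpus.
     *
     * @param fullToken
     *            The token to degenerate, possibly prefixed by a header name.
     * @return The Collection of degenerated tokens.
     */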
    private Collection<String> buildDegenerated(String fullToken) {
        ArrayList<String> tokens = new ArrayList<String>();
        String header;
        String token;
        String tokenLower;

        // Look for a header string termination.
        int headerEnd = fullToken.indexOf(':');
        if (headerEnd >= 0) {
            header = fullToken.substring(0, headerEnd);
            token = fullToken.substring(headerEnd);
        } else {
            header = "";
            token = fullToken;
        }

        // Prepare an all lower case version of the token (for performance
        // reasons).
        tokenLower = token.toLowerCase();

        int end = token.length();
        do {
            if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
                tokens.add(header + tokenLower.substring(0, end));
                if (header.length() > 0) {
                    tokens.add(tokenLower.substring(0, end));
                }
            }
            if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
                tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
                if (header.length() > 0) {
                    tokens.add(token.charAt(0) + tokenLower.substring(1, end));
                }
            }

            if (token.charAt(end - 1) != '!') {
                break;
            }

            end--;

            tokens.add(header + token.substring(0, end));
            if (header.length() > 0) {
                tokens.add(token.substring(0, end));
            }
        } while (end > 0);

        return tokens;
    }

    /**
     * Computes the spamminess probability of the interesting tokens in the
     * tokenProbabilityStrengths SortedSet, combining the individual token
     * probabilities with Graham's rule: p = prod(p_i) / (prod(p_i) +
     * prod(1 - p_i)).
     *
     * @param tokenProbabilityStrengths
     *            The SortedSet of token probability strengths, strongest first
     * @param workCorpus
     *            The corpus to look the token probabilities up in
     * @return Computed spamminess.
     */
    private double computeOverallProbability(SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths, Map<String, Double> workCorpus) {
        double p = 1.0;
        double np = 1.0;
        double tempStrength = 0.5;
        int count = MAX_INTERESTING_TOKENS;
        Iterator<TokenProbabilityStrength> iterator = tokenProbabilityStrengths.iterator();
        while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
            TokenProbabilityStrength tps = iterator.next();
            tempStrength = tps.strength;

            // System.out.println(tps);

            // Initialize to the default and use the corpus value if either the
            // original token or one of its degenerations was found.
            double theDoubleValue = DEFAULT_TOKEN_PROBABILITY;
            Double theDoubleObject = workCorpus.get(tps.token);
            if (theDoubleObject != null) {
                theDoubleValue = theDoubleObject.doubleValue();
            }
            p *= theDoubleValue;
            np *= (1.0 - theDoubleValue);
            // System.out.println("Token " + tps + ", p=" + theDoubleValue +
            // ", overall p=" + p / (p + np));
        }

        return (p / (p + np));
    }

    /**
     * Returns true if the given String is made up entirely of digits.
     *
     * @param s
     *            The String to check.
     * @return True if all characters are digits, false otherwise.
     */
    private boolean allDigits(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (!Character.isDigit(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}