/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/

package org.apache.james.ai.classic;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * <p>
 * Determines probability that text contains Spam.
 * </p>
 *
 * <p>
 * Based upon Paul Grahams' <a href="http://www.paulgraham.com/spam.html">A Plan
 * for Spam</a>. Extended to Paul Grahams' <a
 * href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.
 * </p>
 *
 * <p>
 * Sample method usage:
 * </p>
 *
 * <p>
 * Use: void addHam(Reader) and void addSpam(Reader)
 *
 * methods to build up the Maps of ham &amp; spam tokens/occurrences. Both addHam
 * and addSpam assume they're reading one message at a time, if you feed more
 * than one message per call, be sure to adjust the appropriate message counter:
 * hamMessageCount or spamMessageCount.
 *
 * Then...
 * </p>
 *
 * <p>
 * Use: void buildCorpus()
 *
 * to build the final token/probabilities Map.
 *
 * Use your own methods for persistent storage of either the individual ham/spam
 * corpus &amp; message counts, and/or the final corpus.
 *
 * Then you can...
 * </p>
 *
 * <p>
 * Use: double computeSpamProbability(Reader)
 *
 * to determine the probability that a particular text contains spam. A returned
 * result of 0.9 or above is an indicator that the text was spam.
 * </p>
 *
 * <p>
 * If you use persistent storage, use: void setCorpus(Map)
 *
 * before calling computeSpamProbability.
 * </p>
 *
 * @since 2.3.0
 */
public class BayesianAnalyzer {

    /**
     * Number of "interesting" tokens to use to compute overall spamminess
     * probability.
     */
    private final static int MAX_INTERESTING_TOKENS = 15;

    /**
     * Minimum probability distance from 0.5 to consider a token "interesting"
     * to use to compute overall spamminess probability.
     */
    private final static double INTERESTINGNESS_THRESHOLD = 0.46;

    /**
     * Default token probability to use when a token has not been encountered
     * before.
     */
    private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;

    /** Map of ham tokens and their occurrences. */
    private Map<String, Integer> hamTokenCounts = new HashMap<String, Integer>();

    /** Map of spam tokens and their occurrences. */
    private Map<String, Integer> spamTokenCounts = new HashMap<String, Integer>();

    /** Number of ham messages analyzed. */
    private int hamMessageCount = 0;

    /** Number of spam messages analyzed. */
    private int spamMessageCount = 0;

    /** Final token/probability corpus. */
    private Map<String, Double> corpus = new HashMap<String, Double>();

    /**
     * Inner class for managing Token Probability Strengths during the
     * computeSpamProbability phase.
     *
     * By probability <i>strength</i> we mean the absolute distance of a
     * probability from the middle value 0.5.
     *
     * It implements Comparable so that its sorting is automatic.
     */
    // static: holds no reference to the enclosing analyzer instance.
    private static class TokenProbabilityStrength implements Comparable<TokenProbabilityStrength> {
        /**
         * Message token.
         */
        String token = null;

        /**
         * Token's computed probability strength.
         */
        double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);

        /**
         * Force the natural sort order for this object to be high-to-low:
         * strongest probability first. Ties are broken by the token's natural
         * String order so that distinct tokens with equal strength can coexist
         * in a sorted set.
         *
         * @param anotherTokenProbabilityStrength
         *            A TokenProbabilityStrength instance to compare this
         *            instance with.
         *
         * @return The result of the comparison (before, equal, after).
         */
        public final int compareTo(TokenProbabilityStrength anotherTokenProbabilityStrength) {
            // Double.compare avoids the precision loss of casting a scaled
            // difference to int.
            int result = Double.compare(anotherTokenProbabilityStrength.strength, strength);
            if (result == 0) {
                return this.token.compareTo(anotherTokenProbabilityStrength.token);
            }
            return result;
        }

        /**
         * Simple toString() implementation mostly for debugging purposes.
         *
         * @return String representation of this object.
         */
        @Override
        public String toString() {
            return token + "=" + strength;
        }
    }

    /**
     * Basic class constructor.
     */
    public BayesianAnalyzer() {
    }

    /**
     * Public setter for the hamTokenCounts Map.
     *
     * @param hamTokenCounts
     *            The new ham Token counts Map.
     */
    public void setHamTokenCounts(Map<String, Integer> hamTokenCounts) {
        this.hamTokenCounts = hamTokenCounts;
    }

    /**
     * Public getter for the hamTokenCounts Map.
     */
    public Map<String, Integer> getHamTokenCounts() {
        return this.hamTokenCounts;
    }

    /**
     * Public setter for the spamTokenCounts Map.
     *
     * @param spamTokenCounts
     *            The new spam Token counts Map.
     */
    public void setSpamTokenCounts(Map<String, Integer> spamTokenCounts) {
        this.spamTokenCounts = spamTokenCounts;
    }

    /**
     * Public getter for the spamTokenCounts Map.
     */
    public Map<String, Integer> getSpamTokenCounts() {
        return this.spamTokenCounts;
    }

    /**
     * Public setter for spamMessageCount.
     *
     * @param spamMessageCount
     *            The new spam message count.
     */
    public void setSpamMessageCount(int spamMessageCount) {
        this.spamMessageCount = spamMessageCount;
    }

    /**
     * Public getter for spamMessageCount.
     */
    public int getSpamMessageCount() {
        return this.spamMessageCount;
    }

    /**
     * Public setter for hamMessageCount.
     *
     * @param hamMessageCount
     *            The new ham message count.
     */
    public void setHamMessageCount(int hamMessageCount) {
        this.hamMessageCount = hamMessageCount;
    }

    /**
     * Public getter for hamMessageCount.
     */
    public int getHamMessageCount() {
        return this.hamMessageCount;
    }

    /**
     * Clears all analysis repositories and counters.
     */
    public void clear() {
        corpus.clear();

        tokenCountsClear();

        hamMessageCount = 0;
        spamMessageCount = 0;
    }

    /**
     * Clears token counters.
     */
    public void tokenCountsClear() {
        hamTokenCounts.clear();
        spamTokenCounts.clear();
    }

    /**
     * Public setter for corpus.
     *
     * @param corpus
     *            The new corpus.
     */
    public void setCorpus(Map<String, Double> corpus) {
        this.corpus = corpus;
    }

    /**
     * Public getter for corpus.
     */
    public Map<String, Double> getCorpus() {
        return this.corpus;
    }

    /**
     * Builds the corpus from the existing ham &amp; spam counts.
     */
    public void buildCorpus() {
        // Combine the known ham & spam tokens.
        Set<String> tokens = new HashSet<String>(hamTokenCounts.size() + spamTokenCounts.size());
        tokens.addAll(hamTokenCounts.keySet());
        tokens.addAll(spamTokenCounts.keySet());

        // Iterate through all the tokens and compute their new
        // individual probabilities.
        Map<String, Double> tempCorpus = new HashMap<String, Double>(tokens.size());
        for (String token : tokens) {
            tempCorpus.put(token, Double.valueOf(computeProbability(token)));
        }
        setCorpus(tempCorpus);
    }

    /**
     * Adds a message to the ham list.
     *
     * @param stream
     *            A reader stream on the ham message to analyze
     * @throws IOException
     *             If any error occurs
     */
    public void addHam(Reader stream) throws java.io.IOException {
        addTokenOccurrences(stream, hamTokenCounts);
        hamMessageCount++;
    }

    /**
     * Adds a message to the spam list.
     *
     * @param stream
     *            A reader stream on the spam message to analyze
     * @throws IOException
     *             If any error occurs
     */
    public void addSpam(Reader stream) throws java.io.IOException {
        addTokenOccurrences(stream, spamTokenCounts);
        spamMessageCount++;
    }

    /**
     * Computes the probability that the stream contains SPAM.
     *
     * @param stream
     *            The text to be analyzed for Spamminess.
     * @return A 0.0 - 1.0 probability
     * @throws IOException
     *             If any error occurs
     */
    public double computeSpamProbability(Reader stream) throws java.io.IOException {
        // Build a set of the tokens in the Stream.
        Set<String> tokens = parse(stream);

        // Get the corpus to use in this run.
        // A new corpus may be being built in the meantime.
        Map<String, Double> workCorpus = getCorpus();

        // Assign their probabilities from the Corpus (using an additional
        // calculation to determine spamminess).
        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);

        // Compute and return the overall probability that the
        // stream is SPAM.
        return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
    }

    /**
     * Parses a stream into tokens, and updates the target Map with the
     * token/counts.
     *
     * @param stream
     *            The message text to tokenize.
     * @param target
     *            The token/count Map to update (ham or spam counts).
     * @throws IOException
     *             If any error occurs reading the stream.
     */
    private void addTokenOccurrences(Reader stream, Map<String, Integer> target) throws java.io.IOException {
        String token;
        String header = "";

        // Update target with the tokens/count encountered.
        while ((token = nextToken(stream)) != null) {
            boolean endingLine = false;
            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
                endingLine = true;
                token = token.substring(0, token.length() - 1);
            }

            // Skip empty tokens, over-long header+token combinations, and
            // purely numeric tokens.
            if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
                if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {
                    // Remember the header context; subsequent tokens on this
                    // line are prefixed with it.
                    header = token;
                    if (!endingLine) {
                        continue;
                    }
                }

                token = header + token;

                // Increment the token's occurrence count.
                Integer count = target.get(token);
                target.put(token, count == null ? Integer.valueOf(1) : Integer.valueOf(count.intValue() + 1));
            }

            if (endingLine) {
                // The header context only applies to the line it appeared on.
                header = "";
            }
        }
    }

    /**
     * Parses a stream into tokens, and returns a Set of the unique tokens
     * encountered.
     *
     * @param stream
     *            The message text to tokenize.
     * @return Set of the unique (header-prefixed) tokens encountered.
     * @throws IOException
     *             If any error occurs reading the stream.
     */
    private Set<String> parse(Reader stream) throws java.io.IOException {
        Set<String> tokens = new HashSet<String>();
        String token;
        String header = "";

        // Build a Set of tokens encountered.
        while ((token = nextToken(stream)) != null) {
            boolean endingLine = false;
            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n') {
                endingLine = true;
                token = token.substring(0, token.length() - 1);
            }

            // Same filtering rules as addTokenOccurrences.
            if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
                if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:")) {
                    header = token;
                    if (!endingLine) {
                        continue;
                    }
                }

                token = header + token;

                tokens.add(token);
            }

            if (endingLine) {
                header = "";
            }
        }

        // Return the unique set of tokens encountered.
        return tokens;
    }

    /**
     * Reads the next token from the stream.
     *
     * Token characters are letters, digits, '-', '$', the euro sign, '!' and
     * '\''. A '.' or ',' is kept only between two digits (decimal/thousands
     * separators). A newline terminates the token and is appended to it so
     * callers can detect end-of-line. The well-known header names (From:,
     * Return-Path:, Subject:, To:) are returned as their own tokens.
     *
     * @param reader
     *            The stream to read from; must support mark/reset.
     * @return The next token, or null at end of stream.
     * @throws IOException
     *             If any error occurs reading the stream.
     */
    private String nextToken(Reader reader) throws java.io.IOException {
        StringBuilder token = new StringBuilder();
        int i;
        char ch, ch2;
        boolean previousWasDigit = false;
        boolean tokenCharFound = false;

        // End of stream is detected by read() returning -1 below.
        // (Reader.ready() returning false does NOT mean end of stream, so it
        // must not be used as an EOF test.)
        while ((i = reader.read()) != -1) {

            ch = (char) i;

            if (ch == ':') {
                String tokenString = token.toString() + ':';
                if (tokenString.equals("From:") || tokenString.equals("Return-Path:") || tokenString.equals("Subject:") || tokenString.equals("To:")) {
                    return tokenString;
                }
            }

            if (Character.isLetter(ch) || ch == '-' || ch == '$' || ch == '\u20AC' // the EURO symbol
                    || ch == '!' || ch == '\'') {
                tokenCharFound = true;
                previousWasDigit = false;
                token.append(ch);
            } else if (Character.isDigit(ch)) {
                tokenCharFound = true;
                previousWasDigit = true;
                token.append(ch);
            } else if (previousWasDigit && (ch == '.' || ch == ',')) {
                // Keep '.'/',' only when surrounded by digits: peek ahead one
                // character and put it back if it is not a digit.
                reader.mark(1);
                previousWasDigit = false;
                i = reader.read();
                if (i == -1) {
                    break;
                }
                ch2 = (char) i;
                if (Character.isDigit(ch2)) {
                    tokenCharFound = true;
                    previousWasDigit = true;
                    token.append(ch);
                    token.append(ch2);
                } else {
                    reader.reset();
                    break;
                }
            } else if (ch == '\r') {
                // cr found, ignore
            } else if (ch == '\n') {
                // eol found: terminate the token, keeping the newline so the
                // caller can detect the end of line.
                tokenCharFound = true;
                previousWasDigit = false;
                token.append(ch);
                break;
            } else if (tokenCharFound) {
                // Any other character ends the current token.
                break;
            }
        }

        if (tokenCharFound) {
            return token.toString();
        }
        return null;
    }

    /**
     * Compute the probability that "token" is SPAM.
     *
     * @param token
     *            The token to look up in the ham/spam counts.
     * @return The probability that the token occurs within spam.
     */
    private double computeProbability(String token) {
        double hamFactor = 0;
        double spamFactor = 0;

        boolean foundInHam = hamTokenCounts.containsKey(token);
        boolean foundInSpam = spamTokenCounts.containsKey(token);

        double minThreshold = 0.01;
        double maxThreshold = 0.99;

        if (foundInHam) {
            // Ham occurrences are doubled to bias against false positives
            // (per Graham's "A Plan for Spam").
            hamFactor = 2 * hamTokenCounts.get(token).doubleValue();
            if (!foundInSpam) {
                minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
            }
        }

        if (foundInSpam) {
            spamFactor = spamTokenCounts.get(token).doubleValue();
            if (!foundInHam) {
                maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
            }
        }

        if ((hamFactor + spamFactor) < 5) {
            // This token hasn't been seen enough.
            return DEFAULT_TOKEN_PROBABILITY;
        }

        // A zero factor yields a zero frequency directly; this also avoids a
        // 0/0 = NaN when the corresponding message counter is still zero.
        double spamFreq = spamFactor == 0 ? 0.0 : Math.min(1.0, spamFactor / spamMessageCount);
        double hamFreq = hamFactor == 0 ? 0.0 : Math.min(1.0, hamFactor / hamMessageCount);

        return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
    }

    /**
     * Returns a SortedSet of TokenProbabilityStrength built from the Corpus and
     * the tokens passed in the "tokens" Set. The ordering is from the highest
     * strength to the lowest strength.
     *
     * @param tokens
     *            The message tokens to look up.
     * @param workCorpus
     *            The token/probability corpus to use for this run.
     * @return SortedSet of TokenProbabilityStrength objects.
     */
    private SortedSet<TokenProbabilityStrength> getTokenProbabilityStrengths(Set<String> tokens, Map<String, Double> workCorpus) {
        // Convert to a SortedSet of token probability strengths.
        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = new TreeSet<TokenProbabilityStrength>();

        for (String token : tokens) {
            TokenProbabilityStrength tps = new TokenProbabilityStrength();
            tps.token = token;

            Double probability = workCorpus.get(token);
            if (probability != null) {
                tps.strength = Math.abs(0.5 - probability.doubleValue());
            } else {
                // This token has never been seen before: start from the
                // default probability, then look for the strongest known
                // "degenerated" form of the token.
                Double corpusProbability = Double.valueOf(DEFAULT_TOKEN_PROBABILITY);
                tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
                boolean isTokenDegeneratedFound = false;

                Collection<String> degeneratedTokens = buildDegenerated(token);
                for (String tokenDegenerated : degeneratedTokens) {
                    Double probabilityTemp = workCorpus.get(tokenDegenerated);
                    if (probabilityTemp != null) {
                        double strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
                        if (strengthDegenerated > tps.strength) {
                            isTokenDegeneratedFound = true;
                            tps.strength = strengthDegenerated;
                            corpusProbability = probabilityTemp;
                        }
                    }
                }
                // To reduce memory usage, put in the corpus only if the
                // probability is different from (stronger than) the default.
                if (isTokenDegeneratedFound) {
                    synchronized (workCorpus) {
                        workCorpus.put(token, corpusProbability);
                    }
                }
            }

            tokenProbabilityStrengths.add(tps);
        }

        return tokenProbabilityStrengths;
    }

    /**
     * Builds the list of "degenerated" variants of a token (per Graham's
     * "Better Bayesian Filtering"): lower-cased forms, initial-capital forms,
     * and forms with trailing '!' characters stripped, each with and without
     * the header prefix.
     *
     * @param fullToken
     *            The token, possibly prefixed with a "Header:" context.
     * @return Collection of degenerated token forms.
     */
    private Collection<String> buildDegenerated(String fullToken) {
        ArrayList<String> tokens = new ArrayList<String>();
        String header;
        String token;
        String tokenLower;

        // Look for a header string termination.
        int headerEnd = fullToken.indexOf(':');
        if (headerEnd >= 0) {
            header = fullToken.substring(0, headerEnd);
            token = fullToken.substring(headerEnd);
        } else {
            header = "";
            token = fullToken;
        }

        // Prepare a version of the token containing all lower case (for
        // performance reasons).
        // NOTE(review): default-locale lowercasing; results differ under e.g.
        // the Turkish locale — confirm whether locale-independent casing is
        // intended before changing persisted corpora.
        tokenLower = token.toLowerCase();

        int end = token.length();
        do {
            if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
                tokens.add(header + tokenLower.substring(0, end));
                if (header.length() > 0) {
                    tokens.add(tokenLower.substring(0, end));
                }
            }
            if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
                tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
                if (header.length() > 0) {
                    tokens.add(token.charAt(0) + tokenLower.substring(1, end));
                }
            }

            if (token.charAt(end - 1) != '!') {
                break;
            }

            // Strip one trailing '!' and emit the shortened form as well.
            end--;

            tokens.add(header + token.substring(0, end));
            if (header.length() > 0) {
                tokens.add(token.substring(0, end));
            }
        } while (end > 0);

        return tokens;
    }

    /**
     * Compute the spamminess probability of the interesting tokens in the
     * tokenProbabilityStrengths SortedSet.
     *
     * @param tokenProbabilityStrengths
     *            Tokens sorted from strongest to weakest.
     * @param workCorpus
     *            The token/probability corpus to use for this run.
     * @return Computed spamminess.
     */
    private double computeOverallProbability(SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths, Map<String, Double> workCorpus) {
        double p = 1.0;
        double np = 1.0;
        double tempStrength = 0.5;
        int count = MAX_INTERESTING_TOKENS;
        Iterator<TokenProbabilityStrength> iterator = tokenProbabilityStrengths.iterator();
        // Consume the strongest MAX_INTERESTING_TOKENS tokens, plus any
        // further tokens that are still above the interestingness threshold.
        while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
            TokenProbabilityStrength tps = iterator.next();
            tempStrength = tps.strength;

            // If either the original token or a degeneration was found use the
            // double value, otherwise use the default.
            double theDoubleValue = DEFAULT_TOKEN_PROBABILITY;
            Double theDoubleObject = workCorpus.get(tps.token);
            if (theDoubleObject != null) {
                theDoubleValue = theDoubleObject.doubleValue();
            }
            p *= theDoubleValue;
            np *= (1.0 - theDoubleValue);
        }

        // Combine the individual probabilities (naive Bayes combination).
        return (p / (p + np));
    }

    /**
     * Returns true if the string consists entirely of decimal digits.
     *
     * @param s
     *            The string to test.
     * @return true if every character is a digit, false otherwise.
     */
    private boolean allDigits(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (!Character.isDigit(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}