TOP
mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
198 Token* Tokenizer::add(const char* word, PRUint32 count)
199 {
200 PLDHashEntryHdr* entry = PL_DHashTableOperate(&mTokenTable, word, PL_DHASH_ADD);
201 Token* token = NS_STATIC_CAST(Token*, entry);
202 if (token) {
203 if (token->mWord == NULL) {
204 PRUint32 len = strlen(word);
205 NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
206 token->mWord = copyWord(word, len);
207 NS_ASSERTION(token->mWord, "copyWord failed");
208 if (!token->mWord) {
209 PL_DHashTableRawRemove(&mTokenTable, entry);
210 return NULL;
211 }
212 token->mLength = len;
213 token->mCount = count;
214 token->mProbability = 0;
215 } else {
216 token->mCount += count;
217 }
218 }
219 return token;
220 }
599 void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* messageURI,
600 nsIJunkMailClassificationListener* listener)
601 {
602 Token* tokens = tokenizer.copyTokens();
603 if (!tokens) return;
604
605 // the algorithm in "A Plan For Spam" assumes that you have a large good
606 // corpus and a large junk corpus.
607 // that won't be the case with users who first use the junk mail feature
608 // so, we do certain things to encourage them to train.
609 //
610 // if there are no good tokens, assume the message is junk
611 // this will "encourage" the user to train
612 // and if there are no bad tokens, assume the message is not junk
613 // this will also "encourage" the user to train
614 // see bug #194238
615 if (listener && !mGoodCount && !mGoodTokens.countTokens()) {
616 listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::JUNK));
617 return;
618 }
619 else if (listener && !mBadCount && !mBadTokens.countTokens()) {
620 listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::GOOD));
621 return;
622 }
623
624 /* run the kernel of the Graham filter algorithm here. */
625 PRUint32 i, count = tokenizer.countTokens();
626 double ngood = mGoodCount, nbad = mBadCount;
627 for (i = 0; i < count; ++i) {
628 Token& token = tokens[i];
629 const char* word = token.mWord;
630 // ((g (* 2 (or (gethash word good) 0)))
631 Token* t = mGoodTokens.get(word);
632 double g = 2.0 * ((t != NULL) ? t->mCount : 0);
633 // (b (or (gethash word bad) 0)))
634 t = mBadTokens.get(word);
635 double b = ((t != NULL) ? t->mCount : 0);
636 if ((g + b) > 5) {
637 // (max .01
638 // (min .99 (float (/ (min 1 (/ b nbad))
639 // (+ (min 1 (/ g ngood))
640 // (min 1 (/ b nbad)))))))
641 token.mProbability = dmax(.01,
642 dmin(.99,
643 (dmin(1.0, (b / nbad)) /
644 (dmin(1.0, (g / ngood)) +
645 dmin(1.0, (b / nbad))))));
646 } else {
647 token.mProbability = 0.4;
648 }
649 }
650
651 // sort the array by the distance of the token probabilities from a 50-50 value of 0.5.
652 PRUint32 first, last = count;
653 if (count > 15) {
654 first = count - 15;
655 NS_QuickSort(tokens, count, sizeof(Token), compareTokens, NULL);
656 } else {
657 first = 0;
658 }
659
660 double prod1 = 1.0, prod2 = 1.0;
661 for (i = first; i < last; ++i) {
662 double value = tokens[i].mProbability;
663 prod1 *= value;
664 prod2 *= (1.0 - value);
665 }
666 double prob = (prod1 / (prod1 + prod2));
667 PRBool isJunk = (prob >= 0.90);
668
669 delete[] tokens;
670
671 if (listener)
672 listener->OnMessageClassified(messageURI, isJunk ? nsMsgJunkStatus(nsIJunkMailPlugin::JUNK) : nsMsgJunkStatus(nsIJunkMailPlugin::GOOD));
673 }
897 static PRBool readTokens(FILE* stream, Tokenizer& tokenizer)
898 {
899 PRUint32 tokenCount;
900 if (readUInt32(stream, &tokenCount) != 1)
901 return PR_FALSE;
902
903 PRUint32 bufferSize = 4096;
904 char* buffer = new char[bufferSize];
905 if (!buffer) return PR_FALSE;
906
907 for (PRUint32 i = 0; i < tokenCount; ++i) {
908 PRUint32 count;
909 if (readUInt32(stream, &count) != 1)
910 break;
911 PRUint32 size;
912 if (readUInt32(stream, &size) != 1)
913 break;
914 if (size >= bufferSize) {
915 delete[] buffer;
916 PRUint32 newBufferSize = 2 * bufferSize;
917 while (size >= newBufferSize)
918 newBufferSize *= 2;
919 buffer = new char[newBufferSize];
920 if (!buffer) return PR_FALSE;
921 bufferSize = newBufferSize;
922 }
923 if (fread(buffer, size, 1, stream) != 1)
924 break;
925 buffer[size] = '\0';
926 tokenizer.add(buffer, count);
927 }
928
929 delete[] buffer;
930
931 return PR_TRUE;
932 }
965 void nsBayesianFilter::readTrainingData()
966 {
967 nsCOMPtr<nsILocalFile> file;
968 nsresult rv = getTrainingFile(file);
969 if (NS_FAILED(rv)) return;
970
971 PRBool exists;
972 rv = file->Exists(&exists);
973 if (NS_FAILED(rv) || !exists) return;
974
975 // open the file, and write out training data using fprintf for now.
976 FILE* stream;
977 rv = file->OpenANSIFileDesc("rb", &stream);
978 if (NS_FAILED(rv)) return;
979
980 // FIXME: should make sure that the tokenizers are empty.
981 char cookie[4];
982 if (!((fread(cookie, sizeof(cookie), 1, stream) == 1) &&
983 (memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) &&
984 (readUInt32(stream, &mGoodCount) == 1) &&
985 (readUInt32(stream, &mBadCount) == 1) &&
986 readTokens(stream, mGoodTokens) &&
987 readTokens(stream, mBadTokens))) {
988 NS_WARNING("failed to read training data.");
989 }
990
991 fclose(stream);
992 }