TOP

mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
198 Token* Tokenizer::add(const char* word, PRUint32 count)
199 {
200     PLDHashEntryHdr* entry = PL_DHashTableOperate(&mTokenTable, word, PL_DHASH_ADD);
201     Token* token = NS_STATIC_CAST(Token*, entry);
202     if (token) {
203         if (token->mWord == NULL) {
204             PRUint32 len = strlen(word);
205             NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
206             token->mWord = copyWord(word, len);
207             NS_ASSERTION(token->mWord, "copyWord failed");
208             if (!token->mWord) {
209                 PL_DHashTableRawRemove(&mTokenTable, entry);
210                 return NULL;
211             }
212             token->mLength = len;
213             token->mCount = count;
214             token->mProbability = 0;
215         } else {
216             token->mCount += count;
217         }
218     }
219     return token;
220 }

599 void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* messageURI,
600                                        nsIJunkMailClassificationListener* listener)
601 {
602     Token* tokens = tokenizer.copyTokens();
603     if (!tokens) return;
604   
605     // the algorithm in "A Plan For Spam" assumes that you have a large good
606     // corpus and a large junk corpus.
607     // that won't be the case with users who first use the junk mail feature
608     // so, we do certain things to encourage them to train.
609     //
610     // if there are no good tokens, assume the message is junk
611     // this will "encourage" the user to train
612     // and if there are no bad tokens, assume the message is not junk
613     // this will also "encourage" the user to train
614     // see bug #194238
615     if (listener && !mGoodCount && !mGoodTokens.countTokens()) {
616       listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::JUNK));
617       return;
618     }
619     else if (listener && !mBadCount && !mBadTokens.countTokens()) {
620       listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::GOOD));
621       return;
622     }
623 
624     /* run the kernel of the Graham filter algorithm here. */
625     PRUint32 i, count = tokenizer.countTokens();
626     double ngood = mGoodCount, nbad = mBadCount;
627     for (i = 0; i < count; ++i) {
628         Token& token = tokens[i];
629         const char* word = token.mWord;
630         // ((g (* 2 (or (gethash word good) 0)))
631         Token* t = mGoodTokens.get(word);
632         double g = 2.0 * ((t != NULL) ? t->mCount : 0);
633         // (b (or (gethash word bad) 0)))
634         t = mBadTokens.get(word);
635         double b = ((t != NULL) ? t->mCount : 0);
636         if ((g + b) > 5) {
637             // (max .01
638             //      (min .99 (float (/ (min 1 (/ b nbad))
639             //                         (+ (min 1 (/ g ngood))
640             //                            (min 1 (/ b nbad)))))))
641             token.mProbability = dmax(.01,
642                                      dmin(.99,
643                                          (dmin(1.0, (b / nbad)) /
644                                               (dmin(1.0, (g / ngood)) +
645                                                dmin(1.0, (b / nbad))))));
646         } else {
647             token.mProbability = 0.4;
648         }
649     }
650     
651     // sort the array by the distance of the token probabilities from a 50-50 value of 0.5.
652     PRUint32 first, last = count;
653     if (count > 15) {
654         first = count - 15;
655         NS_QuickSort(tokens, count, sizeof(Token), compareTokens, NULL);
656     } else {
657         first = 0;
658     }
659 
660     double prod1 = 1.0, prod2 = 1.0;
661     for (i = first; i < last; ++i) {
662         double value = tokens[i].mProbability;
663         prod1 *= value;
664         prod2 *= (1.0 - value);
665     }
666     double prob = (prod1 / (prod1 + prod2));
667     PRBool isJunk = (prob >= 0.90);
668 
669     delete[] tokens;
670 
671     if (listener)
672         listener->OnMessageClassified(messageURI, isJunk ? nsMsgJunkStatus(nsIJunkMailPlugin::JUNK) : nsMsgJunkStatus(nsIJunkMailPlugin::GOOD));
673 }

897 static PRBool readTokens(FILE* stream, Tokenizer& tokenizer)
898 {
899     PRUint32 tokenCount;
900     if (readUInt32(stream, &tokenCount) != 1)
901         return PR_FALSE;
902 
903     PRUint32 bufferSize = 4096;
904     char* buffer = new char[bufferSize];
905     if (!buffer) return PR_FALSE;
906 
907     for (PRUint32 i = 0; i < tokenCount; ++i) {
908         PRUint32 count;
909         if (readUInt32(stream, &count) != 1)
910             break;
911         PRUint32 size;
912         if (readUInt32(stream, &size) != 1)
913             break;
914         if (size >= bufferSize) {
915             delete[] buffer;
916             PRUint32 newBufferSize = 2 * bufferSize;
917             while (size >= newBufferSize)
918                 newBufferSize *= 2;
919             buffer = new char[newBufferSize];
920             if (!buffer) return PR_FALSE;
921             bufferSize = newBufferSize;
922         }
923         if (fread(buffer, size, 1, stream) != 1)
924             break;
925         buffer[size] = '\0';
926         tokenizer.add(buffer, count);
927     }
928     
929     delete[] buffer;
930     
931     return PR_TRUE;
932 }

965 void nsBayesianFilter::readTrainingData()
966 {
967     nsCOMPtr<nsILocalFile> file;
968     nsresult rv = getTrainingFile(file);
969     if (NS_FAILED(rv)) return;
970     
971     PRBool exists;
972     rv = file->Exists(&exists);
973     if (NS_FAILED(rv) || !exists) return;
974 
975     // open the file, and write out training data using fprintf for now.
976     FILE* stream;
977     rv = file->OpenANSIFileDesc("rb", &stream);
978     if (NS_FAILED(rv)) return;
979 
980     // FIXME:  should make sure that the tokenizers are empty.
981     char cookie[4];
982     if (!((fread(cookie, sizeof(cookie), 1, stream) == 1) &&
983           (memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) &&
984           (readUInt32(stream, &mGoodCount) == 1) &&
985           (readUInt32(stream, &mBadCount) == 1) &&
986            readTokens(stream, mGoodTokens) &&
987            readTokens(stream, mBadTokens))) {
988         NS_WARNING("failed to read training data.");
989     }
990     
991     fclose(stream);
992 }