Main Page | Compound List | File List | Compound Members | File Members

SpamUtil.C

00001 
00002 /*
00003   This email filter was written by Ian Kaplan, Bear Products
00004   International.  It is copyrighted by Ian Kaplan, 2004,
00005   www.bearcave.com.
00006 
00007   You have permission to use this software without restriction on two
00008   conditions:
00009 
00010     1. You must preserve this copyright notice in this software and
00011        any software derived from it.
00012 
00013     2. You accept any risk entailed in using this software.  By
00014        using this software, you acknowledge that you have a
00015        sophisticated background in software engineering and 
00016        understand the way this software functions.  You further
00017        acknowledge that using this software may result in the
00018        irretrievable loss of important e-email and you alone
00019        are responsible for this loss.
00020 
00021   If either of these conditions are unacceptable, you may not use any
00022   part of this software.
00023 
00024  */
00025 
00026 #include <assert.h>
00027 #include <stdio.h>
00028 
00029 #include <string.h>
00030 #include <ctype.h>
00031 
00032 #include <sys/types.h>
00033 #include <sys/stat.h>
00034 #ifndef _WIN32
00035 #include <unistd.h>
00036 #include <errno.h>
00037 #endif
00038 
00039 #include "SpamUtil.h"
00040 
00041 
00045 const int SpamUtil::getFileSize( FILE *fp, const char *fileName, char *msgbuf )
00046 {
00047   int fileSize = -1;
00048   if (fp != 0) {
00049     struct stat s;
00050     int fd = fileno( fp );
00051     if (fstat( fd, &s) == 0) {
00052       fileSize = s.st_size;
00053     }
00054     else {
00055       char *err_reason = strerror( errno);
00056       sprintf( msgbuf, "SpamUtil::getFileSize: could not open %s.  Reason: %s", 
00057                fileName, err_reason );
00058     }
00059   }
00060   return fileSize;
00061 } // getFileSize
00062 
00063 
00074 void SpamUtil::strncpy(char *pDest, const char *pSrc, size_t destSize )
00075 {
00076   const size_t maxCopy = destSize-1;
00077   size_t ix;
00078   for (ix = 0; ix < maxCopy && pSrc[ix] != '\0'; ix++) {
00079     pDest[ix] = pSrc[ix];
00080   }
00081   pDest[ix] = '\0';
00082 } // strncpy
00083 
00084 
00089 const char *SpamUtil::skipWhiteSpace( const char *pBuf )
00090 {
00091   while (*pBuf != '\0' && isspace( *pBuf )) {
00092     pBuf++;
00093   }
00094   return pBuf;
00095 } // skipWhiteSpace
00096 
00097 
00101 const char *SpamUtil::trimEnd(char *str )
00102 {
00103   if (str != 0 && str[0] != '\0') {
00104     int len = strlen( str );
00105     int i;
00106     for (i = len-1; i >=0 && isspace(str[i]); i--)
00107       /* nada */;
00108     str[i+1] = '\0';
00109   }
00110   return str;
00111 } // trimEnd
00112 
00113 
00119 void SpamUtil::trim(const char *str )
00120 {
00121   if (str != 0) {
00122     char *start = (char *)skipWhiteSpace( str );
00123 
00124     char *ptr = (char *)str;
00125     if (*start != '\0') {
00126       size_t len = strlen( str );
00127       char *end = (char *)&str[len-1];
00128 
00129       while (end > start && isspace( *end )) {
00130         end--;
00131       }
00132       size_t trimLen = (end - start) + 1;
00133       for (size_t i = 0; i < trimLen; i++) {
00134         *ptr = *start;
00135         start++;
00136         ptr++;
00137       }
00138     }
00139    *ptr = '\0';
00140   }
00141 } // trim
00142 
00143 
00144 
00149 bool SpamUtil::isBlankLine(const char *buf)
00150 {
00151   bool blankLine = true;
00152   if (buf != 0) {
00153     for (; *buf != '\0'; buf++) {
00154       if (! isspace( *buf ) ) {
00155         blankLine = false;
00156         break;
00157       }
00158     } // for
00159   }
00160   return blankLine;
00161 } // isBlankLine
00162 
00163 
00164 
00170 bool SpamUtil::match(const char *bufStart,
00171                      const size_t len,
00172                      const char *word )
00173 {
00174   assert( bufStart != 0 && word != 0);
00175 
00176   bool wordsMatch = false;
00177   if (len > 0) {
00178     wordsMatch = true;
00179     for (size_t i = 0; i < len; i++) {
00180       if (tolower(bufStart[i]) != word[i]) {
00181         wordsMatch = false;
00182         break;
00183       }
00184     } // for
00185   }
00186 
00187   return wordsMatch;
00188 } // match
00189 
00190 
00196 bool SpamUtil::match(const char *bufStart,
00197                      const char *bufEnd,
00198                      const char *word )
00199 {
00200   assert( bufStart != 0 && word != 0);
00201 
00202   bool wordsMatch = false;
00203   int regionLen = bufEnd - bufStart;
00204   int wordLen = strlen( word );
00205   if (wordLen > 0 && wordLen == regionLen) {
00206     wordsMatch = true;
00207     for (size_t i = 0; i < wordLen; i++) {
00208       if (tolower(bufStart[i]) != word[i]) {
00209         wordsMatch = false;
00210         break;
00211       }
00212     } // for
00213   }
00214 
00215   return wordsMatch;
00216 } // match
00217 
00218 
00219 
00220 const char *SpamUtil::findColon( const char *buf )
00221 {
00222   const char *pColon = 0;
00223   if (buf != 0) {
00224     for (; *buf != '\0'; buf++) {
00225       if (*buf == ':') {
00226         pColon = buf;
00227         break;
00228       }
00229     } // for
00230   }
00231   return pColon;
00232 } // findColon
00233 
00234 
00235 
00242 void SpamUtil::toLower(char *dest, const char *src, size_t size)
00243 {
00244   // convert to lower case
00245   size_t j;
00246   for (j = 0; j < size-1 && src[j] != '\0'; j++) {
00247     dest[j] = tolower(src[j]);
00248   } // for
00249   dest[j] = '\0';
00250 }
00251 
00252 
00258 void SpamUtil::toLower(char *dest, const char *src)
00259 {
00260   // convert to lower case
00261   size_t j;
00262   for (j = 0; src[j] != '\0'; j++) {
00263     dest[j] = tolower(src[j]);
00264   } // for
00265   dest[j] = '\0';
00266 }
00267 
00268 
00272 SpamUtil::contentType SpamUtil::classifySection( const char *buf )
00273 {
00274   contentType type = UNKNOWN;
00275   char tempBuf[128];
00276 
00277   SpamUtil().toLower(tempBuf, buf, sizeof(tempBuf));
00278 
00279   const char *typeName = "unknown";
00280   if (strstr(buf, "multipart") != 0) {
00281     type = MULTIPART;
00282   }
00283   else if (strstr(buf, "html") != 0) {
00284     type = HTML;
00285   }
00286   else if (strstr(buf, "text") != 0) {
00287     type = TEXT;
00288   }
00289   else if (strstr(buf, "image") != 0) {
00290     type = IMAGE;
00291   }
00292   else if (strstr(buf, "audio") != 0) {
00293     type = AUDIO;
00294   }
00295   else if (strstr(buf, "application") != 0) {
00296     type = APPLICATION;
00297   }
00298 
00299   return type;
00300 } // classifySection
00301 
00302 
00307 const char *SpamUtil::typeToStr( contentType type )
00308 {
00309   const char *typeName;
00310   
00311   switch (type) {
00312   case BAD_CONT:
00313     typeName = "bad content";
00314     break;
00315   case UNKNOWN:
00316     typeName = "unknown";
00317     break;
00318   case BLANK:
00319     typeName = "blank section";
00320     break;
00321   case HTML:
00322     typeName = "html";
00323     break;
00324   case TEXT:
00325     typeName = "text";
00326     break;
00327   case MULTIPART:
00328     typeName = "multipart";
00329     break;
00330   case IMAGE:
00331     typeName = "image";
00332     break;
00333   case APPLICATION:
00334     typeName = "application";
00335     break;
00336   case AUDIO:
00337     typeName = "audio";
00338     break;
00339   case WINDOZ:
00340     typeName = "windows chars";
00341     break;
00342   case BASE64:
00343     typeName = "base64";
00344     break;
00345   default:
00346     typeName = "bad content type";
00347     break;
00348   }
00349   return typeName;
00350 } // typeToStr
00351 
00352 
00353 
00354 const char *SpamUtil::classificationToStr( MailFilter::classification klass )
00355 {
00356   const char *str;
00357 
00358   switch (klass) {
00359   case MailFilter::UNKNOWN:
00360     str = "UNKNOWN";
00361     break;
00362   case MailFilter::EMAIL:
00363     str = "EMAIL";
00364     break;
00365   case MailFilter::SUSPECT:
00366     str = "SUSPECT";
00367     break;
00368   case MailFilter::GARBAGE:
00369     str = "GARBAGE";
00370     break;
00371   default:
00372     str = "bad value";
00373     break;
00374   }
00375 
00376   return str;
00377 }  // classificationToStr
00378 
00379 
00380 
00406 MailFilter::classification SpamUtil::checkLine(const char *buf, 
00407                                                SpamParameters &params,
00408                                                char *foundStr,
00409                                                const size_t foundStrSize )
00410 {
00411   const char *HTMLtag = "<html>"; // remember, input line is in lower case
00412   const char *BODYtag = "<body";  // body may have attributes, so the closing
00413                                   // angle bracket is omitted.
00414   MailFilter::classification klass = MailFilter::UNKNOWN;
00415   std::vector<const char *> spamWords = params.getSection(SpamParameters::spam_words);
00416   std::vector<const char *> killWords = params.getSection(SpamParameters::kill_words);
00417 
00418   foundStr[0] = '\0';
00419   char *pHtmlTag = 0;
00420 
00421   if ((pHtmlTag = strstr(buf, HTMLtag)) == 0) {
00422     pHtmlTag = strstr(buf, BODYtag);
00423   }
00424 
00425   if (pHtmlTag != 0) {
00426     strncpy(foundStr, pHtmlTag, foundStrSize);
00427     klass = MailFilter::SUSPECT;
00428   }
00429   else {
00430     const size_t len = spamWords.size();
00431     for (size_t i = 0; i < len; i++) {
00432       if (strstr(buf, spamWords[i]) != 0) {
00433         strncpy(foundStr, spamWords[i], foundStrSize);
00434         klass = MailFilter::SUSPECT;
00435         break;
00436       }
00437     } // for
00438 
00439     if (klass == MailFilter::UNKNOWN) {
00440       const size_t len = killWords.size();
00441       for (size_t i = 0; i < len; i++) {
00442         if (strstr(buf, killWords[i]) != 0) {
00443           strncpy(foundStr, killWords[i], foundStrSize);
00444           klass = MailFilter::GARBAGE;
00445           break;
00446         }
00447       }
00448     }
00449   }
00450   return klass;
00451 } // checkLine
00452 

Generated on Sat Mar 27 13:07:38 2004 for Mail Filter by doxygen 1.3.3