Main Page | Compound List | File List | Compound Members | File Members

SpamUtil Class Reference

#include <SpamUtil.h>

List of all members.

Public Types

enum  contentType {
  BAD_CONT, UNKNOWN, BLANK, HTML,
  TEXT, MULTIPART, AUDIO, IMAGE,
  APPLICATION, WINDOZ, BASE64
}

Public Member Functions

const char * classificationToStr (MailFilter::classification klass)

Static Public Member Functions

const int getFileSize (FILE *fp, const char *fileName, char *msgbuf)
const char * trimEnd (char *str)
void trim (const char *str)
const char * findColon (const char *buf)
const char * skipWhiteSpace (const char *pBuf)
bool isBlankLine (const char *buf)
void strncpy (char *pDest, const char *pSrc, size_t destSize)
void toLower (char *dest, const char *src)
void toLower (char *dest, const char *src, size_t size)
bool match (const char *bufStart, const size_t len, const char *word)
bool match (const char *bufStart, const char *bufEnd, const char *word)
contentType classifySection (const char *buf)
const char * typeToStr (contentType type)
MailFilter::classification checkLine (const char *buf, SpamParameters &params, char *foundStr, const size_t foundStrSize)

Private Member Functions

 SpamUtil (const SpamUtil &rhs)


Detailed Description

Common utility methods for the spam filter. These are all stateless, static functions. The SpamUtil class is also stateless. The functions in SpamUtil can be called by creating a class temporary, since they are all statics. For example:
SpamUtil().trim( myString );

Definition at line 41 of file SpamUtil.h.


Constructor & Destructor Documentation

SpamUtil::SpamUtil (const SpamUtil &rhs)  [private]
 

The copy constructor is declared private to disallow copying.

Referenced by classifySection().


Member Function Documentation

MailFilter::classification SpamUtil::checkLine (const char *buf,
    SpamParameters &params,
    char *foundStr,
    const size_t foundStrSize)  [static]
 

Check the line in buf against the spam words and the kill words. If a spam word is found, the function returns SUSPECT. If a kill word is found the function returns GARBAGE.

Important Note: the input line must be converted into lower case, or the matches against the spam and kill words may fail when they should not.

If this function is called on lines read from the HTML section of an email, the HTML tags should be stripped out. Since HTML tags are not stripped in the text section, if the HTMLtag or BODYtag strings are found, the email is marked as suspect (spammers will sometimes include HTML in a text section and count on the flexibility of email software to convert it).

Parameters:
buf a line in which all the characters are in lower case (call SpamUtil::toLower to convert it to lower case).
params a reference to the SpamParameters object which encapsulates the vectors for spam and kill words.
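foundStr a character array that will be used to return the text that was matched: the spam word or kill word that was found or, when an HTML or BODY tag is found, the line from the tag onward (truncated to fit).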
foundStrSize the size of foundStr.

Definition at line 406 of file SpamUtil.C.

References SpamParameters::getSection(), and strncpy().

00410 { // Classify one lower-cased line: HTML/BODY tag or spam word -> SUSPECT, kill word -> GARBAGE, otherwise UNKNOWN.
00411   const char *HTMLtag = "<html>"; // remember, input line is in lower case
00412   const char *BODYtag = "<body";  // body may have attributes, so the closing
00413                                   // angle bracket is omitted.
00414   MailFilter::classification klass = MailFilter::UNKNOWN;
00415   std::vector<const char *> spamWords = params.getSection(SpamParameters::spam_words); // held by value; NOTE(review): if getSection returns a reference this copies the vector on every call
00416   std::vector<const char *> killWords = params.getSection(SpamParameters::kill_words); // held by value, same caveat as spamWords
00417 
00418   foundStr[0] = '\0'; // start with an empty result; foundStr must be non-null with room for at least one char
00419   char *pHtmlTag = 0; // NOTE(review): receives strstr() of a const char*; standard C++ <cstring> returns const char* for that overload
00420 
00421   if ((pHtmlTag = strstr(buf, HTMLtag)) == 0) { // look for "<html>" first ...
00422     pHtmlTag = strstr(buf, BODYtag); // ... and fall back to "<body"
00423   }
00424 
00425   if (pHtmlTag != 0) {
00426     strncpy(foundStr, pHtmlTag, foundStrSize); // copies from the tag to the end of the line, bounded by foundStrSize
00427     klass = MailFilter::SUSPECT;
00428   }
00429   else {
00430     const size_t len = spamWords.size();
00431     for (size_t i = 0; i < len; i++) {
00432       if (strstr(buf, spamWords[i]) != 0) { // substring match anywhere in the line
00433         strncpy(foundStr, spamWords[i], foundStrSize);
00434         klass = MailFilter::SUSPECT;
00435         break; // the first matching spam word wins
00436       }
00437     } // for
00438 
00439     if (klass == MailFilter::UNKNOWN) { // kill words are consulted only when no spam word matched
00440       const size_t len = killWords.size();
00441       for (size_t i = 0; i < len; i++) {
00442         if (strstr(buf, killWords[i]) != 0) { // substring match anywhere in the line
00443           strncpy(foundStr, killWords[i], foundStrSize);
00444           klass = MailFilter::GARBAGE;
00445           break; // the first matching kill word wins
00446         }
00447       }
00448     }
00449   }
00450   return klass;
00451 } // checkLine

SpamUtil::contentType SpamUtil::classifySection (const char *buf)  [static]
 

Classify a Content-Type section

Definition at line 272 of file SpamUtil.C.

References SpamUtil().

00273 { // Map a Content-Type line to a contentType by substring search; UNKNOWN when no keyword is found.
00274   contentType type = UNKNOWN;
00275   char tempBuf[128]; // receives a lower-cased copy of buf, truncated to 127 characters
00276 
00277   SpamUtil().toLower(tempBuf, buf, sizeof(tempBuf));
00278 
00279   const char *typeName = "unknown"; // NOTE(review): never read in this function
00280   if (strstr(buf, "multipart") != 0) { // NOTE(review): this and every test below search buf, not the lower-cased tempBuf, so matching is case-sensitive
00281     type = MULTIPART;
00282   }
00283   else if (strstr(buf, "html") != 0) { // tested before "text", so a line containing both yields HTML
00284     type = HTML;
00285   }
00286   else if (strstr(buf, "text") != 0) {
00287     type = TEXT;
00288   }
00289   else if (strstr(buf, "image") != 0) {
00290     type = IMAGE;
00291   }
00292   else if (strstr(buf, "audio") != 0) {
00293     type = AUDIO;
00294   }
00295   else if (strstr(buf, "application") != 0) {
00296     type = APPLICATION;
00297   }
00298 
00299   return type;
00300 } // classifySection

const int SpamUtil::getFileSize (FILE *fp,
    const char *fileName,
    char *msgbuf)  [static]
 

Return the file size in bytes, or -1 if there is an error. When fstat fails, an error message naming fileName is written to msgbuf.

Definition at line 45 of file SpamUtil.C.

00046 { // Size of the open stream fp as reported by fstat(); -1 when fp is null or fstat fails.
00047   int fileSize = -1;
00048   if (fp != 0) {
00049     struct stat s;
00050     int fd = fileno( fp ); // descriptor underlying the stdio stream
00051     if (fstat( fd, &s) == 0) {
00052       fileSize = s.st_size; // NOTE(review): st_size is off_t; the value is narrowed where off_t is wider than int
00053     }
00054     else {
00055       char *err_reason = strerror( errno);
00056       sprintf( msgbuf, "SpamUtil::getFileSize: could not open %s.  Reason: %s", 
00057                fileName, err_reason ); // NOTE(review): unbounded write, msgbuf is not checked for null, and the text says "could not open" although the failing call is fstat
00058     }
00059   }
00060   return fileSize; // msgbuf is left untouched on success and when fp is null
00061 } // getFileSize

bool SpamUtil::isBlankLine (const char *buf)  [static]
 

Return true if buf is a null pointer, is an empty string, or contains only white space. Return false otherwise.

Definition at line 149 of file SpamUtil.C.

00150 { // True for a null pointer, an empty string, or a string made up only of isspace() characters.
00151   bool blankLine = true; // stays true unless a non-space character is seen
00152   if (buf != 0) {
00153     for (; *buf != '\0'; buf++) {
00154       if (! isspace( *buf ) ) { // NOTE(review): plain char is passed to isspace; negative values are undefined behaviour, consider an unsigned char cast
00155         blankLine = false;
00156         break; // one non-space character is enough
00157       }
00158     } // for
00159   }
00160   return blankLine;
00161 } // isBlankLine

bool SpamUtil::match (const char *bufStart,
    const char *bufEnd,
    const char *word)  [static]
 

Check whether a region of a string (defined by bufStart and bufEnd) matches the contents of the string in "word". If there is a match, return true, otherwise return false.

Definition at line 196 of file SpamUtil.C.

00199 { // Compare the region [bufStart, bufEnd), lower-cased, against word; the lengths must be equal and non-zero.
00200   assert( bufStart != 0 && word != 0); // bufEnd is not checked
00201 
00202   bool wordsMatch = false;
00203   int regionLen = bufEnd - bufStart; // bufEnd is treated as one past the last character of the region
00204   int wordLen = strlen( word );
00205   if (wordLen > 0 && wordLen == regionLen) { // an empty word never matches
00206     wordsMatch = true;
00207     for (size_t i = 0; i < wordLen; i++) { // NOTE(review): size_t compared with int; harmless here because wordLen > 0
00208       if (tolower(bufStart[i]) != word[i]) { // only the buffer side is lower-cased, so word must already be lower case
00209         wordsMatch = false;
00210         break;
00211       }
00212     } // for
00213   }
00214 
00215   return wordsMatch;
00216 } // match

bool SpamUtil::match (const char *bufStart,
    const size_t len,
    const char *word)  [static]
 

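Check whether the first len characters of the string match another string (word). If they match, return true, otherwise return false. Note that the characters of bufStart are converted to lower case before they are compared, so word must already be in lower case. A len of zero never matches.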

Definition at line 170 of file SpamUtil.C.

00173 { // Compare the first len characters of bufStart, lower-cased, with the first len characters of word.
00174   assert( bufStart != 0 && word != 0);
00175 
00176   bool wordsMatch = false;
00177   if (len > 0) { // len == 0 never matches
00178     wordsMatch = true;
00179     for (size_t i = 0; i < len; i++) { // NOTE(review): len is not checked against either string; if both end at the same index before len the loop reads past their terminators
00180       if (tolower(bufStart[i]) != word[i]) { // only the buffer side is lower-cased, so word must already be lower case
00181         wordsMatch = false;
00182         break;
00183       }
00184     } // for
00185   }
00186 
00187   return wordsMatch;
00188 } // match

const char * SpamUtil::skipWhiteSpace (const char *pBuf)  [static]
 

Skip white spaces at the start of a string. The function returns a pointer to the first non-white space character.

Definition at line 89 of file SpamUtil.C.

Referenced by trim().

00090 { // Advance past leading isspace() characters; pBuf is dereferenced without a null check.
00091   while (*pBuf != '\0' && isspace( *pBuf )) {
00092     pBuf++;
00093   }
00094   return pBuf; // first non-space character, or the terminating null when there is none
00095 } // skipWhiteSpace

void SpamUtil::strncpy (char *pDest,
    const char *pSrc,
    size_t destSize)  [static]
 

Copy pSrc into pDest until a null char is encountered or until destSize-1 characters are copied. Null terminate the string, even if the source is longer than the dest. This is slightly different behavior than the POSIX strncpy, which will not always null terminate. Also, this function only inserts a single null. In contrast, POSIX strncpy will pad the dest with nulls when the length of src is less than dest. Finally, another difference is that this function does not return a char* pointer.

Definition at line 74 of file SpamUtil.C.

Referenced by checkLine().

00075 { // Bounded copy that always null terminates and writes exactly one terminator.
00076   const size_t maxCopy = destSize-1; // NOTE(review): destSize == 0 wraps to the largest size_t, leaving the copy effectively unbounded
00077   size_t ix; // declared outside the loop because it is needed for the terminator below
00078   for (ix = 0; ix < maxCopy && pSrc[ix] != '\0'; ix++) {
00079     pDest[ix] = pSrc[ix];
00080   }
00081   pDest[ix] = '\0'; // terminate at the position where copying stopped
00082 } // strncpy

void SpamUtil::toLower (char *dest,
    const char *src,
    size_t size)  [static]
 

Convert a string to lower case. The destination and source may be the same address. The size argument is the size of the destination. The source is assumed to be a null terminated string.

Definition at line 242 of file SpamUtil.C.

00243 { // Lower-case src into dest, writing at most size bytes including the terminator.
00244   // convert to lower case
00245   size_t j; // declared outside the loop because it is needed for the terminator below
00246   for (j = 0; j < size-1 && src[j] != '\0'; j++) { // NOTE(review): size == 0 makes size-1 wrap, which removes the bound
00247     dest[j] = tolower(src[j]);
00248   } // for
00249   dest[j] = '\0'; // terminate at the position where conversion stopped
00250 }

void SpamUtil::toLower (char *dest,
    const char *src)  [static]
 

Convert a string to lower case. The destination and source may be the same address. The source is assumed to be a null terminated string.

Definition at line 258 of file SpamUtil.C.

00259 { // Lower-case all of src into dest; no bound is checked, so dest needs room for strlen(src)+1 bytes.
00260   // convert to lower case
00261   size_t j; // declared outside the loop because it is needed for the terminator below
00262   for (j = 0; src[j] != '\0'; j++) {
00263     dest[j] = tolower(src[j]);
00264   } // for
00265   dest[j] = '\0'; // terminate the converted string
00266 }

void SpamUtil::trim (const char *str)  [static]
 

This function modifies the contents of the string pointed to by the str argument. The result is a string from which the leading and trailing white space have been removed.

Definition at line 119 of file SpamUtil.C.

References skipWhiteSpace().

00120 { // Strip leading and trailing white space in place by shifting the kept characters to the front of str.
00121   if (str != 0) { // a null pointer is ignored
00122     char *start = (char *)skipWhiteSpace( str ); // first non-space character, or the terminator
00123 
00124     char *ptr = (char *)str; // write cursor; NOTE(review): the cast removes const, so str must point to writable storage
00125     if (*start != '\0') { // the string has at least one non-space character
00126       size_t len = strlen( str );
00127       char *end = (char *)&str[len-1]; // last character before the terminator
00128 
00129       while (end > start && isspace( *end )) { // back up over trailing white space
00130         end--;
00131       }
00132       size_t trimLen = (end - start) + 1; // number of characters to keep
00133       for (size_t i = 0; i < trimLen; i++) { // forward copy; ptr starts at or before start, so source bytes are read before being overwritten
00134         *ptr = *start;
00135         start++;
00136         ptr++;
00137       }
00138     }
00139    *ptr = '\0'; // terminate after the kept text; yields an empty string when str was all white space
00140   }
00141 } // trim

const char * SpamUtil::trimEnd (char *str)  [static]
 

Remove white space from the end of a string.

Definition at line 101 of file SpamUtil.C.

00102 { // Truncate str after its last non-space character and return str; null and empty inputs are returned unchanged.
00103   if (str != 0 && str[0] != '\0') {
00104     int len = strlen( str );
00105     int i; // signed so the scan can stop at -1
00106     for (i = len-1; i >=0 && isspace(str[i]); i--) // walk backwards over trailing white space
00107       /* nada */;
00108     str[i+1] = '\0'; // i is -1 when the whole string is white space, giving an empty string
00109   }
00110   return str;
00111 } // trimEnd

const char * SpamUtil::typeToStr (contentType type)  [static]
 

Return the character string that corresponds to the contentType enumeration.

Definition at line 307 of file SpamUtil.C.

00308 { // Map a contentType enumerator to a fixed string literal.
00309   const char *typeName; // assigned on every path of the switch below
00310   
00311   switch (type) {
00312   case BAD_CONT:
00313     typeName = "bad content";
00314     break;
00315   case UNKNOWN:
00316     typeName = "unknown";
00317     break;
00318   case BLANK:
00319     typeName = "blank section";
00320     break;
00321   case HTML:
00322     typeName = "html";
00323     break;
00324   case TEXT:
00325     typeName = "text";
00326     break;
00327   case MULTIPART:
00328     typeName = "multipart";
00329     break;
00330   case IMAGE:
00331     typeName = "image";
00332     break;
00333   case APPLICATION:
00334     typeName = "application";
00335     break;
00336   case AUDIO:
00337     typeName = "audio";
00338     break;
00339   case WINDOZ:
00340     typeName = "windows chars";
00341     break;
00342   case BASE64:
00343     typeName = "base64";
00344     break;
00345   default: // any value that is not one of the named enumerators
00346     typeName = "bad content type";
00347     break;
00348   }
00349   return typeName; // points at a string literal; the caller must not modify or free it
00350 } // typeToStr


The documentation for this class was generated from the following files:
SpamUtil.h
SpamUtil.C
Generated on Sat Mar 27 13:07:38 2004 for Mail Filter by doxygen 1.3.3