Main Page | Compound List | File List | Compound Members | File Members

MailBody.C

Go to the documentation of this file.
00001 
00037 #include "MailBody.h"
00038 
00045 MailBody::MailBody( SpamParameters &param, HeaderInfo &headInfo ) : 
00046   mParams( param ), mHeadInfo( headInfo )
00047 {
00048   log = pLogger->getLogger("MailBody");
00049 } // MailBody
00050 
00051 
00072 MailBody::lineType MailBody::getLine(char *buf, 
00073                                      const size_t bufSize, 
00074                                      FILE *fp, 
00075                                      const char *boundary)
00076 {
00077   lineType type = EndOfFile;
00078 
00079   char *pStr;
00080   // skip any blank lines
00081   do {
00082      pStr = fgets(buf, bufSize, fp);
00083   } while (pStr != 0 && SpamUtil().isBlankLine(buf));
00084 
00085   if (pStr != 0) {
00086 
00087     type = LINE;
00088     if (boundary != 0) {
00089       if (buf[0] == '-' && buf[1] == '-') {
00090         if (strstr(buf+2, boundary) != 0) {
00091           type = BOUNDARY;
00092         }
00093       }
00094     }
00095 
00096     if (type != BOUNDARY) {
00097       SpamUtil().toLower(buf, buf, bufSize);
00098     }
00099   }
00100 
00101   return type;
00102 } // getLine;
00103 
00104 
00112 MailBody::lineType MailBody::getHtmlLine(char *buf, 
00113                                          const size_t bufSize, 
00114                                          FILE *fp, 
00115                                          const char *boundary)
00116 {
00117   char *tmpBuf = new char[ bufSize ];
00118   lineType type = EndOfFile;
00119 
00120   *buf = '\0';
00121 
00122   char *pStr;
00123   // skip any blank lines
00124   do {
00125      pStr = fgets(tmpBuf, bufSize, fp);
00126   } while (pStr != 0 && SpamUtil().isBlankLine(tmpBuf));
00127 
00128   if (pStr != 0) {
00129 
00130     type = LINE;
00131     if (boundary != 0) {
00132       if (tmpBuf[0] == '-' && tmpBuf[1] == '-') {
00133         if (strstr(tmpBuf+2, boundary) != 0) {
00134           type = BOUNDARY;
00135         }
00136       }
00137     }
00138 
00139     if (type != BOUNDARY) {
00140       size_t i = 0; // buf index
00141       bool copyChar = true;
00142       for (size_t j = 0; j < bufSize-1 && tmpBuf[j] != '\0'; j++) {
00143         char ch = tmpBuf[j];
00144         if (ch == '<') {
00145           copyChar = false;
00146         }
00147         else if (ch == '>') {
00148           copyChar = true;
00149         }
00150         else if (copyChar) {
00151           buf[i] = tolower(ch);
00152           i++;
00153         }
00154       } // for
00155       buf[i] = '\0';
00156     }
00157   }
00158   delete [] tmpBuf;
00159 
00160   return type;
00161 } // getHtmlLine
00162 
00163 
00170 MailBody::lineType MailBody::findSection( const char *boundary, FILE *fp )
00171 {
00172 
00173   const size_t BUF_SIZE = 128;
00174   char buf[ BUF_SIZE ];
00175 
00176   lineType ty;
00177 
00178   while ((ty = getLine(buf, sizeof(buf), fp, boundary )) != MailBody::EndOfFile) {
00179     if (ty == BOUNDARY) {
00180       break;
00181     }
00182   }
00183   
00184   return ty;
00185 } // findSection
00186 
00187 
00188 
00204 SpamUtil::contentType MailBody::classifyMailSection(FILE *fp)
00205 {
00206   SpamUtil::contentType type = SpamUtil::UNKNOWN;
00207   char buf[128];
00208   char *bufPtr;
00209     
00210   // get the line following the boundary line (may be blank)
00211   if ((bufPtr = fgets(buf, sizeof(buf), fp)) != 0) {
00212     if (SpamUtil().isBlankLine(buf)) {
00213       type = SpamUtil::BLANK;
00214     }
00215     else {
00216       static const char *CONTENT_TYPE = "Content-Type:";
00217       static const size_t content_typeLen = strlen( CONTENT_TYPE );
00218       const char *ptr;
00219 
00220       // see if we can find Content-Type:, if not, its an empty section
00221       if ((ptr = strstr(buf, CONTENT_TYPE)) != 0) {
00222         type = SpamUtil().classifySection( ptr+content_typeLen );
00223         // get the line after Content-Type
00224         if ((bufPtr = fgets(buf, sizeof(buf), fp)) != 0) {
00225           // if there is a colon, it's probably a Content-Transfer-Encoding line
00226           if (SpamUtil().findColon( bufPtr ) == 0) {
00227             // OK, no colon.  If the type was TEXT look for Windows char set.
00228             if (type == SpamUtil::TEXT) {
00229               static const char *CHARSET = "charset";
00230               static const size_t charsetLen = strlen( CHARSET );
00231               if ((ptr = strstr(buf, "charset")) != 0) {
00232                 if (strstr(ptr+charsetLen, "Windows") != 0) {
00233                   type = SpamUtil::WINDOZ;
00234                 } // Windows string
00235               } // charset string
00236             } // type == TEXT
00237             bufPtr = fgets(buf, sizeof(buf), fp);
00238           } // find colon
00239           if (bufPtr != 0) {
00240             // look for the base64 in Content-Transfer-Encoding: base64
00241             if (strstr(buf, "base64") != 0) {
00242               type = SpamUtil::BASE64;
00243             }
00244           }
00245         } // fgets != 0
00246       } // Content-Type string
00247     }
00248   } // fgets != 0
00249   else {
00250     log.log(Logger::ERROR, "classifyMailSection", "line expected after boundary");
00251   }
00252 
00253   return type;
00254 } // classifyMailSection
00255 
00256 
00257 
00262 void MailBody::mailBodyMsg( MailFilter::classification klass,
00263                                 const char *foundStr,
00264                                 const char *funcName )
00265 {
00266   char msg[256];
00267   char startMsg[64];
00268   const char *pMsg = msg;
00269 
00270   const char *klassStr = "email";
00271   if (klass == MailFilter::SUSPECT) {
00272     klassStr = "suspect";
00273   }
00274   else if (klass == MailFilter::GARBAGE) {
00275     klassStr = "garbage";
00276   }
00277   sprintf(startMsg, "email classified as %s", klassStr);
00278   if (foundStr[0] != '\0') {
00279     sprintf(msg, "%s, found \"%s\"", startMsg, foundStr );
00280   }
00281   else {
00282     pMsg = startMsg;
00283   }
00284   log.log(Logger::DEBUG, funcName, pMsg );
00285 } // mailBodyMsg
00286 
00287 
00288 
00346 MailFilter::classification MailBody::processBySection(const char *boundary, 
00347                                                       FILE *fp)
00348 {
00349   MailFilter::classification klass = MailFilter::UNKNOWN;
00350   log.log(Logger::DEBUG, "processBySection", "enter");
00351 
00352   log.log(Logger::DEBUG, "processBySection", "processing first section");
00353 
00354   if (findSection(boundary, fp) == BOUNDARY) {
00355     char foundStr[128];
00356     foundStr[0] = '\0';
00357 
00358     log.log(Logger::DEBUG, "processBySection", "found boundary");
00359 
00360     SpamUtil::contentType type = classifyMailSection(fp);
00361     if (type == SpamUtil::TEXT) {
00362       char buf[4096];
00363 
00364       bool foundNonBlankLine = false;
00365       MailBody::lineType lineTy;
00366       while ((lineTy = getLine(buf, sizeof(buf), fp, boundary)) == LINE) {
00367         klass = SpamUtil().checkLine(buf, 
00368                                      mParams,
00369                                      foundStr,
00370                                      sizeof(foundStr));
00371         foundNonBlankLine = true;
00372         if (klass != MailFilter::UNKNOWN) {
00373           mHeadInfo.reason(foundStr);
00374           break;
00375         }
00376       } // while
00377 
00378       // The text section was empty, so it is probably spam, since 
00379       // legitimate email usually provides a text version and an HTML
00380       // version.
00381       if (! foundNonBlankLine) {
00382         log.log(Logger::DEBUG, "processBySection", "found blank text section");
00383         klass = MailFilter::SUSPECT;
00384       }
00385       else {
00386         char msg[128];
00387         sprintf(msg, "first section type = %s", SpamUtil().typeToStr( type ));
00388         log.log(Logger::DEBUG, "processBySection", msg );
00389       }
00390 
00391       // if we don't know it's spam or garbage, check the type of the
00392       // section that follows the text section (it may, for example, be
00393       // a base64 section).
00394       if (lineTy == BOUNDARY) {
00395         log.log(Logger::DEBUG, "processBySection", "found second section boundary" );
00396         SpamUtil::contentType secondSecType = classifyMailSection(fp);
00397         type = secondSecType;
00398         char msg[128];
00399         sprintf(msg, "second section type = %s", SpamUtil().typeToStr( secondSecType ));
00400         log.log(Logger::DEBUG, "processBySection", msg );
00401 
00402         if (secondSecType == SpamUtil::HTML && (foundNonBlankLine)) {
00403           // There was a non-blank text section.  Some spammers will fill
00404           // in text from on-line books in this section and then include
00405           // the spam in the HTML section in an attempt to fool spam filters
00406           // (especially baysian filters).  So check for spam words here as well.
00407           log.log(Logger::DEBUG, "processBySection", "processing HTML section" );
00408           while ((lineTy = getHtmlLine(buf, sizeof(buf), fp, boundary)) == LINE) {
00409             klass = SpamUtil().checkLine(buf, 
00410                                          mParams,
00411                                          foundStr,
00412                                          sizeof(foundStr)); // true means HTML section
00413             if (klass != MailFilter::UNKNOWN) {
00414               mHeadInfo.reason(foundStr);
00415               break;
00416             }
00417           } // while
00418         } // if section is HTML
00419 
00420       }
00421     }
00422     else if (type == SpamUtil::HTML || 
00423              type == SpamUtil::BLANK || 
00424              type == SpamUtil::WINDOZ) {
00425       // We know it's suspect, the only question now is, is it garbage?
00426       klass = MailFilter::SUSPECT;
00427       if (type == SpamUtil::HTML) {
00428         log.log(Logger::DEBUG, "processBySection", "begins with HTML section");
00429       }
00430       else if (type == SpamUtil::BLANK) {
00431         log.log(Logger::DEBUG, "processBySection", "found boundary, no Content-Type");
00432       }
00433       if (findSection(boundary, fp) == BOUNDARY) {
00434         type = classifyMailSection(fp);
00435       }
00436     }
00437 
00438     if (type == SpamUtil::WINDOZ || 
00439         type == SpamUtil::IMAGE  ||
00440         type == SpamUtil::AUDIO  ||
00441         type == SpamUtil::MULTIPART) {
00442       char msg[128];
00443       sprintf(msg, "found section type %s", SpamUtil().typeToStr( type ) );
00444       log.log(Logger::DEBUG, "processBySection", msg );
00445       klass = MailFilter::SUSPECT;
00446     }
00447     else if (type == SpamUtil::BASE64) {
00448       log.log(Logger::DEBUG, "processBySection", "found base64 section");
00449       if (mParams.hasFlag("kill_base64")) {
00450         klass = MailFilter::GARBAGE;
00451         mHeadInfo.reason("found base64 encoding");
00452       }
00453       else {
00454         klass = MailFilter::SUSPECT;
00455       }
00456     }
00457     // if the email class is still "UNKNOWN" then we assume it is 
00458     // email.
00459     if (klass == MailFilter::UNKNOWN) {
00460       klass = MailFilter::EMAIL;
00461     }
00462 
00463     mailBodyMsg( klass, foundStr, "processBySection");
00464   }
00465   else {
00466     log.log(Logger::ERROR, "processBySection", "boundary not found");
00467   }
00468 
00469   log.log(Logger::DEBUG, "processBySection", "exit");
00470   return klass;
00471 } // processBySection
00472 
00473 
00480 MailFilter::classification MailBody::processTextBody( FILE *fp )
00481 {
00482   MailFilter::classification klass = MailFilter::UNKNOWN;
00483   log.log(Logger::DEBUG, "processTextBody", "enter");
00484   char buf[256];
00485   char foundStr[128];
00486 
00487   foundStr[0] = '\0';
00488   MailBody::lineType lineTy;
00489   while ((lineTy = getLine(buf, sizeof(buf), fp, 0)) == LINE) {
00490     klass = SpamUtil().checkLine(buf, 
00491                                  mParams,
00492                                  foundStr,
00493                                  sizeof(foundStr));
00494     if (klass != MailFilter::UNKNOWN) {
00495       break;
00496     }
00497   } // while
00498 
00499   if (klass != MailFilter::UNKNOWN) {
00500     mHeadInfo.reason( foundStr );
00501   }
00502 
00503   // if the email class is still "UNKNOWN" then we assume it is 
00504   // email.
00505   if (klass == MailFilter::UNKNOWN) {
00506     klass = MailFilter::EMAIL;
00507   }
00508   mailBodyMsg( klass, foundStr, "processTextBody");
00509   
00510   log.log(Logger::DEBUG, "processTextBody", "exit");
00511   return klass;
00512 } // processTextBody
00513 
00514 
00521 MailFilter::classification MailBody::checkBody(const char *boundary, FILE *fp)
00522 {
00523   MailFilter::classification klass = MailFilter::UNKNOWN;
00524   if (! feof(fp)) {
00525     if (boundary != 0 && boundary[0] != '\0') {
00526       klass = processBySection( boundary, fp );
00527     }
00528     else {
00529       klass = processTextBody( fp );
00530     }
00531   }
00532   return klass;
00533 } // checkBody

Generated on Sat Mar 27 13:07:38 2004 for Mail Filter by doxygen 1.3.3