libkmime

kmime_charfreq.cpp

00001 /*
00002     kmime_charfreq.cpp
00003 
00004     KMime, the KDE internet mail/usenet news message library.
00005     Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
00006 
00007     This program is free software; you can redistribute it and/or modify
00008     it under the terms of the GNU General Public License as published by
00009     the Free Software Foundation; version 2 of the License.
00010     You should have received a copy of the GNU General Public License
00011     along with this program; if not, write to the Free Software Foundation,
00012     Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
00013 */
00014 
00015 #include "kmime_charfreq.h"
00016 
00017 namespace KMime {
00018 
00019 CharFreq::CharFreq( const QByteArray & buf )
00020   : NUL(0),
00021     CTL(0),
00022     CR(0), LF(0),
00023     CRLF(0),
00024     printable(0),
00025     eightBit(0),
00026     total(0),
00027     lineMin(0xffffffff),
00028     lineMax(0),
00029     mTrailingWS(false),
00030     mLeadingFrom(false)
00031 {
00032   if ( !buf.isEmpty() )
00033     count( buf.data(), buf.size() );
00034 }
00035 
00036 CharFreq::CharFreq( const char * buf, size_t len )
00037   : NUL(0),
00038     CTL(0),
00039     CR(0), LF(0),
00040     CRLF(0),
00041     printable(0),
00042     eightBit(0),
00043     total(0),
00044     lineMin(0xffffffff),
00045     lineMax(0),
00046     mTrailingWS(false),
00047     mLeadingFrom(false)
00048 {
00049   if ( buf && len > 0 )
00050     count( buf, len );
00051 }
00052 
/** Returns true if @p ch is a horizontal tab or a space. */
static inline bool isWS( char ch )
{
  switch ( ch ) {
  case ' ':
  case '\t':
    return true;
  default:
    return false;
  }
}
00054 
00055 void CharFreq::count( const char * it, size_t len ) {
00056 
00057   const char * end = it + len;
00058   uint currentLineLength = 0;
00059   // initialize the prevChar with LF so that From_ detection works w/o
00060   // special-casing:
00061   char prevChar = '\n';
00062   char prevPrevChar = 0;
00063 
00064   for ( ; it != end ; ++it ) {
00065     ++currentLineLength;
00066     switch ( *it ) {
00067     case '\0': ++NUL; break;
00068     case '\r': ++CR;  break;
00069     case '\n': ++LF;
00070       if ( prevChar == '\r' ) { --currentLineLength; ++CRLF; }
00071       if ( currentLineLength >= lineMax ) lineMax = currentLineLength-1;
00072       if ( currentLineLength <= lineMin ) lineMin = currentLineLength-1;
00073       if ( !mTrailingWS )
00074     if ( isWS( prevChar ) || ( prevChar == '\r' && isWS( prevPrevChar ) ) )
00075       mTrailingWS = true;
00076       currentLineLength = 0;
00077       break;
00078     case 'F': // check for lines starting with From_ if not found already:
00079       if ( !mLeadingFrom )
00080     if ( prevChar == '\n' && end - it >= 5 && !qstrncmp( "From ", it, 5 ) )
00081       mLeadingFrom = true;
00082       ++printable;
00083       break;
00084     default:
00085       {
00086     uchar c = *it;
00087     if ( c == '\t' || c >= ' ' && c <= '~' )
00088       ++printable;
00089     else if ( c == 127 || c < ' ' )
00090       ++CTL;
00091     else
00092       ++eightBit;
00093       }
00094     }
00095     prevPrevChar = prevChar;
00096     prevChar = *it;
00097   }
00098 
00099   // consider the length of the last line
00100   if ( currentLineLength >= lineMax ) lineMax = currentLineLength;
00101   if ( currentLineLength <= lineMin ) lineMin = currentLineLength;
00102 
00103   // check whether the last character is tab or space
00104   if ( isWS( prevChar ) )
00105     mTrailingWS = true;
00106 
00107   total = len;
00108 }
00109 
00110 bool CharFreq::isEightBitData() const {
00111   return type() == EightBitData;
00112 }
00113 
00114 bool CharFreq::isEightBitText() const {
00115   return type() == EightBitText;
00116 }
00117 
00118 bool CharFreq::isSevenBitData() const {
00119   return type() == SevenBitData;
00120 }
00121 
00122 bool CharFreq::isSevenBitText() const {
00123   return type() == SevenBitText;
00124 }
00125 
00126 bool CharFreq::hasTrailingWhitespace() const {
00127   return mTrailingWS;
00128 }
00129 
00130 bool CharFreq::hasLeadingFrom() const {
00131   return mLeadingFrom;
00132 }
00133 
00134 CharFreq::Type CharFreq::type() const {
00135 #if 0
00136   qDebug( "Total: %d; NUL: %d; CTL: %d;\n"
00137       "CR: %d; LF: %d; CRLF: %d;\n"
00138       "lineMin: %d; lineMax: %d;\n"
00139       "printable: %d; eightBit: %d;\n"
00140           "trailing whitespace: %s;\n"
00141           "leading 'From ': %s;\n",
00142       total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax,
00143       printable, eightBit,
00144       mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" );
00145 #endif
00146   if ( NUL ) // must be binary
00147     return Binary;
00148 
00149   // doesn't contain NUL's:
00150   if ( eightBit ) {
00151     if ( lineMax > 988 ) return EightBitData; // not allowed in 8bit
00152     if ( CR != CRLF || controlCodesRatio() > 0.2 ) return EightBitData;
00153     return EightBitText;
00154   }
00155 
00156   // doesn't contain NUL's, nor 8bit chars:
00157   if ( lineMax > 988 ) return SevenBitData;
00158   if ( CR != CRLF || controlCodesRatio() > 0.2 ) return SevenBitData;
00159 
00160   // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars:
00161   return SevenBitText;
00162 }
00163 
00164 float CharFreq::printableRatio() const {
00165   if ( total ) return float(printable) / float(total);
00166   else         return 0;
00167 }
00168 
00169 float CharFreq::controlCodesRatio() const {
00170   if ( total ) return float(CTL) / float(total);
00171   else         return 0;
00172 }
00173 
00174 } // namespace KMime
00175 
00176 
KDE Home | KDE Accessibility Home | Description of Access Keys