libkmime

kmime_codec_qp.cpp

00001 /*  -*- c++ -*-
00002     kmime_codec_qp.cpp
00003 
00004     This file is part of KMime, the KDE internet mail/usenet news message library.
00005     Copyright (c) 2002 Marc Mutz <mutz@kde.org>
00006 
00007     KMime is free software; you can redistribute it and/or modify it
00008     under the terms of the GNU General Public License, version 2, as
00009     published by the Free Software Foundation.
00010 
00011     KMime is distributed in the hope that it will be useful, but
00012     WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     General Public License for more details.
00015 
00016     You should have received a copy of the GNU General Public License
00017     along with this library; if not, write to the Free Software
00018     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00019 
00020     In addition, as a special exception, the copyright holders give
00021     permission to link the code of this library with any edition of
00022     the Qt library by Trolltech AS, Norway (or with modified versions
00023     of Qt that use the same license as Qt), and distribute linked
00024     combinations including the two.  You must obey the GNU General
00025     Public License in all respects for all of the code used other than
00026     Qt.  If you modify this file, you may extend this exception to
00027     your version of the file, but you are not obligated to do so.  If
00028     you do not wish to do so, delete this exception statement from
00029     your version.
00030 */
00031 
00032 #include "kmime_codec_qp.h"
00033 
00034 #include "kmime_util.h"
00035 
00036 #include <kdebug.h>
00037 
00038 #include <cassert>
00039 
00040 using namespace KMime;
00041 
00042 namespace KMime {
00043 
00044 // some helpful functions:
00045 
00046 static inline char binToHex( uchar value ) {
00047   if ( value > 9 )
00048     return value + 'A' - 10;
00049   else
00050     return value + '0';
00051 }
00052 
00053 static inline uchar highNibble( uchar ch ) {
00054   return ch >> 4;
00055 }
00056 
00057 static inline uchar lowNibble( uchar ch ) {
00058   return ch & 0xF;
00059 }
00060 
00061 static inline bool keep( uchar ch ) {
00062   // no CTLs, except HT and not '?'
00063   return !( ch < ' ' && ch != '\t' || ch == '?' );
00064 }
00065 
00066 //
00067 // QuotedPrintableCodec
00068 //
00069 
00070 class QuotedPrintableEncoder : public Encoder {
00071   char mInputBuffer[16];
00072   uchar mCurrentLineLength; // 0..76
00073   uchar mAccu;
00074   uint mInputBufferReadCursor  : 4; // 0..15
00075   uint mInputBufferWriteCursor : 4; // 0..15
00076   enum {
00077     Never, AtBOL, Definitely
00078   } mAccuNeedsEncoding    : 2;
00079   bool mSawLineEnd        : 1;
00080   bool mSawCR             : 1;
00081   bool mFinishing         : 1;
00082   bool mFinished          : 1;
00083 protected:
00084   friend class QuotedPrintableCodec;
00085   QuotedPrintableEncoder( bool withCRLF=false )
00086     : Encoder( withCRLF ), mCurrentLineLength(0), mAccu(0),
00087       mInputBufferReadCursor(0), mInputBufferWriteCursor(0),
00088       mAccuNeedsEncoding(Never),
00089       mSawLineEnd(false), mSawCR(false), mFinishing(false),
00090       mFinished(false) {}
00091 
00092   bool needsEncoding( uchar ch ) {
00093     return ( ch > '~' || ch < ' ' && ch != '\t' || ch == '=' );
00094   }
00095   bool needsEncodingAtEOL( uchar ch ) {
00096     return ( ch == ' ' || ch == '\t' );
00097   }
00098   bool needsEncodingAtBOL( uchar ch ) {
00099     return ( ch == 'F' || ch == '.' || ch == '-' );
00100   }
00101   bool fillInputBuffer( const char* & scursor, const char * const send );
00102   bool processNextChar();
00103   void createOutputBuffer( char* & dcursor, const char * const dend );
00104 public:
00105   virtual ~QuotedPrintableEncoder() {}
00106 
00107   bool encode( const char* & scursor, const char * const send,
00108            char* & dcursor, const char * const dend );
00109 
00110   bool finish( char* & dcursor, const char * const dend );
00111 };
00112 
00113 
00114 class QuotedPrintableDecoder : public Decoder {
00115   const char mEscapeChar;
00116   char mBadChar;
00118   uchar mAccu;
00128   const bool mQEncoding;
00129   bool mInsideHexChar;
00130   bool mFlushing;
00131   bool mExpectLF;
00132   bool mHaveAccu;
00133 protected:
00134   friend class QuotedPrintableCodec;
00135   friend class Rfc2047QEncodingCodec;
00136   friend class Rfc2231EncodingCodec;
00137   QuotedPrintableDecoder( bool withCRLF=false,
00138               bool aQEncoding=false, char aEscapeChar='=' )
00139     : Decoder( withCRLF ),
00140       mEscapeChar(aEscapeChar),
00141       mBadChar(0),
00142       mAccu(0),
00143       mQEncoding(aQEncoding),
00144       mInsideHexChar(false),
00145       mFlushing(false),
00146       mExpectLF(false),
00147       mHaveAccu(false) {}
00148 public:
00149   virtual ~QuotedPrintableDecoder() {}
00150 
00151   bool decode( const char* & scursor, const char * const send,
00152            char* & dcursor, const char * const dend );
00153   // ### really no finishing needed???
00154   bool finish( char* &, const char * const ) { return true; }
00155 };
00156 
00157 
00158 class Rfc2047QEncodingEncoder : public Encoder {
00159   uchar      mAccu;
00160   uchar      mStepNo;
00161   const char mEscapeChar;
00162   bool       mInsideFinishing : 1;
00163 protected:
00164   friend class Rfc2047QEncodingCodec;
00165   friend class Rfc2231EncodingCodec;
00166   Rfc2047QEncodingEncoder( bool withCRLF=false, char aEscapeChar='=' )
00167     : Encoder( withCRLF ),
00168       mAccu(0), mStepNo(0), mEscapeChar( aEscapeChar ),
00169       mInsideFinishing( false )
00170   {
00171     // else an optimization in ::encode might break.
00172     assert( aEscapeChar == '=' || aEscapeChar == '%' );
00173   }
00174 
00175   // this code assumes that isEText( mEscapeChar ) == false!
00176   bool needsEncoding( uchar ch ) {
00177     if ( ch > 'z' ) return true; // {|}~ DEL and 8bit chars need
00178     if ( !isEText( ch ) ) return true; // all but a-zA-Z0-9!/*+- need, too
00179     if ( mEscapeChar == '%' && ( ch == '*' || ch == '/' ) )
00180       return true; // not allowed in rfc2231 encoding
00181     return false;
00182   }
00183 
00184 public:
00185   virtual ~Rfc2047QEncodingEncoder() {}
00186 
00187   bool encode( const char* & scursor, const char * const send,
00188            char* & dcursor, const char * const dend );
00189   bool finish( char* & dcursor, const char * const dend );
00190 };
00191 
// Upper bound for the number of bytes produced when decoding insize
// input bytes. Kept as a free function because it needs no decoder
// state; the maxDecodedSizeFor() methods of the codecs forward to it.
static int QuotedPrintableDecoder_maxDecodedSizeFor( int insize, bool withCRLF ) {
  // Worst case every char is passed through unencoded; with CRLF output
  // every one of them might be a \n that has to become \r\n.
  const int bytesPerInputChar = withCRLF ? 2 : 1;

  // Plus room for a dangling escape char and one accumulated hex digit.
  const int danglingEscape = 2;

  return insize * bytesPerInputChar + danglingEscape;
}
00206 
00207 Encoder * QuotedPrintableCodec::makeEncoder( bool withCRLF ) const {
00208   return new QuotedPrintableEncoder( withCRLF );
00209 }
00210 
00211 Decoder * QuotedPrintableCodec::makeDecoder( bool withCRLF ) const {
00212   return new QuotedPrintableDecoder( withCRLF );
00213 }
00214 
00215 int QuotedPrintableCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
00216     return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
00217 }
00218 
00219 Encoder * Rfc2047QEncodingCodec::makeEncoder( bool withCRLF ) const {
00220   return new Rfc2047QEncodingEncoder( withCRLF );
00221 }
00222 
00223 Decoder * Rfc2047QEncodingCodec::makeDecoder( bool withCRLF ) const {
00224   return new QuotedPrintableDecoder( withCRLF, true );
00225 }
00226 
00227 int Rfc2047QEncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
00228     return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
00229 }
00230 
00231 Encoder * Rfc2231EncodingCodec::makeEncoder( bool withCRLF ) const {
00232   return new Rfc2047QEncodingEncoder( withCRLF, '%' );
00233 }
00234 
00235 Decoder * Rfc2231EncodingCodec::makeDecoder( bool withCRLF ) const {
00236   return new QuotedPrintableDecoder( withCRLF, true, '%' );
00237 }
00238 
00239 int Rfc2231EncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
00240     return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
00241 }
00242 
00243   /********************************************************/
00244   /********************************************************/
00245   /********************************************************/
00246 
00247 bool QuotedPrintableDecoder::decode( const char* & scursor, const char * const send,
00248                      char* & dcursor, const char * const dend ) {
00249   if ( mWithCRLF )
00250     kdWarning() << "CRLF output for decoders isn't yet supported!" << endl;
00251 
00252   while ( scursor != send && dcursor != dend ) {
00253     if ( mFlushing ) {
00254       // we have to flush chars in the aftermath of an decoding
00255       // error. The way to request a flush is to
00256       // - store the offending character in mBadChar and
00257       // - set mFlushing to true.
00258       // The supported cases are (H: hexchar, X: bad char):
00259       // =X, =HX, CR
00260       // mBadChar is only written out if it is not by itself illegal in
00261       // quoted-printable (e.g. CTLs, 8Bits).
00262       // A fast way to suppress mBadChar output is to set it to NUL.
00263       if ( mInsideHexChar ) {
00264     // output '='
00265     *dcursor++ = mEscapeChar;
00266     mInsideHexChar = false;
00267       } else if ( mHaveAccu ) {
00268     // output the high nibble of the accumulator:
00269     *dcursor++ = binToHex( highNibble( mAccu ) );
00270     mHaveAccu = false;
00271     mAccu = 0;
00272       } else {
00273     // output mBadChar
00274     assert( mAccu == 0 );
00275     if ( mBadChar ) {
00276       if ( mBadChar >= '>' && mBadChar <= '~' ||
00277            mBadChar >= '!' && mBadChar <= '<' )
00278         *dcursor++ = mBadChar;
00279       mBadChar = 0;
00280     }
00281     mFlushing = false;
00282       }
00283       continue;
00284     }
00285     assert( mBadChar == 0 );
00286 
00287     uchar ch = *scursor++;
00288     uchar value = 255;
00289 
00290     if ( mExpectLF && ch != '\n' ) {
00291       kdWarning() << "QuotedPrintableDecoder: "
00292     "illegally formed soft linebreak or lonely CR!" << endl;
00293       mInsideHexChar = false;
00294       mExpectLF = false;
00295       assert( mAccu == 0 );
00296     }
00297 
00298     if ( mInsideHexChar ) {
00299       // next char(s) represent nibble instead of itself:
00300       if ( ch <= '9' ) {
00301     if ( ch >= '0' ) {
00302       value = ch - '0';
00303     } else {
00304       switch ( ch ) {
00305       case '\r':
00306         mExpectLF = true;
00307         break;
00308       case '\n':
00309         // soft line break, but only if mAccu is NUL.
00310         if ( !mHaveAccu ) {
00311           mExpectLF = false;
00312           mInsideHexChar = false;
00313           break;
00314         }
00315         // else fall through
00316       default:
00317         kdWarning() << "QuotedPrintableDecoder: "
00318           "illegally formed hex char! Outputting verbatim." << endl;
00319         mBadChar = ch;
00320         mFlushing = true;
00321       }
00322       continue;
00323     }
00324       } else { // ch > '9'
00325     if ( ch <= 'F' ) {
00326       if ( ch >= 'A' ) {
00327         value = 10 + ch - 'A';
00328       } else { // [:-@]
00329         mBadChar = ch;
00330         mFlushing = true;
00331         continue;
00332       }
00333     } else { // ch > 'F'
00334       if ( ch <= 'f' && ch >= 'a' ) {
00335         value = 10 + ch - 'a';
00336       } else {
00337         mBadChar = ch;
00338         mFlushing = true;
00339         continue;
00340       }
00341     }
00342       }
00343 
00344       assert( value < 16 );
00345       assert( mBadChar == 0 );
00346       assert( !mExpectLF );
00347 
00348       if ( mHaveAccu ) {
00349     *dcursor++ = char( mAccu | value );
00350     mAccu = 0;
00351     mHaveAccu = false;
00352     mInsideHexChar = false;
00353       } else {
00354     mHaveAccu = true;
00355     mAccu = value << 4;
00356       }
00357     } else { // not mInsideHexChar
00358       if ( ch <= '~' && ch >= ' ' || ch == '\t' ) {
00359     if ( ch == mEscapeChar ) {
00360       mInsideHexChar = true;
00361     } else if ( mQEncoding && ch == '_' ) {
00362       *dcursor++ = char(0x20);
00363     } else {
00364       *dcursor++ = char(ch);
00365     }
00366       } else if ( ch == '\n' ) {
00367     *dcursor++ = '\n';
00368     mExpectLF = false;
00369       } else if ( ch == '\r' ) {
00370     mExpectLF = true;
00371       } else {
00372     kdWarning() << "QuotedPrintableDecoder: " << ch <<
00373       " illegal character in input stream! Ignoring." << endl;
00374       }
00375     }
00376   }
00377 
00378   return (scursor == send);
00379 }
00380 
00381 bool QuotedPrintableEncoder::fillInputBuffer( const char* & scursor,
00382                           const char * const send ) {
00383   // Don't read more if there's still a tail of a line in the buffer:
00384   if ( mSawLineEnd )
00385     return true;
00386 
00387   // Read until the buffer is full or we have found CRLF or LF (which
00388   // don't end up in the input buffer):
00389   for ( ; ( mInputBufferWriteCursor + 1 ) % 16 != mInputBufferReadCursor
00390       && scursor != send ; mInputBufferWriteCursor++ ) {
00391     char ch = *scursor++;
00392     if ( ch == '\r' ) {
00393       mSawCR = true;
00394     } else if ( ch == '\n' ) {
00395       // remove the CR from the input buffer (if any) and return that
00396       // we found a line ending:
00397       if ( mSawCR ) {
00398     mSawCR = false;
00399     assert( mInputBufferWriteCursor != mInputBufferReadCursor );
00400     mInputBufferWriteCursor--;
00401       }
00402       mSawLineEnd = true;
00403       return true; // saw CRLF or LF
00404     } else {
00405       mSawCR = false;
00406     }
00407     mInputBuffer[ mInputBufferWriteCursor ] = ch;
00408   }
00409   mSawLineEnd = false;
00410   return false; // didn't see a line ending...
00411 }
00412 
00413 bool QuotedPrintableEncoder::processNextChar() {
00414 
00415   // If we process a buffer which doesn't end in a line break, we
00416   // can't process all of it, since the next chars that will be read
00417   // could be a line break. So we empty the buffer only until a fixed
00418   // number of chars is left (except when mFinishing, which means that
00419   // the data doesn't end in newline):
00420   const int minBufferFillWithoutLineEnd = 4;
00421 
00422   assert( mOutputBufferCursor == 0 );
00423 
00424   int bufferFill = int(mInputBufferWriteCursor) - int(mInputBufferReadCursor) ;
00425   if ( bufferFill < 0 )
00426     bufferFill += 16;
00427 
00428   assert( bufferFill >=0 && bufferFill <= 15 );
00429 
00430   if ( !mFinishing && !mSawLineEnd &&
00431        bufferFill < minBufferFillWithoutLineEnd )
00432     return false;
00433 
00434   // buffer is empty, return false:
00435   if ( mInputBufferReadCursor == mInputBufferWriteCursor )
00436     return false;
00437 
00438   // Real processing goes here:
00439   mAccu = mInputBuffer[ mInputBufferReadCursor++ ];
00440   if ( needsEncoding( mAccu ) ) // always needs encoding or
00441     mAccuNeedsEncoding = Definitely;
00442   else if ( ( mSawLineEnd || mFinishing )  // needs encoding at end of line
00443         && bufferFill == 1             // or end of buffer
00444         && needsEncodingAtEOL( mAccu ) )
00445     mAccuNeedsEncoding = Definitely;
00446   else if ( needsEncodingAtBOL( mAccu ) )
00447     mAccuNeedsEncoding = AtBOL;
00448   else
00449     // never needs encoding
00450     mAccuNeedsEncoding = Never;
00451 
00452   return true;
00453 }
00454 
00455 // Outputs processed (verbatim or hex-encoded) chars and inserts soft
00456 // line breaks as necessary. Depends on processNextChar's directions
00457 // on whether or not to encode the current char, and whether or not
00458 // the current char is the last one in it's input line:
00459 void QuotedPrintableEncoder::createOutputBuffer( char* & dcursor,
00460                          const char * const dend )
00461 {
00462   const int maxLineLength = 76; // rfc 2045
00463 
00464   assert( mOutputBufferCursor == 0 );
00465 
00466   bool lastOneOnThisLine = mSawLineEnd
00467     && mInputBufferReadCursor == mInputBufferWriteCursor;
00468 
00469   int neededSpace = 1;
00470   if ( mAccuNeedsEncoding == Definitely)
00471     neededSpace = 3;
00472 
00473   // reserve space for the soft hyphen (=)
00474   if ( !lastOneOnThisLine )
00475     neededSpace++;
00476 
00477   if ( mCurrentLineLength > maxLineLength - neededSpace ) {
00478     // current line too short, insert soft line break:
00479     write( '=', dcursor, dend );
00480     writeCRLF( dcursor, dend );
00481     mCurrentLineLength = 0;
00482   }
00483 
00484   if ( Never == mAccuNeedsEncoding ||
00485        AtBOL == mAccuNeedsEncoding && mCurrentLineLength != 0 ) {
00486     write( mAccu, dcursor, dend );
00487     mCurrentLineLength++;
00488   } else {
00489     write( '=', dcursor, dend );
00490     write( binToHex( highNibble( mAccu ) ), dcursor, dend );
00491     write( binToHex( lowNibble( mAccu ) ), dcursor, dend );
00492     mCurrentLineLength += 3;
00493   }
00494 }
00495 
00496 
00497 bool QuotedPrintableEncoder::encode( const char* & scursor, const char * const send,
00498                      char* & dcursor, const char * const dend )
00499 {
00500   // support probing by the caller:
00501   if ( mFinishing ) return true;
00502 
00503   while ( scursor != send && dcursor != dend ) {
00504     if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) )
00505       return (scursor == send);
00506 
00507     assert( mOutputBufferCursor == 0 );
00508 
00509     // fill input buffer until eol has been reached or until the
00510     // buffer is full, whatever comes first:
00511     fillInputBuffer( scursor, send );
00512 
00513     if ( processNextChar() )
00514       // there was one...
00515       createOutputBuffer( dcursor, dend );
00516     else if ( mSawLineEnd &&
00517           mInputBufferWriteCursor == mInputBufferReadCursor ) {
00518       // load a hard line break into output buffer:
00519       writeCRLF( dcursor, dend );
00520       // signal fillInputBuffer() we are ready for the next line:
00521       mSawLineEnd = false;
00522       mCurrentLineLength = 0;
00523     } else
00524       // we are supposedly finished with this input block:
00525       break;
00526   }
00527 
00528   // make sure we write as much as possible and don't stop _writing_
00529   // just because we have no more _input_:
00530   if ( mOutputBufferCursor ) flushOutputBuffer( dcursor, dend );
00531 
00532   return (scursor == send);
00533 
00534 } // encode
00535 
00536 bool QuotedPrintableEncoder::finish( char* & dcursor,
00537                      const char * const dend ) {
00538   mFinishing = true;
00539 
00540   if ( mFinished )
00541     return flushOutputBuffer( dcursor, dend );
00542 
00543   while ( dcursor != dend ) {
00544     if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) )
00545       return false;
00546 
00547     assert( mOutputBufferCursor == 0 );
00548 
00549     if ( processNextChar() )
00550       // there was one...
00551       createOutputBuffer( dcursor, dend );
00552     else if ( mSawLineEnd &&
00553           mInputBufferWriteCursor == mInputBufferReadCursor ) {
00554       // load a hard line break into output buffer:
00555       writeCRLF( dcursor, dend );
00556       mSawLineEnd = false;
00557       mCurrentLineLength = 0;
00558     } else {
00559       mFinished = true;
00560       return flushOutputBuffer( dcursor, dend );
00561     }
00562   }
00563 
00564   return mFinished && !mOutputBufferCursor;
00565 
00566 } // finish
00567 
00568 
00569 bool Rfc2047QEncodingEncoder::encode( const char* & scursor, const char * const send,
00570                       char* & dcursor, const char * const dend )
00571 {
00572   if ( mInsideFinishing ) return true;
00573 
00574   while ( scursor != send && dcursor != dend ) {
00575     uchar value;
00576     switch ( mStepNo ) {
00577     case 0:
00578       // read the next char and decide if and how do encode:
00579       mAccu = *scursor++;
00580       if ( !needsEncoding( mAccu ) ) {
00581     *dcursor++ = char(mAccu);
00582       } else if ( mEscapeChar == '=' && mAccu == 0x20 ) {
00583     // shortcut encoding for 0x20 (latin-1/us-ascii SPACE)
00584     // (not for rfc2231 encoding)
00585     *dcursor++ = '_';
00586       } else {
00587     // needs =XY encoding - write escape char:
00588     *dcursor++ = mEscapeChar;
00589     mStepNo = 1;
00590       }
00591       continue;
00592     case 1:
00593       // extract hi-nibble:
00594       value = highNibble(mAccu);
00595       mStepNo = 2;
00596       break;
00597     case 2:
00598       // extract lo-nibble:
00599       value = lowNibble(mAccu);
00600       mStepNo = 0;
00601       break;
00602     default: assert( 0 );
00603     }
00604 
00605     // and write:
00606     *dcursor++ = binToHex( value );
00607   }
00608 
00609   return (scursor == send);
00610 } // encode
00611 
00612 #include <qstring.h>
00613 
00614 bool Rfc2047QEncodingEncoder::finish( char* & dcursor, const char * const dend ) {
00615   mInsideFinishing = true;
00616 
00617   // write the last bits of mAccu, if any:
00618   while ( mStepNo != 0 && dcursor != dend ) {
00619     uchar value;
00620     switch ( mStepNo ) {
00621     case 1:
00622       // extract hi-nibble:
00623       value = highNibble(mAccu);
00624       mStepNo = 2;
00625       break;
00626     case 2:
00627       // extract lo-nibble:
00628       value = lowNibble(mAccu);
00629       mStepNo = 0;
00630       break;
00631     default: assert( 0 );
00632     }
00633 
00634     // and write:
00635     *dcursor++ = binToHex( value );
00636   }
00637 
00638   return mStepNo == 0;
00639 }
00640 
00641 
00642 
00643 
00644 } // namespace KMime
KDE Home | KDE Accessibility Home | Description of Access Keys