libkdepim

qutf7codec.cpp

00001 /*
00002   qutf7codec.cpp
00003 
00004   A QTextCodec for UTF-7 (rfc2152).
00005   Copyright (c) 2001 Marc Mutz <mutz@kde.org>
00006   See file COPYING for details
00007 
00008   This program is free software; you can redistribute it and/or modify
00009   it under the terms of the GNU General Public License, version 2.0,
00010   as published by the Free Software Foundation.
00011 
00012   You should have received a copy of the GNU General Public License
00013   along with this program; if not, write to the Free Software
00014   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00015   02110-1301, US
00016 
00017   As a special exception, permission is granted to use this plugin
00018   with any version of Qt by TrollTech AS, Norway. In this case, the
00019   use of this plugin doesn't cause the resulting executable to be
00020   covered by the GNU General Public License.
00021   This exception does not however invalidate any other reasons why the
00022   executable file might be covered by the GNU General Public License.
00023 */
00024 
00025 
00026 #include "qutf7codec.h"
00027 
00028 #ifndef QT_NO_TEXTCODEC
00029 
00030 int QUtf7Codec::mibEnum() const {
00031   return 1012;
00032 }
00033 
00034 int QStrictUtf7Codec::mibEnum() const {
00035   return -1012;
00036 }
00037 
00038 const char* QUtf7Codec::name() const {
00039   return "UTF-7";
00040 }
00041 
00042 const char* QStrictUtf7Codec::name() const {
00043   return "X-QT-UTF-7-STRICT";
00044 }
00045 
00046 const char* QUtf7Codec::mimeName() const {
00047   return "UTF-7";
00048 }
00049 
00050 bool QUtf7Codec::canEncode( QChar ) const {
00051   return TRUE;
00052 }
00053 
00054 bool QUtf7Codec::canEncode( const QString & ) const {
00055   return TRUE;
00056 }
00057 
//
// Character-class bitmaps for the US-ASCII range.
//
// Every table is 16 bytes = 128 bits wide. The bit belonging to
// character c is bit (0x80 >> (c % 8)) of byte (c / 8), i.e. the
// most significant bit of byte 0 stands for '\0'.
// The tables are only ever read, hence const.
//

// The base64 alphabet: 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.
static const unsigned char base64Set[] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x00, 0x11, 0xFF, 0xC0, // ' ' ... '?'
  0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
  0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL
};

// Those base64 characters whose 6-bit value has its two
// least-significant bits cleared (value % 4 == 0).
static const unsigned char base64SetWithLastTwoBitsZero[] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x00, 0x00, 0x88, 0x80, // ' ' ... '?'
  0x44, 0x44, 0x44, 0x40, // '@' ... '_'
  0x11, 0x11, 0x11, 0x00  // '`' ... DEL
};

// Letters, digits and the punctuation ' ( ) , - . / : ?
static const unsigned char directSet[] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x01, 0xCF, 0xFF, 0xE1, // ' ' ... '?'
  0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
  0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL
};

// The punctuation ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
static const unsigned char optDirectSet[] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x7E, 0x20, 0x00, 0x1E, // ' ' ... '?'
  0x80, 0x00, 0x00, 0x17, // '@' ... '_'
  0x80, 0x00, 0x00, 0x1C  // '`' ... DEL
};

// Tests whether "ch" is a member of the 128-bit bitmap "set".
// Characters outside US-ASCII (ch > 127) have no bit in a 16-byte
// table, so they are reported as non-members instead of reading
// past the end of the table.
static inline bool isOfSet(unsigned char ch, const unsigned char* set) {
  if ( ch > 127 )
    return false;
  return ( set[ ch/8 ] & (0x80 >> ( ch%8 )) ) != 0;
}
00089 
00090 int QUtf7Codec::heuristicContentMatch(const char* chars, int len) const
00091 {
00092   int stepNo = 0;
00093   int i;
00094   bool shifted = FALSE;
00095   bool rightAfterEscape = FALSE;
00096   bool onlyNullBitsSinceLastBoundary = TRUE;
00097   for ( i = 0; i < len ; i++ ) {
00098     if ((unsigned char)chars[i] >= 128) // 8bit chars not allowed.
00099       break;
00100     if (shifted) {
00101       if ( isOfSet(chars[i],base64Set) ) {
00102     switch (stepNo) {
00103     case 0:
00104       onlyNullBitsSinceLastBoundary = TRUE;
00105       break;
00106     case 3:
00107       onlyNullBitsSinceLastBoundary
00108         = isOfSet(chars[i],base64SetWithLastTwoBitsZero);
00109       break;
00110     case 6:
00111       onlyNullBitsSinceLastBoundary
00112         = ( chars[i] == 'A' || chars[i] == 'Q' ||
00113         chars[i] == 'g' || chars[i] == 'w' );
00114       break;
00115     default:
00116        onlyNullBitsSinceLastBoundary
00117          = onlyNullBitsSinceLastBoundary && (chars[i] == 'A');
00118     }
00119     stepNo = (stepNo + 1) % 8;
00120     rightAfterEscape = FALSE;
00121       } else {
00122     if (rightAfterEscape && chars[i] != '-')
00123       break; // a '+' must be followed by '-' or a base64 char
00124     if (!onlyNullBitsSinceLastBoundary)
00125       break; // non-zero bits in the tail of the base64 encoding
00126     shifted = FALSE;
00127     stepNo = 0;
00128       }
00129     } else {
00130       if (chars[i] == '+') {
00131     shifted = TRUE;
00132     rightAfterEscape = TRUE;
00133       }
00134     }
00135   }
00136   return i;
00137 }
00138 
00139 class QUtf7Decoder : public QTextDecoder {
00140   // the storage for our unicode char until it's finished
00141   ushort uc;
00142   // the state of the base64 decoding
00143   // can be 0 (just finished three unicode chars)
00144   //        1 (have the upper  6 bits of uc already)
00145   //        2 (have the upper 12 bits of uc already)
00146   //        3 (have the upper  2 bits of uc already)
00147   // ..........
00148   //        7 (have the upper 10 bits of uc already)
00149   //   =>   n (have the upper (n * 6) % 16 bits of uc already)
00150   // "stepNo" cycles through all it's values every three
00151   // unicode chars.
00152   char stepNo;
00153   // remembers if we are in shifted-sequence mode
00154   bool shifted;
00155   // remembers if we're just after the initial '+'
00156   // of a shifted-sequence.
00157   bool rightAfterEscape;
00158 public:
00159   QUtf7Decoder() : uc(0), stepNo(0), shifted(FALSE), rightAfterEscape(FALSE)
00160   {
00161   }
00162 
00163 private:
00164   inline void resetParser()
00165   {
00166     uc = 0;
00167     stepNo = 0;
00168     shifted = FALSE;
00169     rightAfterEscape = FALSE;
00170   }
00171 
00172 public:
00173   QString toUnicode(const char* chars, int len)
00174   {
00175     QString result = "";
00176     for (int i=0; i<len; i++) {
00177       uchar ch = chars[i];
00178 
00179       //
00180       // check for 8bit char's:
00181       // 
00182       if ( ch > 127 ) {
00183     qWarning("QUtf7Decoder: 8bit char found in input. "
00184          "Parser has been re-initialized!");
00185     resetParser();
00186     result += QChar::replacement;
00187     continue;
00188       }
00189 
00190       if (shifted) { // in shifted mode
00191 
00192     //
00193     // first, we check specialities that only occur
00194     // right after the escaping '+':
00195     //
00196     if ( rightAfterEscape && ch == '-' ) {
00197       // a "+-" sequence is a short-circuit encoding
00198       // for just '+':
00199       resetParser();
00200       result += QChar('+');
00201       // we're already done for this "ch", so
00202       continue;
00203     }
00204 
00205     //
00206     // Here we're going to extract the bits represented by "ch":
00207     //
00208     ushort bits;
00209     if ( ch >= 'A' && ch <= 'Z' ) {
00210       bits = ch - 'A';
00211     } else if ( ch >= 'a' && ch <= 'z' ) {
00212       bits = ch - 'a' + 26;
00213     } else if ( ch >= '0' && ch <= '9' ) {
00214       bits = ch - '0' + 52;
00215     } else if ( ch == '+' ) {
00216       bits = 62;
00217     } else if ( ch == '/' ) {
00218       bits = 63;
00219     } else {
00220       bits = 0; // keep compiler happy
00221 
00222       //
00223       // ch is not of the base64 alphabet.
00224       // Here we are going to check the sequence's validity:
00225       //
00226       if ( rightAfterEscape ) {
00227         // any non-base64 char following an escaping '+'
00228         // makes for an ill-formed sequence.
00229         // Note that we catch (the valid) "+-" pair
00230         // right at the beginning.
00231         qWarning("QUtf7Decoder: ill-formed input: "
00232              "non-base64 char after escaping \"+\"!");
00233       }
00234       // pending bits from base64 encoding must be all 0:
00235       if (stepNo >= 1 && uc) {
00236         qWarning("QUtf7Decoder: ill-formed sequence: "
00237              "non-zero bits in shifted-sequence tail!");
00238       }
00239       resetParser();
00240 
00241       // a '-' signifies the end of the shifted-sequence,
00242       // so we just swallow it.
00243       if ( ch == '-' )
00244         continue;
00245       // end of validity checking. Process ch now...
00246     }
00247 
00248     if ( /*still*/ shifted ) {
00249       //
00250       // now we're going to stuff the "bits" bit bucket into
00251       // the right position inside "uc", emitting a resulting
00252       // QChar if possible.
00253       //
00254       switch (stepNo) {
00255         // "bits" are the 6 msb's of uc
00256       case 0: uc = bits << 10; break;
00257 
00258       case 1: uc |= bits << 4; break;
00259 
00260         // 4 bits of "bits" complete the first ushort
00261       case 2: uc |= bits >> 2; result += QChar(uc);
00262         // 2 bits of "bits" make the msb's of the next ushort
00263               uc = bits << 14; break;
00264       case 3: uc |= bits << 8; break;
00265       case 4: uc |= bits << 2; break;
00266 
00267         // 2 bits of "bits" complete the second ushort
00268       case 5: uc |= bits >> 4; result += QChar(uc);
00269         // 4 bits of "bits" make the msb's of the next ushort
00270               uc = bits << 12; break;
00271       case 6: uc |= bits << 6; break;
00272 
00273         // these 6 bits complete the third ushort
00274         // and also one round of 8 chars -> 3 ushort decoding
00275       case 7: uc |= bits;      result += QChar(uc);
00276               uc = 0;          break;
00277       default: ;
00278       } // switch (stepNo)
00279       // increase the step counter
00280       stepNo++;
00281       stepNo %= 8;
00282       rightAfterEscape = FALSE;
00283       // and look at the next char.
00284       continue;
00285     } // fi (still) shifted
00286       } // fi shifted
00287 
00288       //
00289       // if control reaches here, we either weren't in a
00290       // shifted sequence or we just left one by seeing
00291       // a non-base64-char.
00292       // Either way, we have to process "ch" outside
00293       // a shifted-sequence now:
00294       //
00295       if ( ch == '+' ) {
00296     // '+' is the escape char for entering a
00297     // shifted sequence:
00298     shifted = TRUE;
00299     stepNo = 0;
00300     // also, we're right at the beginning where
00301     // special rules apply:
00302     rightAfterEscape = TRUE;
00303       } else {
00304     // US-ASCII values are directly used
00305     result += QChar(ch);
00306       }
00307     }
00308 
00309     return result;
00310 
00311   } // toUnicode()
00312 
00313 }; // class QUtf7Decoder
00314 
00315 QTextDecoder* QUtf7Codec::makeDecoder() const
00316 {
00317   return new QUtf7Decoder;
00318 }
00319 
00320 
00321 class QUtf7Encoder : public QTextEncoder {
00322   uchar dontNeedEncodingSet[16];
00323   ushort outbits;
00324   uint stepNo : 2;
00325   bool shifted : 1;
00326   bool mayContinueShiftedSequence : 1;
00327 public:
00328   QUtf7Encoder(bool encOpt, bool encLwsp)
00329     : outbits(0), stepNo(0),
00330       shifted(FALSE), mayContinueShiftedSequence(FALSE)
00331   {
00332     for ( int i = 0; i < 16 ; i++) {
00333       dontNeedEncodingSet[i] = directSet[i];
00334       if (!encOpt)
00335     dontNeedEncodingSet[i] |= optDirectSet[i];
00336     }
00337     if(!encLwsp) {
00338       dontNeedEncodingSet[' '/8] |= 0x80 >> (' '%8);
00339       dontNeedEncodingSet['\n'/8] |= 0x80 >> ('\n'%8);
00340       dontNeedEncodingSet['\r'/8] |= 0x80 >> ('\r'%8);
00341       dontNeedEncodingSet['\t'/8] |= 0x80 >> ('\t'%8);
00342     }
00343   }
00344 
00345 private:
00346 
00347   char toBase64( ushort u ) {
00348     if ( u < 26 )
00349       return (char)u + 'A';
00350     else if ( u < 52 )
00351       return (char)u - 26 + 'a';
00352     else if ( u < 62 )
00353       return (char)u - 52 + '0';
00354     else if ( u == 62 )
00355       return '+';
00356     else
00357       return '/';
00358   }
00359 
00360   void addToShiftedSequence(QCString::Iterator & t, ushort u) {
00361     switch (stepNo) {
00362       // no outbits; use uppermost 6 bits of u
00363     case 0:
00364       *t++ = toBase64( u >> 10 );
00365       *t++ = toBase64( (u & 0x03FF /* umask top 6 bits */ ) >> 4 );
00366       // save 4 lowest-order bits in outbits[5..2]
00367       outbits = (u & 0x000F) << 2;
00368       break;
00369 
00370       // outbits available; use top two bits of u to complete
00371       // the previous char
00372     case 1:
00373       if (!mayContinueShiftedSequence) {
00374     // if mayContinue, this char has already been written
00375     *t++ = toBase64( outbits | ( u >> 14 ) );
00376       }
00377       *t++ = toBase64( (u & 0x3F00 /* mask top 2 bits */ ) >> 8 );
00378       *t++ = toBase64( (u & 0x00FC /* mask msbyte */ ) >> 2 );
00379       // save 2 lowest-significant bits in outbits[5..4]
00380       outbits = (u & 0x0003) << 4;
00381       break;
00382 
00383       // outbits available; use top four bits of u to complete
00384       // the previous char
00385     case 2:
00386       if (!mayContinueShiftedSequence) {
00387     // if mayContinue, this char has already been written
00388     *t++ = toBase64( outbits | ( u >> 12 ) );
00389       }
00390       *t++ = toBase64( (u & 0x0FFF) >> 6 );
00391       *t++ = toBase64( u & 0x003F );
00392       break;
00393 
00394     default: ;
00395     }
00396     stepNo = (stepNo + 1) % 3;
00397   }
00398 
00399   void endShiftedSequence(QCString::Iterator & t) {
00400     switch (stepNo) {
00401     case 1: // four outbits still to be written
00402     case 2: // two outbits still to be written
00403       *t++ = toBase64( outbits );
00404       break;
00405     case 0:      // nothing to do
00406     default: ;
00407     }
00408     outbits = 0;
00409   }
00410 
00411   // depending on the stepNo, checks whether we can continue
00412   // an already ended shifted-sequence with char "u".
00413   // This is only possible if the topmost bits fit the
00414   // already written ones (which are all 0 between calls)
00415   bool continueOK( ushort u ) {
00416     return stepNo == 0 ||
00417       ( stepNo == 1 && (u & 0xF000) == 0 ) ||
00418       ( stepNo == 2 && (u & 0xC000) == 0 );
00419   }
00420 
00421   void processDoesntNeedEncoding(QCString::Iterator & t, ushort ch) {
00422     // doesn't need encoding
00423     if (shifted) {
00424       endShiftedSequence(t);
00425       // add "lead-out" to dis-ambiguate following chars:
00426       if (isOfSet((char)ch,base64Set) || ch == '-' ) {
00427     *t++ = '-';
00428       }
00429     } else if (mayContinueShiftedSequence) {
00430       // if mayContinue is set, this means the
00431       // shifted-sequence needs a lead-out.
00432       mayContinueShiftedSequence = FALSE;
00433       if (isOfSet(ch,base64Set) || ch == '-' ) {
00434     *t++ = '-';
00435       }
00436     }
00437     *t++ = (uchar)ch;
00438     shifted = FALSE;
00439     stepNo = 0;
00440   }
00441 
00442 public:
00443   QCString fromUnicode(const QString & uc, int & len_in_out)
00444   {
00445     // allocate place for worst case:
00446     //   len/2 * (5+1) for an alternating sequence of e.g. "A\",
00447     // + 4             for a worst-case of another +ABC encoded char
00448     // + 1             for the trailing \0
00449     // 
00450     int maxreslen = 3 * len_in_out + 5;
00451     QCString result( maxreslen );
00452 
00453 #if 0
00454     //    if (len_in_out == 1) {
00455     cout << "\nlen_in_out: " << len_in_out
00456      <<"; shifted: " << (shifted ? "true" : "false")
00457      << ";\n" << "mayContinue: "
00458      << (mayContinueShiftedSequence ? "true" : "false")
00459      << "; stepNo: " << stepNo << ";\n"
00460      << "outbits: " << outbits << endl;
00461       //    }
00462 #endif
00463 
00464     // source and destination cursor
00465     const QChar * s = uc.unicode();
00466     QCString::Iterator t = result.data();
00467 
00468     if ( uc.isNull() ) {
00469       // return to ascii requested:
00470       if ( mayContinueShiftedSequence )
00471     *t++ = '-';
00472     } else {
00473       // normal operation:
00474       for (int i = 0 ; i < len_in_out ;
00475        i++/*, checkOutBuf(result,maxreslen,t,i,len_in_out,5)*/ ) {
00476     ushort ch = s[i].unicode();
00477     
00478     //
00479     // first, we check whether we might get around encoding:
00480     //
00481     if ( ch < 128 ) {
00482       //
00483       // ch is usAscii, so we have a chance that we don't
00484       // need to encode it.
00485       //
00486       if ( isOfSet((uchar)ch,dontNeedEncodingSet) ) {
00487         processDoesntNeedEncoding(t,ch);
00488         continue;
00489       } else if ( ch == '+' ) {
00490         // '+' is the shift escape character
00491         if (shifted || mayContinueShiftedSequence) {
00492           // if we are already in shifted mode, we just
00493           // encode the '+', too. Compare
00494           // 24bits ("-+-") + some from ending the shifted-sequence
00495           // with 21,33 bits
00496           addToShiftedSequence(t,ch);
00497           mayContinueShiftedSequence = FALSE;
00498           shifted = TRUE;
00499         } else {
00500           // shortcut encoding of '+':
00501           *t++ = '+';
00502           *t++ = '-';
00503         }
00504         continue; // done
00505       } // else fall through to encoding
00506     }
00507     //
00508     // need encoding
00509     //
00510     if (!shifted && (!mayContinueShiftedSequence || !continueOK(ch) ) ) {
00511       *t++ = '+';
00512       stepNo = 0;
00513     }
00514     addToShiftedSequence(t,ch);
00515     shifted = TRUE;
00516     mayContinueShiftedSequence = FALSE;
00517       }
00518 
00519       if ( shifted ) {
00520     endShiftedSequence(t);
00521     mayContinueShiftedSequence = TRUE;
00522       };
00523       shifted = FALSE;
00524     }
00525 
00526     *t = '\0';
00527     len_in_out = t - result.data();
00528 
00529 #if 0
00530     cout << "len_in_out: " << len_in_out << "; "
00531      << "mayContinue: " << (mayContinueShiftedSequence ? "true" : "false")
00532      << "; stepNo: " << stepNo << endl;
00533 #endif
00534 
00535     Q_ASSERT(len_in_out <= maxreslen-1);
00536 
00537     return result;
00538   } // fromUnicode()
00539 
00540 }; // class QUtf7Encoder
00541 
00542 QTextEncoder* QUtf7Codec::makeEncoder() const {
00543   return new QUtf7Encoder( false, false );
00544 }
00545 
00546 QTextEncoder* QStrictUtf7Codec::makeEncoder() const {
00547   return new QUtf7Encoder( true, false );
00548 }
00549 
00550 #endif // QT_NO_TEXTCODEC
KDE Home | KDE Accessibility Home | Description of Access Keys