/[winpt]/trunk/Src/wptUTF8.cpp

Diff of /trunk/Src/wptUTF8.cpp

Parent Directory | Revision Log | View Patch Patch

-revision 128 by twoaday,
Mon Dec 19 13:05:59 2005 UTC
+revision 185 by twoaday,
Mon Mar 20 12:48:52 2006 UTC
 Line 1
  /* wptUTF8.cpp - UTF8 conversion
   *      Copyright (C) 1994, 1998-2001 Free Software Foundation, Inc.
-  *      Copyright (C) 2002, 2004, 2005 Timo Schulz
+  *      Copyright (C) 2002, 2004, 2005, 2006 Timo Schulz
   *
   * This file is part of WinPT.
   *
 Line 33
  #include "wptErrors.h"
- static u16 latin2_unicode[128] = {
+ /* Convert the Latin-1 string @string into UTF-8; the result is malloc'ed. */
-x0080,0x0081,0x0082,0x0083,0x0084,0x0085,0x0086,0x0087,
+ char *
-x0088,0x0089,0x008A,0x008B,0x008C,0x008D,0x008E,0x008F,
+ native_to_utf8( const char *string )
-x0090,0x0091,0x0092,0x0093,0x0094,0x0095,0x0096,0x0097,
-x0098,0x0099,0x009A,0x009B,0x009C,0x009D,0x009E,0x009F,
-x00A0,0x0104,0x02D8,0x0141,0x00A4,0x013D,0x015A,0x00A7,
-x00A8,0x0160,0x015E,0x0164,0x0179,0x00AD,0x017D,0x017B,
-x00B0,0x0105,0x02DB,0x0142,0x00B4,0x013E,0x015B,0x02C7,
-x00B8,0x0161,0x015F,0x0165,0x017A,0x02DD,0x017E,0x017C,
-x0154,0x00C1,0x00C2,0x0102,0x00C4,0x0139,0x0106,0x00C7,
-x010C,0x00C9,0x0118,0x00CB,0x011A,0x00CD,0x00CE,0x010E,
-x0110,0x0143,0x0147,0x00D3,0x00D4,0x0150,0x00D6,0x00D7,
-x0158,0x016E,0x00DA,0x0170,0x00DC,0x00DD,0x0162,0x00DF,
-x0155,0x00E1,0x00E2,0x0103,0x00E4,0x013A,0x0107,0x00E7,
-x010D,0x00E9,0x0119,0x00EB,0x011B,0x00ED,0x00EE,0x010F,
-x0111,0x0144,0x0148,0x00F3,0x00F4,0x0151,0x00F6,0x00F7,
-x0159,0x016F,0x00FA,0x0171,0x00FC,0x00FD,0x0163,0x02D9
- };
- static const char *active_charset_name = "iso-8859-1";
- static u16 *active_charset = NULL;
- static int no_translation = 0;
- static int
- ascii_strcasecmp( const char *a, const char *b )
  {
-     if( a == b )
+     const byte *s;
-         return 0;
+     char *buffer;
+     byte *p;
-     for (; *a && *b; a++, b++) {
+     size_t length=0;
-         if (*a != *b && toupper(*a) != toupper(*b))
-             break;
+     for (s=(byte*)string; *s; s++) {
+       length++;
+       if (*s & 0x80)
+           length++;
      }
+     buffer = (char*)malloc (length + 1);
-     return *a == *b? 0 : (toupper (*a) - toupper (*b));
+     for (p = (byte*)buffer, s=(byte*)string; *s; s++) {
- }
+       if (*s & 0x80) {
+           *p++ = 0xc0 | ((*s >> 6) & 3);
- int
+           *p++ = 0x80 | ( *s & 0x3f );
- set_native_charset( const char *newset )
+       }
- {
+       else
-   if( !ascii_strcasecmp( newset, "iso-8859-1" ) ) {
+           *p++ = *s;
-       active_charset_name = "iso-8859-1";
+     }
-       no_translation = 0;
+     *p = 0;
-       active_charset = NULL;
+     return buffer;
-   }
-   else if( !ascii_strcasecmp( newset, "iso-8859-2" ) ) {
-       active_charset_name = "iso-8859-2";
-       no_translation = 0;
-       active_charset = latin2_unicode;
-   }
-   else if( !ascii_strcasecmp (newset, "utf8" )
-            || !ascii_strcasecmp(newset, "utf-8") ) {
-       active_charset_name = "utf-8";
-       no_translation = 1;
-       active_charset = NULL;
-   }
-   else
-       return WPTERR_GENERAL;
-   return 0;
  }
- const char*
- get_native_charset( void )
- {
-     return active_charset_name;
- }
- /****************
+ /* Convert the UTF-8 string @string to the native code page. */
-  * Convert string, which is in native encoding to UTF8 and return the
+ static char*
-  * new allocated UTF8 string.
+ utf8_to_native (const char *string)
-  */
- char *
- native_to_utf8( const char *string )
  {
-   const byte *s;
+     wchar_t *result;
-   char *buffer;
+     char *native;
-   byte *p;
+     int n;
-   size_t length=0;
+     /* Convert utf8 to unicode. */
-   if (no_translation)
+     n = MultiByteToWideChar (CP_UTF8, 0, string, -1, NULL, 0);
-       buffer = strdup( string );
+     if (n < 0)
-   else if( active_charset ) {
+         return NULL;
-       for(s=(byte*)string; *s; s++ ) {
-           length++;
+     result = (wchar_t*)malloc ((n+1) * sizeof *result);
-           if( *s & 0x80 )
+     if (!result)
-               length += 2; /* we may need 3 bytes */
+         BUG (0);
-       }
-       buffer = (char *)malloc( length + 1 );
+     n = MultiByteToWideChar (CP_UTF8, 0, string, -1, result, n);
-       for(p=(byte *)buffer, s=(byte *)string; *s; s++ ) {
+     if (n < 0) {
-           if( *s & 0x80 ) {
+         free (result);
-               u16 val = active_charset[ *s & 0x7f ];
+         return NULL;
-               if( val < 0x0800 ) {
+     }
-                   *p++ = 0xc0 | ( (val >> 6) & 0x1f );
-                   *p++ = 0x80 | (  val & 0x3f );
-               }
-               else {
-                   *p++ = 0xe0 | ( (val >> 12) & 0x0f );
-                   *p++ = 0x80 | ( (val >>  6) & 0x3f );
-                   *p++ = 0x80 | (  val & 0x3f );
-               }
-           }
-           else
-               *p++ = *s;
-       }
-       *p = 0;
-   }
-   else {
-       for(s=(byte*)string; *s; s++ ) {
-           length++;
-           if( *s & 0x80 )
-               length++;
-       }
-       buffer = (char*)malloc( length + 1 );
-       for(p=(byte*)buffer, s=(byte*)string; *s; s++ ) {
-           if( *s & 0x80 ) {
-               *p++ = 0xc0 | ((*s >> 6) & 3);
-               *p++ = 0x80 | ( *s & 0x3f );
-           }
-           else
-               *p++ = *s;
-       }
-       *p = 0;
-   }
-   return buffer;
- } /* native_to_utf8 */
- /****************
+     /* Convert wide char into native char. */
-  * Convert string, which is in UTF8 to native encoding.  illegal
+     /*
-  * encodings by some "\xnn" and quote all control characters. A
+     n = WideCharToMultiByte (GetACP (), 0, result, -1, NULL, 0, NULL, NULL);
-  * character with value DELIM will always be quoted, it must be a
+     if (n < 0)
-  * vanilla ASCII character.
+         return NULL;
-   */
+     */
- char *
+     n = wcstombs (NULL, result, wcslen (result));
- utf8_to_native( const char *string, size_t length, int delim )
+     if (n < 0)
- {
+         return NULL;
-     int nleft;
-     int i;
+     native = (char*)malloc (n+1);
-     byte encbuf[8];
+     if (!native)
-     int encidx;
+         BUG (0);
-     const byte *s;
-     size_t n;
+     /*
-     byte *buffer = NULL, *p = NULL;
+     n = WideCharToMultiByte (CP_ACP, 0, string, -1, result, n, NULL, NULL);
-     unsigned long val = 0;
+     if (n < 0) {
-     size_t slen;
+         free (result);
-     int resync = 0;
+         return NULL;
+     }
-     /* 1. pass (p==NULL): count the extended utf-8 characters */
+     */
-     /* 2. pass (p!=NULL): create string */
+     n = wcstombs (native, result, -1);
-     for( ;; ) {
+     if (n < 0) {
-         for( slen=length, nleft=encidx=0, n=0, s=(byte*)string; slen; s++, slen-- ) {
+         free (result);
-             if( resync ) {
+         return NULL;
-                 if( !(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)) ) {
-                     /* still invalid */
-                     if( p ) {
-                         sprintf((char*)p, "\\x%02x", *s );
-                         p += 4;
-                     }
-                     n += 4;
-                     continue;
-                 }
-                 resync = 0;
-             }
-             if( !nleft ) {
-                 if( !(*s & 0x80) ) { /* plain ascii */
-                     if( *s < 0x20 || *s == 0x7f || *s == delim) {
-                         n++;
-                         if( p )
-                             *p++ = '\\';
-                         switch( *s ) {
-                         case '\n': n++; if( p ) *p++ = 'n'; break;
-                         case '\r': n++; if( p ) *p++ = 'r'; break;
-                         case '\f': n++; if( p ) *p++ = 'f'; break;
-                         case '\v': n++; if( p ) *p++ = 'v'; break;
-                         case '\b': n++; if( p ) *p++ = 'b'; break;
-                         case     0 : n++; if( p ) *p++ = '0'; break;
-                         default:
-                             n += 3;
-                             if ( p ) {
-                                 sprintf( (char*)p, "x%02x", *s );
-                                 p += 3;
-                             }
-                             break;
-                         }
-                     }
-                     else {
-                         if( p ) *p++ = *s;
-                         n++;
-                     }
-                 }
-                 else if( (*s & 0xe0) == 0xc0 ) { /* 110x xxxx */
-                     val = *s & 0x1f;
-                     nleft = 1;
-                     encidx = 0;
-                     encbuf[encidx++] = *s;
-                 }
-                 else if( (*s & 0xf0) == 0xe0 ) { /* 1110 xxxx */
-                     val = *s & 0x0f;
-                     nleft = 2;
-                     encidx = 0;
-                     encbuf[encidx++] = *s;
-                 }
-                 else if( (*s & 0xf8) == 0xf0 ) { /* 1111 0xxx */
-                     val = *s & 0x07;
-                     nleft = 3;
-                     encidx = 0;
-                     encbuf[encidx++] = *s;
-                 }
-                 else if( (*s & 0xfc) == 0xf8 ) { /* 1111 10xx */
-                     val = *s & 0x03;
-                     nleft = 4;
-                     encidx = 0;
-                     encbuf[encidx++] = *s;
-                 }
-                 else if( (*s & 0xfe) == 0xfc ) { /* 1111 110x */
-                     val = *s & 0x01;
-                     nleft = 5;
-                     encidx = 0;
-                     encbuf[encidx++] = *s;
-                 }
-                 else {  /* invalid encoding: print as \xnn */
-                     if( p ) {
-                         sprintf((char*)p, "\\x%02x", *s );
-                         p += 4;
-                     }
-                     n += 4;
-                     resync = 1;
-                 }
-             }
-             else if( *s < 0x80 || *s >= 0xc0 ) { /* invalid */
-                 if( p ) {
-                     for(i=0; i < encidx; i++ ) {
-                         sprintf((char*)p, "\\x%02x", encbuf[i] );
-                         p += 4;
-                     }
-                     sprintf((char*)p, "\\x%02x", *s );
-                     p += 4;
-                 }
-                 n += 4 + 4*encidx;
-                 nleft = 0;
-                 encidx = 0;
-                 resync = 1;
-             }
-             else {
-                 encbuf[encidx++] = *s;
-                 val <<= 6;
-                 val |= *s & 0x3f;
-                 if( !--nleft ) { /* ready */
-                     if (no_translation) {
-                         if( p ) {
-                             for(i=0; i < encidx; i++ )
-                                 *p++ = encbuf[i];
-                         }
-                         n += encidx;
-                         encidx = 0;
-                     }
-                     else if( active_charset ) { /* table lookup */
-                         for(i=0; i < 128; i++ ) {
-                             if( active_charset[i] == val )
-                                 break;
-                         }
-                         if( i < 128 ) { /* we can print this one */
-                             if( p ) *p++ = i+128;
-                             n++;
-                         }
-                         else { /* we do not have a translation: print utf8 */
-                             if( p ) {
-                                 for(i=0; i < encidx; i++ ) {
-                                     sprintf((char*)p, "\\x%02x", encbuf[i] );
-                                     p += 4;
-                                 }
-                             }
-                             n += encidx*4;
-                             encidx = 0;
-                         }
-                     }
-                     else { /* native set */
-                         if( val >= 0x80 && val < 256 ) {
-                             n++;    /* we can simply print this character */
-                             if( p ) *p++ = val;
-                         }
-                         else { /* we do not have a translation: print utf8 */
-                             if( p ) {
-                                 for(i=0; i < encidx; i++ ) {
-                                     sprintf((char*)p, "\\x%02x", encbuf[i] );
-                                     p += 4;
-                                 }
-                             }
-                             n += encidx*4;
-                             encidx = 0;
-                         }
-                     }
-                 }
-             }
-         }
-         if( !buffer ) { /* allocate the buffer after the first pass */
-             buffer = p = (byte *)malloc( n + 1 );
-         }
-         else {
-             *p = 0; /* make a string */
-             return (char*)buffer;
-         }
      }
- }
+     return native;
+ }
+ /* CP850 -> CP1251 */
  static void
  conv_charset (byte *string, size_t size, int what)
  {
-     int i;
+     size_t i;
      if( what == 0 ) {
          for( i = 0; i < size; i++, string++ ) {
-Line 555 
 conv_charset (byte *string, size_t size,
+Line 327 
 conv_charset (byte *string, size_t size,
              }
          }
      }
- } /* conv_charset */
+ }
  /* XXX: the conv_charset() call fails when the user-id was created
-Line 565 
 char*
+Line 337 
 char*
  utf8_to_wincp (const char * s, size_t len)
  {
      char *decs;
-     decs = utf8_to_native (s, len, 0);
+     decs = utf8_to_native (s);
      conv_charset ((byte *)decs, strlen (decs), 1);
      return decs;
  }

 Legend:



Removed from v.128
 


changed lines


 
Added in v.185
 Legend:



Removed from v.128
 


changed lines


 
Added in v.185
-Removed from v.128
+Added in v.185

[email protected]	ViewVC Help
Powered by ViewVC 1.1.26