1 |
/* wptUTF8.cpp - UTF8 conversion |
/* wptUTF8.cpp - UTF8 conversion |
2 |
* Copyright (C) 1994, 1998-2001 Free Software Foundation, Inc. |
* Copyright (C) 1994, 1998-2001 Free Software Foundation, Inc. |
3 |
* Copyright (C) 2002, 2004, 2005 Timo Schulz |
* Copyright (C) 2002, 2004, 2005, 2006 Timo Schulz |
4 |
* |
* |
5 |
* This file is part of WinPT. |
* This file is part of WinPT. |
6 |
* |
* |
33 |
#include "wptErrors.h" |
#include "wptErrors.h" |
34 |
|
|
35 |
|
|
36 |
static u16 latin2_unicode[128] = { |
/* convert latin1 string @string into utf8. */ |
37 |
0x0080,0x0081,0x0082,0x0083,0x0084,0x0085,0x0086,0x0087, |
char * |
38 |
0x0088,0x0089,0x008A,0x008B,0x008C,0x008D,0x008E,0x008F, |
native_to_utf8( const char *string ) |
|
0x0090,0x0091,0x0092,0x0093,0x0094,0x0095,0x0096,0x0097, |
|
|
0x0098,0x0099,0x009A,0x009B,0x009C,0x009D,0x009E,0x009F, |
|
|
0x00A0,0x0104,0x02D8,0x0141,0x00A4,0x013D,0x015A,0x00A7, |
|
|
0x00A8,0x0160,0x015E,0x0164,0x0179,0x00AD,0x017D,0x017B, |
|
|
0x00B0,0x0105,0x02DB,0x0142,0x00B4,0x013E,0x015B,0x02C7, |
|
|
0x00B8,0x0161,0x015F,0x0165,0x017A,0x02DD,0x017E,0x017C, |
|
|
0x0154,0x00C1,0x00C2,0x0102,0x00C4,0x0139,0x0106,0x00C7, |
|
|
0x010C,0x00C9,0x0118,0x00CB,0x011A,0x00CD,0x00CE,0x010E, |
|
|
0x0110,0x0143,0x0147,0x00D3,0x00D4,0x0150,0x00D6,0x00D7, |
|
|
0x0158,0x016E,0x00DA,0x0170,0x00DC,0x00DD,0x0162,0x00DF, |
|
|
0x0155,0x00E1,0x00E2,0x0103,0x00E4,0x013A,0x0107,0x00E7, |
|
|
0x010D,0x00E9,0x0119,0x00EB,0x011B,0x00ED,0x00EE,0x010F, |
|
|
0x0111,0x0144,0x0148,0x00F3,0x00F4,0x0151,0x00F6,0x00F7, |
|
|
0x0159,0x016F,0x00FA,0x0171,0x00FC,0x00FD,0x0163,0x02D9 |
|
|
}; |
|
|
|
|
|
|
|
|
static const char *active_charset_name = "iso-8859-1"; |
|
|
static u16 *active_charset = NULL; |
|
|
static int no_translation = 0; |
|
|
|
|
|
|
|
|
static int |
|
|
ascii_strcasecmp( const char *a, const char *b ) |
|
39 |
{ |
{ |
40 |
if( a == b ) |
const byte *s; |
41 |
return 0; |
char *buffer; |
42 |
|
byte *p; |
43 |
for (; *a && *b; a++, b++) { |
size_t length=0; |
44 |
if (*a != *b && toupper(*a) != toupper(*b)) |
|
45 |
break; |
for (s=(byte*)string; *s; s++) { |
46 |
|
length++; |
47 |
|
if (*s & 0x80) |
48 |
|
length++; |
49 |
} |
} |
50 |
|
buffer = (char*)malloc (length + 1); |
51 |
return *a == *b? 0 : (toupper (*a) - toupper (*b)); |
for (p = (byte*)buffer, s=(byte*)string; *s; s++) { |
52 |
} |
if (*s & 0x80) { |
53 |
|
*p++ = 0xc0 | ((*s >> 6) & 3); |
54 |
int |
*p++ = 0x80 | ( *s & 0x3f ); |
55 |
set_native_charset( const char *newset ) |
} |
56 |
{ |
else |
57 |
if( !ascii_strcasecmp( newset, "iso-8859-1" ) ) { |
*p++ = *s; |
58 |
active_charset_name = "iso-8859-1"; |
} |
59 |
no_translation = 0; |
*p = 0; |
60 |
active_charset = NULL; |
return buffer; |
|
} |
|
|
else if( !ascii_strcasecmp( newset, "iso-8859-2" ) ) { |
|
|
active_charset_name = "iso-8859-2"; |
|
|
no_translation = 0; |
|
|
active_charset = latin2_unicode; |
|
|
} |
|
|
else if( !ascii_strcasecmp (newset, "utf8" ) |
|
|
|| !ascii_strcasecmp(newset, "utf-8") ) { |
|
|
active_charset_name = "utf-8"; |
|
|
no_translation = 1; |
|
|
active_charset = NULL; |
|
|
} |
|
|
else |
|
|
return WPTERR_GENERAL; |
|
|
|
|
|
return 0; |
|
61 |
} |
} |
62 |
|
|
|
/* Return the name of the currently selected native character set.
 * The value is the file-level active_charset_name pointer, which starts
 * out as "iso-8859-1" and is only ever assigned string literals by
 * set_native_charset(); the caller must not modify or free it. */
const char* |


get_native_charset( void ) |


{ |


return active_charset_name; |


} |
|
63 |
|
|
64 |
|
|
65 |
/**************** |
/* Convert utf8 string @string to native CP. */ |
66 |
* Convert string, which is in native encoding to UTF8 and return the |
static char* |
67 |
* newly allocated UTF8 string. |
utf8_to_native (const char *string) |
|
*/ |
|
|
char * |
|
|
native_to_utf8( const char *string ) |
|
68 |
{ |
{ |
69 |
const byte *s; |
wchar_t *result; |
70 |
char *buffer; |
char *native; |
71 |
byte *p; |
int n; |
72 |
size_t length=0; |
|
73 |
|
/* Convert utf8 to unicode. */ |
74 |
if (no_translation) |
n = MultiByteToWideChar (CP_UTF8, 0, string, -1, NULL, 0); |
75 |
buffer = strdup( string ); |
if (n < 0) |
76 |
else if( active_charset ) { |
return NULL; |
77 |
for(s=(byte*)string; *s; s++ ) { |
|
78 |
length++; |
result = (wchar_t*)malloc ((n+1) * sizeof *result); |
79 |
if( *s & 0x80 ) |
if (!result) |
80 |
length += 2; /* we may need 3 bytes */ |
BUG (0); |
81 |
} |
|
82 |
buffer = (char *)malloc( length + 1 ); |
n = MultiByteToWideChar (CP_UTF8, 0, string, -1, result, n); |
83 |
for(p=(byte *)buffer, s=(byte *)string; *s; s++ ) { |
if (n < 0) { |
84 |
if( *s & 0x80 ) { |
free (result); |
85 |
u16 val = active_charset[ *s & 0x7f ]; |
return NULL; |
86 |
if( val < 0x0800 ) { |
} |
|
*p++ = 0xc0 | ( (val >> 6) & 0x1f ); |
|
|
*p++ = 0x80 | ( val & 0x3f ); |
|
|
} |
|
|
else { |
|
|
*p++ = 0xe0 | ( (val >> 12) & 0x0f ); |
|
|
*p++ = 0x80 | ( (val >> 6) & 0x3f ); |
|
|
*p++ = 0x80 | ( val & 0x3f ); |
|
|
} |
|
|
} |
|
|
else |
|
|
*p++ = *s; |
|
|
} |
|
|
*p = 0; |
|
|
} |
|
|
else { |
|
|
for(s=(byte*)string; *s; s++ ) { |
|
|
length++; |
|
|
if( *s & 0x80 ) |
|
|
length++; |
|
|
} |
|
|
buffer = (char*)malloc( length + 1 ); |
|
|
for(p=(byte*)buffer, s=(byte*)string; *s; s++ ) { |
|
|
if( *s & 0x80 ) { |
|
|
*p++ = 0xc0 | ((*s >> 6) & 3); |
|
|
*p++ = 0x80 | ( *s & 0x3f ); |
|
|
} |
|
|
else |
|
|
*p++ = *s; |
|
|
} |
|
|
*p = 0; |
|
|
} |
|
|
|
|
|
return buffer; |
|
|
} /* native_to_utf8 */ |
|
87 |
|
|
88 |
/**************** |
/* Convert wide char into native char. */ |
89 |
* Convert string, which is in UTF8, to native encoding. Replace illegal |
/* |
90 |
* encodings by some "\xnn" and quote all control characters. A |
n = WideCharToMultiByte (GetACP (), 0, result, -1, NULL, 0, NULL, NULL); |
91 |
* character with value DELIM will always be quoted, it must be a |
if (n < 0) |
92 |
* vanilla ASCII character. |
return NULL; |
93 |
*/ |
*/ |
94 |
char * |
n = wcstombs (NULL, result, wcslen (result)); |
95 |
utf8_to_native( const char *string, size_t length, int delim ) |
if (n < 0) |
96 |
{ |
return NULL; |
97 |
int nleft; |
|
98 |
int i; |
native = (char*)malloc (n+1); |
99 |
byte encbuf[8]; |
if (!native) |
100 |
int encidx; |
BUG (0); |
101 |
const byte *s; |
|
102 |
size_t n; |
/* |
103 |
byte *buffer = NULL, *p = NULL; |
n = WideCharToMultiByte (CP_ACP, 0, string, -1, result, n, NULL, NULL); |
104 |
unsigned long val = 0; |
if (n < 0) { |
105 |
size_t slen; |
free (result); |
106 |
int resync = 0; |
return NULL; |
107 |
|
} |
108 |
/* 1. pass (p==NULL): count the extended utf-8 characters */ |
*/ |
109 |
/* 2. pass (p!=NULL): create string */ |
n = wcstombs (native, result, -1); |
110 |
for( ;; ) { |
if (n < 0) { |
111 |
for( slen=length, nleft=encidx=0, n=0, s=(byte*)string; slen; s++, slen-- ) { |
free (result); |
112 |
if( resync ) { |
return NULL; |
|
if( !(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)) ) { |
|
|
/* still invalid */ |
|
|
if( p ) { |
|
|
sprintf((char*)p, "\\x%02x", *s ); |
|
|
p += 4; |
|
|
} |
|
|
n += 4; |
|
|
continue; |
|
|
} |
|
|
resync = 0; |
|
|
} |
|
|
if( !nleft ) { |
|
|
if( !(*s & 0x80) ) { /* plain ascii */ |
|
|
if( *s < 0x20 || *s == 0x7f || *s == delim) { |
|
|
n++; |
|
|
if( p ) |
|
|
*p++ = '\\'; |
|
|
switch( *s ) { |
|
|
case '\n': n++; if( p ) *p++ = 'n'; break; |
|
|
case '\r': n++; if( p ) *p++ = 'r'; break; |
|
|
case '\f': n++; if( p ) *p++ = 'f'; break; |
|
|
case '\v': n++; if( p ) *p++ = 'v'; break; |
|
|
case '\b': n++; if( p ) *p++ = 'b'; break; |
|
|
case 0 : n++; if( p ) *p++ = '0'; break; |
|
|
default: |
|
|
n += 3; |
|
|
if ( p ) { |
|
|
sprintf( (char*)p, "x%02x", *s ); |
|
|
p += 3; |
|
|
} |
|
|
break; |
|
|
} |
|
|
} |
|
|
else { |
|
|
if( p ) *p++ = *s; |
|
|
n++; |
|
|
} |
|
|
} |
|
|
else if( (*s & 0xe0) == 0xc0 ) { /* 110x xxxx */ |
|
|
val = *s & 0x1f; |
|
|
nleft = 1; |
|
|
encidx = 0; |
|
|
encbuf[encidx++] = *s; |
|
|
} |
|
|
else if( (*s & 0xf0) == 0xe0 ) { /* 1110 xxxx */ |
|
|
val = *s & 0x0f; |
|
|
nleft = 2; |
|
|
encidx = 0; |
|
|
encbuf[encidx++] = *s; |
|
|
} |
|
|
else if( (*s & 0xf8) == 0xf0 ) { /* 1111 0xxx */ |
|
|
val = *s & 0x07; |
|
|
nleft = 3; |
|
|
encidx = 0; |
|
|
encbuf[encidx++] = *s; |
|
|
} |
|
|
else if( (*s & 0xfc) == 0xf8 ) { /* 1111 10xx */ |
|
|
val = *s & 0x03; |
|
|
nleft = 4; |
|
|
encidx = 0; |
|
|
encbuf[encidx++] = *s; |
|
|
} |
|
|
else if( (*s & 0xfe) == 0xfc ) { /* 1111 110x */ |
|
|
val = *s & 0x01; |
|
|
nleft = 5; |
|
|
encidx = 0; |
|
|
encbuf[encidx++] = *s; |
|
|
} |
|
|
else { /* invalid encoding: print as \xnn */ |
|
|
if( p ) { |
|
|
sprintf((char*)p, "\\x%02x", *s ); |
|
|
p += 4; |
|
|
} |
|
|
n += 4; |
|
|
resync = 1; |
|
|
} |
|
|
} |
|
|
else if( *s < 0x80 || *s >= 0xc0 ) { /* invalid */ |
|
|
if( p ) { |
|
|
for(i=0; i < encidx; i++ ) { |
|
|
sprintf((char*)p, "\\x%02x", encbuf[i] ); |
|
|
p += 4; |
|
|
} |
|
|
sprintf((char*)p, "\\x%02x", *s ); |
|
|
p += 4; |
|
|
} |
|
|
n += 4 + 4*encidx; |
|
|
nleft = 0; |
|
|
encidx = 0; |
|
|
resync = 1; |
|
|
} |
|
|
else { |
|
|
encbuf[encidx++] = *s; |
|
|
val <<= 6; |
|
|
val |= *s & 0x3f; |
|
|
if( !--nleft ) { /* ready */ |
|
|
if (no_translation) { |
|
|
if( p ) { |
|
|
for(i=0; i < encidx; i++ ) |
|
|
*p++ = encbuf[i]; |
|
|
} |
|
|
n += encidx; |
|
|
encidx = 0; |
|
|
} |
|
|
else if( active_charset ) { /* table lookup */ |
|
|
for(i=0; i < 128; i++ ) { |
|
|
if( active_charset[i] == val ) |
|
|
break; |
|
|
} |
|
|
if( i < 128 ) { /* we can print this one */ |
|
|
if( p ) *p++ = i+128; |
|
|
n++; |
|
|
} |
|
|
else { /* we do not have a translation: print utf8 */ |
|
|
if( p ) { |
|
|
for(i=0; i < encidx; i++ ) { |
|
|
sprintf((char*)p, "\\x%02x", encbuf[i] ); |
|
|
p += 4; |
|
|
} |
|
|
} |
|
|
n += encidx*4; |
|
|
encidx = 0; |
|
|
} |
|
|
} |
|
|
else { /* native set */ |
|
|
if( val >= 0x80 && val < 256 ) { |
|
|
n++; /* we can simply print this character */ |
|
|
if( p ) *p++ = val; |
|
|
} |
|
|
else { /* we do not have a translation: print utf8 */ |
|
|
if( p ) { |
|
|
for(i=0; i < encidx; i++ ) { |
|
|
sprintf((char*)p, "\\x%02x", encbuf[i] ); |
|
|
p += 4; |
|
|
} |
|
|
} |
|
|
n += encidx*4; |
|
|
encidx = 0; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
} |
|
|
} |
|
|
if( !buffer ) { /* allocate the buffer after the first pass */ |
|
|
buffer = p = (byte *)malloc( n + 1 ); |
|
|
} |
|
|
else { |
|
|
*p = 0; /* make a string */ |
|
|
return (char*)buffer; |
|
|
} |
|
113 |
} |
} |
|
} |
|
114 |
|
|
115 |
|
return native; |
116 |
|
} |
117 |
|
|
118 |
|
/* CP850 -> CP1251 */ |
119 |
static void |
static void |
120 |
conv_charset (byte *string, size_t size, int what) |
conv_charset (byte *string, size_t size, int what) |
121 |
{ |
{ |
122 |
int i; |
size_t i; |
123 |
|
|
124 |
if( what == 0 ) { |
if( what == 0 ) { |
125 |
for( i = 0; i < size; i++, string++ ) { |
for( i = 0; i < size; i++, string++ ) { |
327 |
} |
} |
328 |
} |
} |
329 |
} |
} |
330 |
} /* conv_charset */ |
} |
331 |
|
|
332 |
|
|
333 |
/* XXX: the conv_charset() call fails when the user-id was created |
/* XXX: the conv_charset() call fails when the user-id was created |
337 |
utf8_to_wincp (const char * s, size_t len) |
utf8_to_wincp (const char * s, size_t len) |
338 |
{ |
{ |
339 |
char *decs; |
char *decs; |
340 |
decs = utf8_to_native (s, len, 0); |
decs = utf8_to_native (s); |
341 |
conv_charset ((byte *)decs, strlen (decs), 1); |
conv_charset ((byte *)decs, strlen (decs), 1); |
342 |
return decs; |
return decs; |
343 |
} |
} |