Commit 3fdc34d5 authored by Rémi Denis-Courmont

* Use uint32_t instead of wchar_t to represent UTF-32 characters, given wchar_t is not 32 bits wide on all platforms

* Use uint32_t instead of wchar_t to represent UTF-32 characters, given that wchar_t is not 32 bits wide on all platforms
* Cosmetic fixes
* Check for invalid UTF-32 sequences
* Accept all non-printable characters within UTF-8 strings (EnsureUTF8's job is to ensure a valid UTF-8 sequence, not to ensure printable characters)
parent e3f12b87
...@@ -46,7 +46,7 @@ int utf8_fprintf( FILE *, const char *, ... ); ...@@ -46,7 +46,7 @@ int utf8_fprintf( FILE *, const char *, ... );
#endif #endif
VLC_EXPORT( char *, EnsureUTF8, ( char * ) ); VLC_EXPORT( char *, EnsureUTF8, ( char * ) );
VLC_EXPORT( char *, FromUTF32, ( const wchar_t * ) ); VLC_EXPORT( char *, FromUTF32, ( const uint32_t * ) );
VLC_EXPORT( char *, __vlc_fix_readdir_charset, ( vlc_object_t *, const char * ) ); VLC_EXPORT( char *, __vlc_fix_readdir_charset, ( vlc_object_t *, const char * ) );
#define vlc_fix_readdir_charset(a,b) __vlc_fix_readdir_charset(VLC_OBJECT(a),b) #define vlc_fix_readdir_charset(a,b) __vlc_fix_readdir_charset(VLC_OBJECT(a),b)
......
...@@ -426,7 +426,7 @@ struct module_symbols_t ...@@ -426,7 +426,7 @@ struct module_symbols_t
void (*osd_Message_inner) (spu_t *, int, char *, ...); void (*osd_Message_inner) (spu_t *, int, char *, ...);
int (*osd_ShowTextAbsolute_inner) (spu_t *, int, char *, text_style_t *, int, int, int, mtime_t, mtime_t); int (*osd_ShowTextAbsolute_inner) (spu_t *, int, char *, text_style_t *, int, int, int, mtime_t, mtime_t);
char * (*config_GetUserDir_inner) (void); char * (*config_GetUserDir_inner) (void);
char * (*FromUTF32_inner) (const wchar_t *); char * (*FromUTF32_inner) (const uint32_t *);
int (*__input_Read_inner) (vlc_object_t *, input_item_t *, vlc_bool_t); int (*__input_Read_inner) (vlc_object_t *, input_item_t *, vlc_bool_t);
int (*__net_ConnectUDP_inner) (vlc_object_t *p_this, const char *psz_host, int i_port, int hlim); int (*__net_ConnectUDP_inner) (vlc_object_t *p_this, const char *psz_host, int i_port, int hlim);
int (*__intf_Interact_inner) (vlc_object_t *,interaction_dialog_t *); int (*__intf_Interact_inner) (vlc_object_t *,interaction_dialog_t *);
......
...@@ -506,7 +506,7 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... ) ...@@ -506,7 +506,7 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
/***************************************************************************** /*****************************************************************************
* EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks * EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
***************************************************************************** *****************************************************************************
* Not Todo : convert Latin1 to UTF-8 on the flu * Not Todo : convert Latin1 to UTF-8 on the fly
* It is not possible given UTF-8 needs more space * It is not possible given UTF-8 needs more space
* Returns str if it was valid UTF-8, NULL if not. * Returns str if it was valid UTF-8, NULL if not.
*****************************************************************************/ *****************************************************************************/
...@@ -519,11 +519,8 @@ char *EnsureUTF8( char *str ) ...@@ -519,11 +519,8 @@ char *EnsureUTF8( char *str )
while( (c = *ptr) != '\0' ) while( (c = *ptr) != '\0' )
{ {
/* US-ASCII, 1 byte */ /* US-ASCII, 1 byte */
if( ( ( c >= 0x20 ) && ( c <= 0x7F ) ) if( c <= 0x7F )
|| ( c == 0x09 ) || ( c == 0x0A ) || ( c == 0x0D ) )
{
ptr++; /* OK */ ptr++; /* OK */
}
else else
/* 2 bytes */ /* 2 bytes */
if( ( c >= 0xC2 ) && ( c <= 0xDF ) ) if( ( c >= 0xC2 ) && ( c <= 0xDF ) )
...@@ -532,10 +529,7 @@ char *EnsureUTF8( char *str ) ...@@ -532,10 +529,7 @@ char *EnsureUTF8( char *str )
if( isutf8cont( c ) ) if( isutf8cont( c ) )
ptr += 2; /* OK */ ptr += 2; /* OK */
else else
{ goto error;
*ptr++ = '?'; /* invalid */
str = NULL;
}
} }
else else
/* 3 bytes */ /* 3 bytes */
...@@ -548,16 +542,10 @@ char *EnsureUTF8( char *str ) ...@@ -548,16 +542,10 @@ char *EnsureUTF8( char *str )
if( isutf8cont( c ) ) if( isutf8cont( c ) )
ptr += 3; /* OK */ ptr += 3; /* OK */
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC ) if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC )
...@@ -570,16 +558,10 @@ char *EnsureUTF8( char *str ) ...@@ -570,16 +558,10 @@ char *EnsureUTF8( char *str )
if( isutf8cont( c ) ) if( isutf8cont( c ) )
ptr += 3; /* OK */ ptr += 3; /* OK */
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
if( c == 0xED ) if( c == 0xED )
...@@ -591,16 +573,10 @@ char *EnsureUTF8( char *str ) ...@@ -591,16 +573,10 @@ char *EnsureUTF8( char *str )
if( isutf8cont( c ) ) if( isutf8cont( c ) )
ptr += 3; /* OK */ ptr += 3; /* OK */
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
/* 4 bytes */ /* 4 bytes */
...@@ -616,22 +592,13 @@ char *EnsureUTF8( char *str ) ...@@ -616,22 +592,13 @@ char *EnsureUTF8( char *str )
if( isutf8cont( c ) ) if( isutf8cont( c ) )
ptr += 4; /* OK */ ptr += 4; /* OK */
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
if( ( c >= 0xF1 ) && ( c <= 0xF3 ) ) if( ( c >= 0xF1 ) && ( c <= 0xF3 ) )
...@@ -645,23 +612,13 @@ char *EnsureUTF8( char *str ) ...@@ -645,23 +612,13 @@ char *EnsureUTF8( char *str )
c = ptr[3]; c = ptr[3];
if( isutf8cont( c ) ) if( isutf8cont( c ) )
ptr += 4; /* OK */ ptr += 4; /* OK */
else goto error;
{
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
if( c == 0xF4 ) if( c == 0xF4 )
...@@ -676,37 +633,40 @@ char *EnsureUTF8( char *str ) ...@@ -676,37 +633,40 @@ char *EnsureUTF8( char *str )
if( isutf8cont( c ) ) if( isutf8cont( c ) )
ptr += 4; /* OK */ ptr += 4; /* OK */
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
*ptr++ = '?';
str = NULL;
}
} }
else else
{ goto error;
continue;
error:
*ptr++ = '?'; *ptr++ = '?';
str = NULL; str = NULL;
} }
}
return str; return str;
} }
/********************************************************************** /**
* UTF32toUTF8: converts an array from UTF-32 to UTF-8 * UTF32toUTF8(): converts an array from UTF-32 to UTF-8.
*********************************************************************/ *
char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen ) * @param src the UTF32 table to be converted
* @param len the number of code points to be converted from src
* (i.e. the number of uint32_t elements in the table pointed to by src)
* @param newlen an optional pointer. If not NULL, *newlen will
* contain the total number of bytes written.
*
* @return the result of the conversion (must be free()'d)
* or NULL on error (in that case, *newlen is undefined).
*/
char *UTF32toUTF8( const uint32_t *src, size_t len, size_t *newlen )
{ {
char *res, *out; char *res, *out;
...@@ -741,6 +701,7 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen ) ...@@ -741,6 +701,7 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
continue; continue;
} }
else else
if( uv < 0x110000 )
{ {
*out++ = (( uv >> 18) | 0xf0); *out++ = (( uv >> 18) | 0xf0);
*out++ = (((uv >> 12) & 0x3f) | 0x80); *out++ = (((uv >> 12) & 0x3f) | 0x80);
...@@ -748,6 +709,11 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen ) ...@@ -748,6 +709,11 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
*out++ = (( uv & 0x3f) | 0x80); *out++ = (( uv & 0x3f) | 0x80);
continue; continue;
} }
else
{
free( res );
return NULL;
}
} }
len = out - res; len = out - res;
res = realloc( res, len ); res = realloc( res, len );
...@@ -756,15 +722,16 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen ) ...@@ -756,15 +722,16 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
return res; return res;
} }
/********************************************************************** /**
* FromUTF32: converts an UTF-32 string to UTF-8 * FromUTF32(): converts an UTF-32 string to UTF-8.
********************************************************************** *
* The result must be free()'d. NULL on error. * @return the result of the conversion (must be free()'d),
*********************************************************************/ * or NULL in case of error.
char *FromUTF32( const wchar_t *src ) */
char *FromUTF32( const uint32_t *src )
{ {
size_t len; size_t len;
const wchar_t *in; const uint32_t *in;
/* determine the size of the string */ /* determine the size of the string */
for( len = 1, in = src; GetWBE( in ); len++ ) for( len = 1, in = src; GetWBE( in ); len++ )
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment