Commit d34aa78d authored by Rémi Denis-Courmont

IsUTF8: check if a string is a valid UTF8 sequence without modifying it

EnsureUTF8 would replace invalid byte sequences with question marks.

Most of the code was already there anyway. This allows UTF-8 autodetection
without a dedicated UTF8-to-UTF8 iconv() handle.
parent f0ffeb17
...@@ -46,6 +46,8 @@ int utf8_fprintf( FILE *, const char *, ... ); ...@@ -46,6 +46,8 @@ int utf8_fprintf( FILE *, const char *, ... );
#endif #endif
VLC_EXPORT( char *, EnsureUTF8, ( char * ) ); VLC_EXPORT( char *, EnsureUTF8, ( char * ) );
VLC_EXPORT( const char *, IsUTF8, ( const char * ) );
VLC_EXPORT( char *, FromUTF32, ( const uint32_t * ) ); VLC_EXPORT( char *, FromUTF32, ( const uint32_t * ) );
VLC_EXPORT( char *, FromUTF16, ( const uint16_t * ) ); VLC_EXPORT( char *, FromUTF16, ( const uint16_t * ) );
......
...@@ -485,6 +485,7 @@ struct module_symbols_t ...@@ -485,6 +485,7 @@ struct module_symbols_t
char * (*decode_encoded_URI_duplicate_inner) (const char *psz); char * (*decode_encoded_URI_duplicate_inner) (const char *psz);
void (*resolve_xml_special_chars_inner) (char *psz_value); void (*resolve_xml_special_chars_inner) (char *psz_value);
char * (*FromUTF16_inner) (const uint16_t *); char * (*FromUTF16_inner) (const uint16_t *);
const char * (*IsUTF8_inner) (const char *);
}; };
# if defined (__PLUGIN__) # if defined (__PLUGIN__)
# define aout_FiltersCreatePipeline (p_symbols)->aout_FiltersCreatePipeline_inner # define aout_FiltersCreatePipeline (p_symbols)->aout_FiltersCreatePipeline_inner
...@@ -950,6 +951,7 @@ struct module_symbols_t ...@@ -950,6 +951,7 @@ struct module_symbols_t
# define decode_encoded_URI_duplicate (p_symbols)->decode_encoded_URI_duplicate_inner # define decode_encoded_URI_duplicate (p_symbols)->decode_encoded_URI_duplicate_inner
# define resolve_xml_special_chars (p_symbols)->resolve_xml_special_chars_inner # define resolve_xml_special_chars (p_symbols)->resolve_xml_special_chars_inner
# define FromUTF16 (p_symbols)->FromUTF16_inner # define FromUTF16 (p_symbols)->FromUTF16_inner
# define IsUTF8 (p_symbols)->IsUTF8_inner
# elif defined (HAVE_DYNAMIC_PLUGINS) && !defined (__BUILTIN__) # elif defined (HAVE_DYNAMIC_PLUGINS) && !defined (__BUILTIN__)
/****************************************************************** /******************************************************************
* STORE_SYMBOLS: store VLC APIs into p_symbols for plugin access. * STORE_SYMBOLS: store VLC APIs into p_symbols for plugin access.
...@@ -1418,6 +1420,7 @@ struct module_symbols_t ...@@ -1418,6 +1420,7 @@ struct module_symbols_t
((p_symbols)->decode_encoded_URI_duplicate_inner) = decode_encoded_URI_duplicate; \ ((p_symbols)->decode_encoded_URI_duplicate_inner) = decode_encoded_URI_duplicate; \
((p_symbols)->resolve_xml_special_chars_inner) = resolve_xml_special_chars; \ ((p_symbols)->resolve_xml_special_chars_inner) = resolve_xml_special_chars; \
((p_symbols)->FromUTF16_inner) = FromUTF16; \ ((p_symbols)->FromUTF16_inner) = FromUTF16; \
((p_symbols)->IsUTF8_inner) = IsUTF8; \
(p_symbols)->net_ConvertIPv4_deprecated = NULL; \ (p_symbols)->net_ConvertIPv4_deprecated = NULL; \
(p_symbols)->__stats_CounterGet_deprecated = NULL; \ (p_symbols)->__stats_CounterGet_deprecated = NULL; \
(p_symbols)->__stats_TimerDumpAll_deprecated = NULL; \ (p_symbols)->__stats_TimerDumpAll_deprecated = NULL; \
......
...@@ -299,9 +299,9 @@ void LocaleFree( const char *str ) ...@@ -299,9 +299,9 @@ void LocaleFree( const char *str )
#endif #endif
} }
/***************************************************************************** /**
* utf8_fopen: Calls fopen() after conversion of file name to OS locale * utf8_fopen: Calls fopen() after conversion of file name to OS locale
*****************************************************************************/ */
FILE *utf8_fopen( const char *filename, const char *mode ) FILE *utf8_fopen( const char *filename, const char *mode )
{ {
#if !(defined (WIN32) || defined (UNDER_CE)) #if !(defined (WIN32) || defined (UNDER_CE))
...@@ -337,9 +337,9 @@ FILE *utf8_fopen( const char *filename, const char *mode ) ...@@ -337,9 +337,9 @@ FILE *utf8_fopen( const char *filename, const char *mode )
#endif #endif
} }
/***************************************************************************** /**
* utf8_mkdir: Calls mkdir() after conversion of file name to OS locale * utf8_mkdir: Calls mkdir() after conversion of file name to OS locale
*****************************************************************************/ */
int utf8_mkdir( const char *dirname ) int utf8_mkdir( const char *dirname )
{ {
#if defined (UNDER_CE) || defined (WIN32) #if defined (UNDER_CE) || defined (WIN32)
...@@ -464,9 +464,9 @@ int utf8_lstat( const char *filename, void *buf) ...@@ -464,9 +464,9 @@ int utf8_lstat( const char *filename, void *buf)
return utf8_statEx( filename, buf, VLC_FALSE ); return utf8_statEx( filename, buf, VLC_FALSE );
} }
/***************************************************************************** /**
* utf8_*printf: *printf with conversion from UTF-8 to local encoding * utf8_*printf: *printf with conversion from UTF-8 to local encoding
*****************************************************************************/ */
static int utf8_vasprintf( char **str, const char *fmt, va_list ap ) static int utf8_vasprintf( char **str, const char *fmt, va_list ap )
{ {
char *utf8; char *utf8;
...@@ -502,15 +502,9 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... ) ...@@ -502,15 +502,9 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
return res; return res;
} }
/*****************************************************************************
* EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks static char *CheckUTF8( char *str, char rep )
*****************************************************************************
* Not Todo : convert Latin1 to UTF-8 on the fly
* It is not possible given UTF-8 needs more space
* Returns str if it was valid UTF-8, NULL if not.
*****************************************************************************/
#define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF)) #define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))
char *EnsureUTF8( char *str )
{ {
unsigned char *ptr, c; unsigned char *ptr, c;
...@@ -646,6 +640,8 @@ char *EnsureUTF8( char *str ) ...@@ -646,6 +640,8 @@ char *EnsureUTF8( char *str )
continue; continue;
error: error:
if( rep == 0 )
return NULL;
*ptr++ = '?'; *ptr++ = '?';
str = NULL; str = NULL;
} }
...@@ -653,6 +649,32 @@ error: ...@@ -653,6 +649,32 @@ error:
return str; return str;
} }
/**
* EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
* Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
* so we don't try that, even though it would be less disruptive.
*
* @param str string to check; offending bytes are overwritten in place
*
* @return str if it was valid UTF-8, NULL if not.
*/
char *EnsureUTF8( char *str )
{
/* A non-zero replacement character selects the repairing mode: CheckUTF8
* only bails out at its error label when rep is 0, otherwise it patches
* the byte and keeps scanning. */
return CheckUTF8( str, '?' );
}
/**
* IsUTF8: checks whether a string is a valid UTF-8 byte sequence.
* Unlike EnsureUTF8, this is meant to leave the string untouched.
*
* @param str nul-terminated string to be checked
*
* @return str if it was valid UTF-8, NULL if not.
*/
const char *IsUTF8( const char *str )
{
/* rep == 0 makes CheckUTF8 return NULL at its error label before the
* replacement byte is written; the const-dropping cast relies on that.
* NOTE(review): the rest of CheckUTF8 is not visible here - confirm that
* no other path writes through the pointer. */
return CheckUTF8( (char *)str, 0 );
}
/** /**
* UTF32toUTF8(): converts an array from UTF-32 (host byte order) * UTF32toUTF8(): converts an array from UTF-32 (host byte order)
* to UTF-8. * to UTF-8.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment