Commit d34aa78d authored by Rémi Denis-Courmont

IsUTF8: check if a string is a valid UTF-8 sequence without modifying it

EnsureUTF8 would replace invalid byte sequences with question marks.

Most of the code was already there anyway. This allows UTF-8 autodetection
without a dedicated UTF8-to-UTF8 iconv() handle.
parent f0ffeb17
......@@ -46,6 +46,8 @@ int utf8_fprintf( FILE *, const char *, ... );
#endif
VLC_EXPORT( char *, EnsureUTF8, ( char * ) );
VLC_EXPORT( const char *, IsUTF8, ( const char * ) );
VLC_EXPORT( char *, FromUTF32, ( const uint32_t * ) );
VLC_EXPORT( char *, FromUTF16, ( const uint16_t * ) );
......
......@@ -485,6 +485,7 @@ struct module_symbols_t
char * (*decode_encoded_URI_duplicate_inner) (const char *psz);
void (*resolve_xml_special_chars_inner) (char *psz_value);
char * (*FromUTF16_inner) (const uint16_t *);
const char * (*IsUTF8_inner) (const char *);
};
# if defined (__PLUGIN__)
# define aout_FiltersCreatePipeline (p_symbols)->aout_FiltersCreatePipeline_inner
......@@ -950,6 +951,7 @@ struct module_symbols_t
# define decode_encoded_URI_duplicate (p_symbols)->decode_encoded_URI_duplicate_inner
# define resolve_xml_special_chars (p_symbols)->resolve_xml_special_chars_inner
# define FromUTF16 (p_symbols)->FromUTF16_inner
# define IsUTF8 (p_symbols)->IsUTF8_inner
# elif defined (HAVE_DYNAMIC_PLUGINS) && !defined (__BUILTIN__)
/******************************************************************
* STORE_SYMBOLS: store VLC APIs into p_symbols for plugin access.
......@@ -1418,6 +1420,7 @@ struct module_symbols_t
((p_symbols)->decode_encoded_URI_duplicate_inner) = decode_encoded_URI_duplicate; \
((p_symbols)->resolve_xml_special_chars_inner) = resolve_xml_special_chars; \
((p_symbols)->FromUTF16_inner) = FromUTF16; \
((p_symbols)->IsUTF8_inner) = IsUTF8; \
(p_symbols)->net_ConvertIPv4_deprecated = NULL; \
(p_symbols)->__stats_CounterGet_deprecated = NULL; \
(p_symbols)->__stats_TimerDumpAll_deprecated = NULL; \
......
......@@ -299,9 +299,9 @@ void LocaleFree( const char *str )
#endif
}
/*****************************************************************************
/**
* utf8_fopen: Calls fopen() after conversion of file name to OS locale
*****************************************************************************/
*/
FILE *utf8_fopen( const char *filename, const char *mode )
{
#if !(defined (WIN32) || defined (UNDER_CE))
......@@ -337,9 +337,9 @@ FILE *utf8_fopen( const char *filename, const char *mode )
#endif
}
/*****************************************************************************
/**
* utf8_mkdir: Calls mkdir() after conversion of file name to OS locale
*****************************************************************************/
*/
int utf8_mkdir( const char *dirname )
{
#if defined (UNDER_CE) || defined (WIN32)
......@@ -464,9 +464,9 @@ int utf8_lstat( const char *filename, void *buf)
return utf8_statEx( filename, buf, VLC_FALSE );
}
/*****************************************************************************
/**
* utf8_*printf: *printf with conversion from UTF-8 to local encoding
*****************************************************************************/
*/
static int utf8_vasprintf( char **str, const char *fmt, va_list ap )
{
char *utf8;
......@@ -502,15 +502,9 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
return res;
}
/*****************************************************************************
* EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
*****************************************************************************
* Not Todo : convert Latin1 to UTF-8 on the fly
* It is not possible given UTF-8 needs more space
* Returns str if it was valid UTF-8, NULL if not.
*****************************************************************************/
static char *CheckUTF8( char *str, char rep )
#define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))
char *EnsureUTF8( char *str )
{
unsigned char *ptr, c;
......@@ -646,6 +640,8 @@ char *EnsureUTF8( char *str )
continue;
error:
if( rep == 0 )
return NULL;
*ptr++ = '?';
str = NULL;
}
......@@ -653,6 +649,32 @@ error:
return str;
}
/**
 * EnsureUTF8: sanitizes a string in place so that it only contains valid
 * UTF-8.
 *
 * Invalid or overlong byte sequences are overwritten with question marks.
 * No attempt is made to transcode Latin-1 to UTF-8 on the fly: the UTF-8
 * form can need more bytes than the input buffer holds, so in-place
 * conversion is not possible even though it would be less disruptive.
 *
 * @param str nul-terminated string to be checked and, if needed, patched
 *
 * @return str if it was already valid UTF-8, NULL if not.
 */
char *EnsureUTF8( char *str )
{
    /* A non-zero replacement byte tells CheckUTF8() to patch bad bytes
     * rather than stopping at the first one. */
    char *psz_checked = CheckUTF8( str, '?' );

    return psz_checked;
}
/**
 * IsUTF8: tells whether a string is a valid UTF-8 byte sequence.
 *
 * This is the check-only counterpart of EnsureUTF8(): both rely on
 * CheckUTF8(), but here the replacement byte is 0, which makes
 * CheckUTF8() return NULL on error instead of patching the input.
 *
 * @param str nul-terminated string to be checked
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
const char *IsUTF8( const char *str )
{
    /* The const qualifier is dropped only to match the CheckUTF8()
     * prototype; see above for why the buffer is left untouched. */
    char *psz_unqualified = (char *)str;

    return CheckUTF8( psz_unqualified, 0 );
}
/**
* UTF32toUTF8(): converts an array from UTF-32 (host byte order)
* to UTF-8.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment