* Use uint32_t instead of wchar_t to represent UTF32 characters, given wchar_t...

* Use uint32_t instead of wchar_t to represent UTF-32 characters, given that wchar_t is not 32 bits wide on all platforms
* Cosmetic fixes
* Check for invalid UTF-32 sequences
* Accept all non-printable characters within UTF-8 strings (EnsureUTF8's job is to ensure a valid UTF-8 sequence, not to ensure printable characters)

* Use uint32_t instead of wchar_t to represent UTF32 characters, given wchar_t...
* Use uint32_t instead of wchar_t to represent UTF-32 characters, given that wchar_t is not 32 bits wide on all platforms
* Cosmetic fixes
* Check for invalid UTF-32 sequences
* Accept all non-printable characters within UTF-8 strings (EnsureUTF8's job is to ensure a valid UTF-8 sequence, not to ensure printable characters)
3fdc34d5 · Rémi Denis-Courmont · e3f12b87 · 3fdc34d5 · 3fdc34d5 · 3fdc34d5
Commit 3fdc34d5 authored Mar 12, 2006 by Rémi Denis-Courmont
Show whitespace changes
Inline Side-by-side

Showing with 76 additions and 109 deletions

include/charset.h include/charset.h +1 -1

include/vlc_symbols.h include/vlc_symbols.h +1 -1

src/misc/unicode.c src/misc/unicode.c +74 -107

No files found.
--- a/include/charset.h
+++ b/include/charset.h
@@ -46,7 +46,7 @@ int utf8_fprintf( FILE *, const char *, ... );
 #endif
 VLC_EXPORT( char *, EnsureUTF8, ( char * ) );
-VLC_EXPORT( char *, FromUTF32, ( const wchar_t * ) );
+VLC_EXPORT( char *, FromUTF32, ( const uint32_t * ) );
 VLC_EXPORT( char *, __vlc_fix_readdir_charset, ( vlc_object_t *, const char * ) );
 #define vlc_fix_readdir_charset(a,b) __vlc_fix_readdir_charset(VLC_OBJECT(a),b)

--- a/include/vlc_symbols.h
+++ b/include/vlc_symbols.h
@@ -426,7 +426,7 @@ struct module_symbols_t
    void (*osd_Message_inner) (spu_t *, int, char *, ...);
    int (*osd_ShowTextAbsolute_inner) (spu_t *, int, char *, text_style_t *, int, int, int, mtime_t, mtime_t);
    char * (*config_GetUserDir_inner) (void);
-    char * (*FromUTF32_inner) (const wchar_t *);
+    char * (*FromUTF32_inner) (const uint32_t *);
    int (*__input_Read_inner) (vlc_object_t *, input_item_t *, vlc_bool_t);
    int (*__net_ConnectUDP_inner) (vlc_object_t *p_this, const char *psz_host, int i_port, int hlim);
    int (*__intf_Interact_inner) (vlc_object_t *,interaction_dialog_t *);

--- a/src/misc/unicode.c
+++ b/src/misc/unicode.c
@@ -506,7 +506,7 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
 /*****************************************************************************
 * EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
 *****************************************************************************
- * Not Todo : convert Latin1 to UTF-8 on the flu
+ * Not Todo : convert Latin1 to UTF-8 on the fly
 * It is not possible given UTF-8 needs more space
 * Returns str if it was valid UTF-8, NULL if not.
 *****************************************************************************/
@@ -519,11 +519,8 @@ char *EnsureUTF8( char *str )
    while( (c = *ptr) != '\0' )
    {
        /* US-ASCII, 1 byte */
-        if( ( ( c >= 0x20 ) && ( c <= 0x7F ) )
+        if( c <= 0x7F )
-         || ( c == 0x09 ) || ( c == 0x0A ) || ( c == 0x0D ) )
-        {
            ptr++; /* OK */
-        }
        else
        /* 2 bytes */
        if( ( c >= 0xC2 ) && ( c <= 0xDF ) )
@@ -532,10 +529,7 @@ char *EnsureUTF8( char *str )
            if( isutf8cont( c ) )
                ptr += 2; /* OK */
            else
-            {
+                goto error;
-                *ptr++ = '?'; /* invalid */
-                str = NULL;
-            }
        }
        else
        /* 3 bytes */
@@ -548,16 +542,10 @@ char *EnsureUTF8( char *str )
                if( isutf8cont( c ) )
                    ptr += 3; /* OK */
                else
-                {
+                    goto error;
-                    *ptr++ = '?';
-                    str = NULL;
-                }
            }
            else
-            {
+                goto error;
-                *ptr++ = '?';
-                str = NULL;
-            }
        }
        else
        if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC )
@@ -570,16 +558,10 @@ char *EnsureUTF8( char *str )
                if( isutf8cont( c ) )
                    ptr += 3; /* OK */
                else
-                {
+                    goto error;
-                    *ptr++ = '?';
-                    str = NULL;
-                }
            }
            else
-            {
+                goto error;
-                *ptr++ = '?';
-                str = NULL;
-            }
        }
        else
        if( c == 0xED )
@@ -591,16 +573,10 @@ char *EnsureUTF8( char *str )
                if( isutf8cont( c ) )
                    ptr += 3; /* OK */
                else
-                {
+                    goto error;
-                    *ptr++ = '?';
-                    str = NULL;
-                }
            }
            else
-            {
+                goto error;
-                *ptr++ = '?';
-                str = NULL;
-            }
        }
        else
        /* 4 bytes */
@@ -616,22 +592,13 @@ char *EnsureUTF8( char *str )
                    if( isutf8cont( c ) )
                        ptr += 4; /* OK */
                    else
-                    {
+                        goto error;
-                        *ptr++ = '?';
-                        str = NULL;
-                    }
                }
                else
-                {
+                    goto error;
-                    *ptr++ = '?';
-                    str = NULL;
-                }
            }
            else
-            {
+                goto error;
-                *ptr++ = '?';
-                str = NULL;
-            }
        }
        else
        if( ( c >= 0xF1 ) && ( c <= 0xF3 ) )
@@ -645,23 +612,13 @@ char *EnsureUTF8( char *str )
                    c = ptr[3];
                    if( isutf8cont( c ) )
                        ptr += 4; /* OK */
-                    else
+                    goto error;
-                    {
-                        *ptr++ = '?';
-                        str = NULL;
-                    }
                }
                else
-                {
+                    goto error;
-                    *ptr++ = '?';
-                    str = NULL;
-                }
            }
            else
-            {
+                goto error;
-                *ptr++ = '?';
-                str = NULL;
-            }
        }
        else
        if( c == 0xF4 )
@@ -676,37 +633,40 @@ char *EnsureUTF8( char *str )
                    if( isutf8cont( c ) )
                        ptr += 4; /* OK */
                    else
-                    {
+                        goto error;
-                        *ptr++ = '?';
-                        str = NULL;
-                    }
                }
                else
-                {
+                    goto error;
-                    *ptr++ = '?';
-                    str = NULL;
-                }
            }
            else
-            {
+                goto error;
-                *ptr++ = '?';
-                str = NULL;
-            }
        }
        else
-        {
+            goto error;
+        continue;
+error:
        *ptr++ = '?';
        str = NULL;
    }
-    }
    return str;
 }
-/**********************************************************************
+/**
- * UTF32toUTF8: converts an array from UTF-32 to UTF-8
+ * UTF32toUTF8(): converts an array from UTF-32 to UTF-8.
- *********************************************************************/
+ *
-char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
+ * @param src the UTF32 table to be converted
+ * @param len the number of code points to be converted from src
+ * (ie. the number of uint32_t in the table pointed to by src)
+ * @param newlen an optional pointer. If not NULL, *newlen will
+ * contain the total number of bytes written.
+ *
+ * @return the result of the conversion (must be free'd())
+ * or NULL on error (in that case, *newlen is undefined).
+ */
+char *UTF32toUTF8( const uint32_t *src, size_t len, size_t *newlen )
 {
    char *res, *out;
@@ -741,6 +701,7 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
            continue;
        }
        else
+        if( uv < 0x110000 )
        {
            *out++ = (( uv >> 18)         | 0xf0);
            *out++ = (((uv >> 12) & 0x3f) | 0x80);
@@ -748,6 +709,11 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
            *out++ = (( uv        & 0x3f) | 0x80);
            continue;
        }
+        else
+        {
+            free( res );
+            return NULL;
+        }
    }
    len = out - res;
    res = realloc( res, len );
@@ -756,15 +722,16 @@ char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
    return res;
 }
-/**********************************************************************
+/**
- * FromUTF32: converts an UTF-32 string to UTF-8
+ * FromUTF32(): converts an UTF-32 string to UTF-8.
- **********************************************************************
+ *
- * The result must be free()'d. NULL on error.
+ * @return the result of the conversion (must be free()'d),
- *********************************************************************/
+ * or NULL in case of error.
-char *FromUTF32( const wchar_t *src )
+ */
+char *FromUTF32( const uint32_t *src )
 {
    size_t len;
-    const wchar_t *in;
+    const uint32_t *in;
    /* determine the size of the string */
    for( len = 1, in = src; GetWBE( in ); len++ )