Commit 81eadc82 authored by Rémi Denis-Courmont

Refactor EnsureUTF8 and IsUTF8

(cherry picked from commit 9ce1a13fb8fa77a38e0dfdfe3387829e1df3f085)

Conflicts:

	src/text/unicode.c
parent 2efce50b
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* unicode.c: Unicode <-> locale functions * unicode.c: Unicode <-> locale functions
***************************************************************************** *****************************************************************************
* Copyright (C) 2005-2006 the VideoLAN team * Copyright (C) 2005-2006 the VideoLAN team
* Copyright © 2005-2008 Rémi Denis-Courmont * Copyright © 2005-2010 Rémi Denis-Courmont
* *
* Authors: Rémi Denis-Courmont <rem # videolan.org> * Authors: Rémi Denis-Courmont <rem # videolan.org>
* *
...@@ -365,72 +365,6 @@ size_t vlc_towc (const char *str, uint32_t *restrict pwc) ...@@ -365,72 +365,6 @@ size_t vlc_towc (const char *str, uint32_t *restrict pwc)
return charlen; return charlen;
} }
/**
 * Measures the UTF-8 sequence that starts at p.
 *
 * The caller must make sure that p[0] is not the string terminator.
 * Continuation bytes are checked one at a time and a NUL byte is never a
 * valid continuation, so this never reads beyond the terminator.
 *
 * @param p first byte of the sequence within a NUL-terminated string
 * @return the sequence length in bytes (1 to 4), or 0 if the bytes at p
 *         do not form a well-formed UTF-8 sequence
 */
static size_t CheckUTF8Sequence( const uint8_t *p )
{
    /* Smallest code point that legitimately needs 2, 3 and 4 bytes;
     * anything below is an overlong encoding. */
    static const uint32_t min_cp[] = { 0, 0, 0x80, 0x800, 0x10000 };
    const uint8_t lead = p[0];

    if (lead < 0x80) // 7-bit ASCII character -> OK
        return 1;
    if (lead < 0xC0) // stray continuation byte -> error
        return 0;
    if (lead > 0xF4) // would encode a value beyond Unicode -> error
        return 0;

    const size_t charlen = (lead < 0xE0) ? 2 : (lead < 0xF0) ? 3 : 4;

    /* Payload bits of the lead byte: 5, 4 or 3 bits for 2, 3 or 4 bytes */
    uint32_t cp = lead & (0x7Fu >> charlen);

    for (size_t i = 1; i < charlen; i++)
    {
        if ((p[i] & 0xC0) != 0x80) // not a continuation byte
            return 0;
        cp = (cp << 6) | (p[i] & 0x3Fu);
    }

    if (cp > 0x10FFFF) // beyond Unicode
        return 0;
    if (cp >= 0xD800 && cp < 0xE000) // UTF-16 surrogate
        return 0;
    if (cp < min_cp[charlen]) // overlong
        return 0;
    return charlen;
}

/**
 * Checks that a NUL-terminated string is well-formed UTF-8, optionally
 * patching it in place.
 *
 * Invalid lead bytes, stray continuation bytes, truncated sequences,
 * overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points
 * above U+10FFFF are all treated as errors.
 *
 * @param str string to check (must not be NULL); written to only if rep
 *            is not zero
 * @param rep replacement byte. If zero, the string is left untouched and
 *            checking stops at the first error. Otherwise each offending
 *            byte is overwritten with rep and checking resumes at the
 *            following byte.
 * @return str if the whole string was valid, NULL if at least one error
 *         was found (whether or not it was replaced)
 */
static char *CheckUTF8( char *str, char rep )
{
    uint8_t *ptr = (uint8_t *)str;
    assert (str != NULL);

    while (*ptr != '\0')
    {
        const size_t charlen = CheckUTF8Sequence (ptr);

        if (charlen != 0)
        {
            ptr += charlen;
            continue;
        }

        if (rep == 0)
            return NULL;
        /* Only the first byte is replaced; the remaining bytes of the bad
         * sequence are re-examined on their own in later iterations. */
        *ptr++ = rep;
        str = NULL;
    }
    return str;
}
/** /**
* Replaces invalid/overlong UTF-8 sequences with question marks. * Replaces invalid/overlong UTF-8 sequences with question marks.
...@@ -441,7 +375,19 @@ static char *CheckUTF8( char *str, char rep ) ...@@ -441,7 +375,19 @@ static char *CheckUTF8( char *str, char rep )
*/ */
char *EnsureUTF8( char *str ) char *EnsureUTF8( char *str )
{ {
return CheckUTF8( str, '?' ); char *ret = str;
size_t n;
uint32_t cp;
while ((n = vlc_towc (str, &cp)) != 0)
if (likely(n != (size_t)-1))
str += n;
else
{
*str++ = '?';
ret = NULL;
}
return ret;
} }
...@@ -454,7 +400,15 @@ char *EnsureUTF8( char *str ) ...@@ -454,7 +400,15 @@ char *EnsureUTF8( char *str )
*/ */
const char *IsUTF8( const char *str ) const char *IsUTF8( const char *str )
{ {
return CheckUTF8( (char *)str, 0 ); size_t n;
uint32_t cp;
while ((n = vlc_towc (str, &cp)) != 0)
if (likely(n != (size_t)-1))
str += n;
else
return NULL;
return str;
} }
/** /**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment