Commit 9a55e838 authored by Rémi Denis-Courmont

Simpler UTF-8 check functions + rudimentary unit test

parent d2dff446
......@@ -363,8 +363,9 @@ endif
# Unit/regression test
###############################################################################
if USE_LIBTOOL
check_PROGRAMS = test_i18n_atof test_url
check_PROGRAMS = test_i18n_atof test_url test_utf8
TESTS = $(check_PROGRAMS)
endif
CFLAGS_tests = `$(VLC_CONFIG) --cflags libvlc`
......@@ -375,6 +376,9 @@ test_i18n_atof_CFLAGS = $(CFLAGS_tests)
test_url_SOURCES = test/url.c
test_url_LDADD = libvlc.la
test_url_CFLAGS = $(CFLAGS_tests)
endif
test_utf8_SOURCES = test/utf8.c
test_utf8_LDADD = libvlc.la
test_utf8_CFLAGS = $(CFLAGS_tests)
FORCE:
/*****************************************************************************
* utf8.c: Test for UTF-8 encoding/decoding stuff
*****************************************************************************
* Copyright (C) 2006 Rémi Denis-Courmont
* $Id$
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#include <vlc/vlc.h>
#include "vlc_charset.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
/**
 * Runs one UTF-8 validation / sanitisation test case.
 *
 * The case is treated as "already valid" exactly when @p in and @p out
 * compare equal. The checks below then expect IsUTF8() and EnsureUTF8() to
 * return non-NULL for valid input and NULL otherwise, and expect
 * EnsureUTF8() to leave its (writable) argument equal to @p out.
 *
 * Any failed check prints a diagnostic on stdout and terminates the process
 * with exit status 1 to 4 (one status per check); a failed strdup() aborts.
 *
 * @param in   string handed to the functions under test
 * @param out  expected content of the string after EnsureUTF8()
 */
static void test (const char *in, const char *out)
{
    /* No rewriting expected <=> the input is expected to be valid. */
    const bool expect_valid = (strcmp (in, out) == 0);

    /* EnsureUTF8() is given a private, writable copy of the input. */
    char *copy = strdup (in);
    if (copy == NULL)
        abort ();

    if (!expect_valid)
        printf ("\"%s\" should be rewritten as \"%s\"...\n", in, out);
    else
        printf ("\"%s\" should be accepted...\n", in);

    /* Check 1: read-only validation agrees with the expectation. */
    const bool accepted = (IsUTF8 (in) != NULL);
    if (accepted != expect_valid)
    {
        printf (" ERROR: IsUTF8 (%s) failed\n", in);
        exit (1);
    }

    /* Check 2: sanitisation reports the same verdict. */
    const bool kept_as_is = (EnsureUTF8 (copy) != NULL);
    if (kept_as_is != expect_valid)
    {
        printf (" ERROR: EnsureUTF8 (%s) failed\n", in);
        exit (2);
    }

    /* Check 3: the sanitised copy is exactly the expected output. */
    if (strcmp (copy, out) != 0)
    {
        printf (" ERROR: got \"%s\"\n", copy);
        exit (3);
    }

    /* Check 4: once sanitised, the copy must be accepted by both functions
     * (IsUTF8() is only consulted if the second EnsureUTF8() pass succeeds). */
    if (EnsureUTF8 (copy) == NULL || IsUTF8 (copy) == NULL)
    {
        printf (" ERROR: EnsureUTF8 (%s) is not UTF-8\n", in);
        exit (4);
    }

    free (copy);
}
/**
 * Test driver: feeds a fixed list of (input, expected output) pairs to
 * test(), in order. test() terminates the process itself on the first
 * failure, so reaching the end means every case passed.
 *
 * @return 0 when all cases pass
 */
int main (void)
{
    static const struct
    {
        const char *in;   /* string under test */
        const char *out;  /* expected string after sanitisation */
    } cases[] =
    {
        { "", "" },
        { "this_should_not_be_modified_1234",
          "this_should_not_be_modified_1234" },
        { "\xFF", "?" }, /* invalid byte */
        { "\xEF\xBB\xBFHello", "\xEF\xBB\xBFHello" }, /* BOM */
        { "\x00\xE9", "" }, /* no conversion past end of string */
        { "T\xC3\xA9l\xC3\xA9vision \xE2\x82\xAC", "Télévision €" },
        { "T\xE9l\xE9vision", "T?l?vision" },
        { "\xC1\x94\xC3\xa9l\xC3\xA9vision", "??élévision" }, /* overlong */
        { "Hel\xF0\x83\x85\x87lo", "Hel????lo" }, /* more overlong */
    };

    /* Make stdout unbuffered before any test output is produced
     * (presumably so progress lines are visible even if a check crashes). */
    (void)setvbuf (stdout, NULL, _IONBF, 0);

    for (size_t i = 0; i < sizeof (cases) / sizeof (cases[0]); i++)
        test (cases[i].in, cases[i].out);

    return 0;
}
......@@ -639,147 +639,63 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
static char *CheckUTF8( char *str, char rep )
#define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))
{
unsigned char *ptr, c;
uint8_t *ptr = (uint8_t *)str;
assert (str != NULL);
ptr = (unsigned char *)str;
while( (c = *ptr) != '\0' )
{
/* US-ASCII, 1 byte */
if( c <= 0x7F )
ptr++; /* OK */
else
/* 2 bytes */
if( ( c >= 0xC2 ) && ( c <= 0xDF ) )
{
c = ptr[1];
if( isutf8cont( c ) )
ptr += 2; /* OK */
else
goto error;
}
else
/* 3 bytes */
if( c == 0xE0 )
{
c = ptr[1];
if( ( c >= 0xA0 ) && ( c <= 0xBF ) )
{
c = ptr[2];
if( isutf8cont( c ) )
ptr += 3; /* OK */
else
goto error;
}
else
goto error;
}
else
if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC )
|| ( c == 0xEE ) || ( c == 0xEF ) )
{
c = ptr[1];
if( isutf8cont( c ) )
{
c = ptr[2];
if( isutf8cont( c ) )
ptr += 3; /* OK */
else
goto error;
}
else
goto error;
}
else
if( c == 0xED )
{
c = ptr[1];
if( ( c >= 0x80 ) && ( c <= 0x9F ) )
for (;;)
{
c = ptr[2];
if( isutf8cont( c ) )
ptr += 3; /* OK */
else
goto error;
}
else
goto error;
}
else
/* 4 bytes */
if( c == 0xF0 )
{
c = ptr[1];
if( ( c >= 0x90 ) && ( c <= 0xBF ) )
{
c = ptr[2];
if( isutf8cont( c ) )
uint8_t c = ptr[0];
int charlen = -1;
if (c == '\0')
break;
for (int i = 0; i < 7; i++)
if ((c >> (7 - i)) == ((0xff >> (7 - i)) ^ 1))
{
c = ptr[3];
if( isutf8cont( c ) )
ptr += 4; /* OK */
else
goto error;
}
else
goto error;
}
else
goto error;
charlen = i;
break;
}
else
if( ( c >= 0xF1 ) && ( c <= 0xF3 ) )
{
c = ptr[1];
if( isutf8cont( c ) )
{
c = ptr[2];
if( isutf8cont( c ) )
switch (charlen)
{
c = ptr[3];
if( isutf8cont( c ) )
ptr += 4; /* OK */
goto error;
}
else
goto error;
}
else
case 0: // 7-bit ASCII character -> OK
ptr++;
continue;
case -1: // 1111111x -> error
case 1: // continuation byte -> error
goto error;
}
else
if( c == 0xF4 )
{
c = ptr[1];
if( ( c >= 0x80 ) && ( c <= 0x8F ) )
{
c = ptr[2];
if( isutf8cont( c ) )
assert (charlen >= 2);
uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
for (int i = 1; i < charlen; i++)
{
c = ptr[3];
if( isutf8cont( c ) )
ptr += 4; /* OK */
else
goto error;
}
else
assert (cp < (1 << 26));
c = ptr[i];
if ((c == '\0') // unexpected end of string
|| ((c >> 6) != 2)) // not a continuation byte
goto error;
cp = (cp << 6) | (ptr[i] & 0x3f);
}
else
if (cp < 128) // overlong (special case for ASCII)
goto error;
}
else
if (cp < (1u << (5 * charlen - 3))) // overlong
goto error;
ptr += charlen;
continue;
error:
if( rep == 0 )
error:
if (rep == 0)
return NULL;
*ptr++ = '?';
*ptr++ = rep;
str = NULL;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment