Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc
Commits
9ce1a13f
Commit
9ce1a13f
authored
Oct 08, 2010
by
Rémi Denis-Courmont
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Refactor EnsureUTF8 and IsUTF8
parent
abe105d5
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
75 additions
and
54 deletions
+75
-54
src/text/unicode.c
src/text/unicode.c
+75
-54
No files found.
src/text/unicode.c
View file @
9ce1a13f
...
...
@@ -2,7 +2,7 @@
* unicode.c: Unicode <-> locale functions
*****************************************************************************
* Copyright (C) 2005-2006 the VideoLAN team
* Copyright © 2005-2010 Rémi Denis-Courmont
*
* Authors: Rémi Denis-Courmont <rem # videolan.org>
*
...
...
@@ -273,73 +273,74 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
}
/**
 * Converts the first character from a UTF-8 sequence into a code point.
 *
 * @param str an UTF-8 bytes sequence (must not be NULL)
 * @param pwc address where the decoded code point is written; set to 0 for
 *            an empty string, left untouched if the sequence is invalid
 * @return 0 if str points to an empty string, i.e. the first character is NUL;
 * number of bytes that the first character occupies (from 1 to 4) otherwise;
 * (size_t)-1 if the byte sequence was not a valid UTF-8 sequence.
 */
static size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
    const size_t invalid = (size_t)-1;
    const uint8_t *ptr = (const uint8_t *)str;

    assert (str != NULL);

    uint8_t c = ptr[0];

    if (c == '\0')
    {
        *pwc = 0;
        return 0;
    }

    if (c < 0x80)
    {   // 7-bit ASCII character -> OK
        *pwc = c;
        return 1;
    }

    /* Classify the lead byte: sequence length, payload bits of the lead
     * byte, and the smallest code point that legitimately needs that many
     * bytes (anything smaller is an overlong encoding). */
    size_t charlen;
    uint32_t cp, min;

    if (c < 0xC0)
        return invalid; // stray continuation byte
    else if (c < 0xE0)
    {
        charlen = 2;
        cp = c & 0x1F;
        min = 0x80;
    }
    else if (c < 0xF0)
    {
        charlen = 3;
        cp = c & 0x0F;
        min = 0x800;
    }
    else if (c <= 0xF4)
    {
        charlen = 4;
        cp = c & 0x07;
        min = 0x10000;
    }
    else
        return invalid; // lead byte can only encode values beyond Unicode

    /* A NUL terminator is not a continuation byte, so a truncated sequence
     * is rejected here before any byte past the terminator is read. */
    for (size_t i = 1; i < charlen; i++)
    {
        c = ptr[i];
        if ((c >> 6) != 2)
            return invalid; // not a continuation byte
        cp = (cp << 6) | (c & 0x3F);
    }

    if (cp < min)
        return invalid; // overlong encoding
    if (cp >= 0xD800 && cp < 0xE000)
        return invalid; // UTF-16 surrogate
    if (cp > 0x10FFFF)
        return invalid; // beyond Unicode

    *pwc = cp;
    return charlen;
}
/**
* Replaces invalid/overlong UTF-8 sequences with question marks.
* Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
...
...
@@ -349,7 +350,19 @@ static char *CheckUTF8( char *str, char rep )
*/
char *EnsureUTF8 (char *str)
{
    char *ret = str; /* stays equal to the input unless a byte is replaced */
    size_t n;
    uint32_t cp;

    /* Walk the string one decoded character at a time until the NUL. */
    while ((n = vlc_towc (str, &cp)) != 0)
    {
        if (likely(n != (size_t)-1))
            str += n;
        else
        {
            /* Overwrite only the offending byte, then resume decoding at
             * the next byte. */
            *str++ = '?';
            ret = NULL;
        }
    }
    return ret;
}
...
...
@@ -362,7 +375,15 @@ char *EnsureUTF8( char *str )
*/
const char *IsUTF8 (const char *str)
{
    /* Scan with a separate cursor so that the value returned for a valid
     * string is the pointer the caller passed in, not the address of the
     * terminating NUL. */
    const char *cur = str;
    size_t n;
    uint32_t cp;

    while ((n = vlc_towc (cur, &cp)) != 0)
    {
        if (likely(n != (size_t)-1))
            cur += n;
        else
            return NULL; /* first invalid sequence: stop immediately */
    }
    return str;
}
/**
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment