diff --git a/libavformat/id3v2.c b/libavformat/id3v2.c index 6b1c4606e928b6aab2631872c6b27b1946db4104..dedc0450b3a06c6184fe65aada819270c12074f7 100644 --- a/libavformat/id3v2.c +++ b/libavformat/id3v2.c @@ -81,6 +81,7 @@ static void read_ttag(AVFormatContext *s, int taglen, const char *key) char *q, dst[512]; int len, dstlen = sizeof(dst) - 1; unsigned genre; + unsigned int (*get)(ByteIOContext*) = get_be16; dst[0] = 0; if (taglen < 1) @@ -99,11 +100,38 @@ static void read_ttag(AVFormatContext *s, int taglen, const char *key) *q = 0; break; + case 1: /* UTF-16 with BOM */ + taglen -= 2; + switch (get_be16(s->pb)) { + case 0xfffe: + get = get_le16; + case 0xfeff: + break; + default: + av_log(s, AV_LOG_ERROR, "Incorrect BOM value in tag %s.\n", key); + return; + } + // fall-through + + case 2: /* UTF-16BE without BOM */ + q = dst; + while (taglen > 1 && q - dst < dstlen - 7) { + uint32_t ch; + uint8_t tmp; + + GET_UTF16(ch, ((taglen -= 2) >= 0 ? get(s->pb) : 0), break;) + PUT_UTF8(ch, tmp, *q++ = tmp;) + } + *q = 0; + break; + case 3: /* UTF-8 */ len = FFMIN(taglen, dstlen - 1); get_buffer(s->pb, dst, len); dst[len] = 0; break; + default: + av_log(s, AV_LOG_WARNING, "Unknown encoding in tag %s\n.", key); } if (!strcmp(key, "genre") diff --git a/libavutil/common.h b/libavutil/common.h index 0797a79ac05eb871276e89c5c48bebcce4ae3b41..a6303d9872a9abc14073172e5142bae0cacacbc0 100644 --- a/libavutil/common.h +++ b/libavutil/common.h @@ -265,6 +265,30 @@ static inline av_const int av_ceil_log2(int x) }\ } +/*! + * \def GET_UTF16(val, GET_16BIT, ERROR) + * Converts a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form + * \param val is the output and should be of type uint32_t. It holds the converted + * UCS-4 character and should be a left value. + * \param GET_16BIT gets two bytes of UTF-16 encoded data converted to native endianness. + * It can be a function or a statement whose return value or evaluated value is of type + * uint16_t. 
It will be executed up to 2 times. + * \param ERROR action that should be taken when an invalid UTF-16 surrogate is + * returned from GET_16BIT. It should be a statement that jumps out of the macro, + * like exit(), goto, return, break, or continue. + */ +#define GET_UTF16(val, GET_16BIT, ERROR)\ + val = GET_16BIT;\ + {\ + unsigned int hi = val - 0xD800;\ + if (hi < 0x800) {\ + val = GET_16BIT - 0xDC00;\ + if (val > 0x3FFU || hi > 0x3FFU)\ + ERROR\ + val += (hi<<10) + 0x10000;\ + }\ + }\ + /*! * \def PUT_UTF8(val, tmp, PUT_BYTE) * Converts a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long).