Commit fdfc6ad1 authored by Marian Ďurkovič

Perform charset detection and conversion to UTF-8 also for SDT fields.

parent dd537f5c
...@@ -2571,6 +2571,134 @@ static void ValidateDVBMeta( demux_t *p_demux, int i_pid ) ...@@ -2571,6 +2571,134 @@ static void ValidateDVBMeta( demux_t *p_demux, int i_pid )
#ifdef TS_USE_DVB_SI #ifdef TS_USE_DVB_SI
/**
 * Converts a length-delimited text field to a newly allocated UTF-8 string.
 *
 * FIXME: same as dvbsi_to_utf8 from dvb access
 *
 * The first byte of the field selects the source character set:
 *  - >= 0x20: no selector byte, the whole field is read as ISO_8859-1;
 *  - 0x01..0x0b, 0x11..0x15: one selector byte, mapped by the table below;
 *  - 0x10: three-byte selector (0x10 0x00 n) meaning ISO_8859-n;
 *  - anything else (or a malformed 0x10 selector): the whole field,
 *    selector included, is read as UTF-8.
 * NOTE(review): the mapping presumably follows ETSI EN 300 468 Annex A;
 * verify against the specification before changing any entry.
 *
 * @param psz_instring raw field bytes; only i_length bytes are read, no
 *                     terminating NUL is required
 * @param i_length     number of bytes available in psz_instring
 * @return a malloc()ed, NUL-terminated string that the caller must free(),
 *         or NULL if the field is empty or the allocation fails
 */
static char *EITConvertToUTF8( const unsigned char *psz_instring,
                               size_t i_length )
{
    const char *psz_encoding;
    char *psz_outstring;
    char psz_encbuf[sizeof( "ISO_8859-123" )];
    size_t i_in, i_out, offset = 1;
    vlc_iconv_t iconv_handle;

    if( i_length < 1 ) return NULL;

    if( psz_instring[0] >= 0x20 )
    {
        psz_encoding = "ISO_8859-1";
        /* According to the specification, this should be ISO6937,
         * but it seems Latin-1 is used instead. */
        offset = 0;
    }
    else switch( psz_instring[0] )
    {
        case 0x01:
            psz_encoding = "ISO_8859-5";
            break;
        case 0x02:
            psz_encoding = "ISO_8859-6";
            break;
        case 0x03:
            psz_encoding = "ISO_8859-7";
            break;
        case 0x04:
            psz_encoding = "ISO_8859-8";
            break;
        case 0x05:
            psz_encoding = "ISO_8859-9";
            break;
        case 0x06:
            psz_encoding = "ISO_8859-10";
            break;
        case 0x07:
            psz_encoding = "ISO_8859-11";
            break;
        case 0x08:
            /* No such charset: vlc_iconv_open() is expected to fail and the
             * raw-copy fallback below is used. */
            psz_encoding = "ISO_8859-12";
            break;
        case 0x09:
            psz_encoding = "ISO_8859-13";
            break;
        case 0x0a:
            psz_encoding = "ISO_8859-14";
            break;
        case 0x0b:
            psz_encoding = "ISO_8859-15";
            break;
        case 0x10:
#warning Is Latin-10 (psz_instring[2] == 16) really illegal?
            if( i_length < 3 || psz_instring[1] != 0x00 || psz_instring[2] > 15
             || psz_instring[2] == 0 )
            {
                /* Malformed selector: treat the whole field as UTF-8. */
                psz_encoding = "UTF-8";
                offset = 0;
            }
            else
            {
                /* Bounded write; the value is 1..15 so it always fits. */
                snprintf( psz_encbuf, sizeof( psz_encbuf ), "ISO_8859-%u",
                          (unsigned)psz_instring[2] );
                psz_encoding = psz_encbuf;
                offset = 3;
            }
            break;
        case 0x11:
#warning Is there a BOM or do we use a fixed endianess?
            psz_encoding = "UTF-16";
            break;
        case 0x12:
            psz_encoding = "KSC5601-1987";
            break;
        case 0x13:
            psz_encoding = "GB2312"; /* GB-2312-1980 */
            break;
        case 0x14:
            psz_encoding = "BIG-5";
            break;
        case 0x15:
            psz_encoding = "UTF-8";
            break;
        default:
            /* invalid */
            psz_encoding = "UTF-8";
            offset = 0;
    }

    i_in = i_length - offset;
    /* Worst-case expansion; one extra byte is allocated on top of what the
     * converter may use so the final terminator always fits. */
    i_out = i_in * 6;

    psz_outstring = malloc( i_out + 1 );
    if( !psz_outstring )
    {
        return NULL;
    }

    iconv_handle = vlc_iconv_open( "UTF-8", psz_encoding );
    if( iconv_handle == (vlc_iconv_t)(-1) )
    {
        /* Invalid character set (e.g. ISO_8859-12) */
        memcpy( psz_outstring, &psz_instring[offset], i_in );
        psz_outstring[i_in] = '\0';
        EnsureUTF8( psz_outstring );
    }
    else
    {
        const char *psz_in = (const char *)&psz_instring[offset];
        char *psz_out = psz_outstring;

        while( vlc_iconv( iconv_handle, &psz_in, &i_in,
                          &psz_out, &i_out ) == (size_t)(-1) )
        {
            /* Nothing left to skip: stop instead of wrapping i_in around
             * and reading past the end of the field. */
            if( i_in == 0 )
                break;

            /* skip naughty byte. This may fail terribly for multibyte stuff,
             * but what can we do anyway? */
            psz_in++;
            i_in--;
            vlc_iconv( iconv_handle, NULL, NULL, NULL, NULL ); /* reset */
        }
        vlc_iconv_close( iconv_handle );

        *psz_out = '\0';
    }
    return psz_outstring;
}
static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt ) static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
{ {
demux_sys_t *p_sys = p_demux->p_sys; demux_sys_t *p_sys = p_demux->p_sys;
...@@ -2634,14 +2762,13 @@ static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt ) ...@@ -2634,14 +2762,13 @@ static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
"DVB MHP service" "DVB MHP service"
}; };
dvbpsi_service_dr_t *pD = dvbpsi_DecodeServiceDr( p_dr ); dvbpsi_service_dr_t *pD = dvbpsi_DecodeServiceDr( p_dr );
char str1[257]; char *str1 = NULL;
char str2[257]; char *str2 = NULL;
memcpy( str1, pD->i_service_provider_name, str1 = EITConvertToUTF8(pD->i_service_provider_name,
pD->i_service_provider_name_length ); pD->i_service_provider_name_length);
str1[pD->i_service_provider_name_length] = '\0'; str2 = EITConvertToUTF8(pD->i_service_name,
memcpy( str2, pD->i_service_name, pD->i_service_name_length ); pD->i_service_name_length);
str2[pD->i_service_name_length] = '\0';
msg_Dbg( p_demux, " - type=%d provider=%s name=%s", msg_Dbg( p_demux, " - type=%d provider=%s name=%s",
pD->i_service_type, str1, str2 ); pD->i_service_type, str1, str2 );
...@@ -2650,6 +2777,8 @@ static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt ) ...@@ -2650,6 +2777,8 @@ static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
vlc_meta_SetPublisher( p_meta, str1 ); vlc_meta_SetPublisher( p_meta, str1 );
if( pD->i_service_type >= 0x01 && pD->i_service_type <= 0x10 ) if( pD->i_service_type >= 0x01 && pD->i_service_type <= 0x10 )
psz_type = ppsz_type[pD->i_service_type]; psz_type = ppsz_type[pD->i_service_type];
free( str1 );
free( str2 );
} }
} }
...@@ -2739,133 +2868,6 @@ static int EITConvertDuration( uint32_t i_duration ) ...@@ -2739,133 +2868,6 @@ static int EITConvertDuration( uint32_t i_duration )
} }
#undef CVT_FROM_BCD #undef CVT_FROM_BCD
/**
 * Converts a length-delimited text field to a newly allocated UTF-8 string.
 *
 * FIXME: same as dvbsi_to_utf8 from dvb access
 *
 * The first byte of the field selects the source character set:
 *  - >= 0x20: no selector byte, the whole field is read as ISO_8859-1;
 *  - 0x01..0x0b, 0x11..0x15: one selector byte, mapped by the table below;
 *  - 0x10: three-byte selector (0x10 0x00 n) meaning ISO_8859-n;
 *  - anything else (or a malformed 0x10 selector): the whole field,
 *    selector included, is read as UTF-8.
 * NOTE(review): the mapping presumably follows ETSI EN 300 468 Annex A;
 * verify against the specification before changing any entry.
 *
 * @param psz_instring raw field bytes; only i_length bytes are read, no
 *                     terminating NUL is required
 * @param i_length     number of bytes available in psz_instring
 * @return a malloc()ed, NUL-terminated string that the caller must free(),
 *         or NULL if the field is empty or the allocation fails
 */
static char *EITConvertToUTF8( const unsigned char *psz_instring,
                               size_t i_length )
{
    const char *psz_encoding;
    char *psz_outstring;
    char psz_encbuf[sizeof( "ISO_8859-123" )];
    size_t i_in, i_out, offset = 1;
    vlc_iconv_t iconv_handle;

    if( i_length < 1 ) return NULL;

    if( psz_instring[0] >= 0x20 )
    {
        psz_encoding = "ISO_8859-1";
        /* According to the specification, this should be ISO6937,
         * but it seems Latin-1 is used instead. */
        offset = 0;
    }
    else switch( psz_instring[0] )
    {
        case 0x01:
            psz_encoding = "ISO_8859-5";
            break;
        case 0x02:
            psz_encoding = "ISO_8859-6";
            break;
        case 0x03:
            psz_encoding = "ISO_8859-7";
            break;
        case 0x04:
            psz_encoding = "ISO_8859-8";
            break;
        case 0x05:
            psz_encoding = "ISO_8859-9";
            break;
        case 0x06:
            psz_encoding = "ISO_8859-10";
            break;
        case 0x07:
            psz_encoding = "ISO_8859-11";
            break;
        case 0x08:
            /* No such charset: vlc_iconv_open() is expected to fail and the
             * raw-copy fallback below is used. */
            psz_encoding = "ISO_8859-12";
            break;
        case 0x09:
            psz_encoding = "ISO_8859-13";
            break;
        case 0x0a:
            psz_encoding = "ISO_8859-14";
            break;
        case 0x0b:
            psz_encoding = "ISO_8859-15";
            break;
        case 0x10:
#warning Is Latin-10 (psz_instring[2] == 16) really illegal?
            if( i_length < 3 || psz_instring[1] != 0x00 || psz_instring[2] > 15
             || psz_instring[2] == 0 )
            {
                /* Malformed selector: treat the whole field as UTF-8. */
                psz_encoding = "UTF-8";
                offset = 0;
            }
            else
            {
                /* Bounded write; the value is 1..15 so it always fits. */
                snprintf( psz_encbuf, sizeof( psz_encbuf ), "ISO_8859-%u",
                          (unsigned)psz_instring[2] );
                psz_encoding = psz_encbuf;
                offset = 3;
            }
            break;
        case 0x11:
#warning Is there a BOM or do we use a fixed endianess?
            psz_encoding = "UTF-16";
            break;
        case 0x12:
            psz_encoding = "KSC5601-1987";
            break;
        case 0x13:
            psz_encoding = "GB2312"; /* GB-2312-1980 */
            break;
        case 0x14:
            psz_encoding = "BIG-5";
            break;
        case 0x15:
            psz_encoding = "UTF-8";
            break;
        default:
            /* invalid */
            psz_encoding = "UTF-8";
            offset = 0;
    }

    i_in = i_length - offset;
    /* Worst-case expansion; one extra byte is allocated on top of what the
     * converter may use so the final terminator always fits. */
    i_out = i_in * 6;

    psz_outstring = malloc( i_out + 1 );
    if( !psz_outstring )
    {
        return NULL;
    }

    iconv_handle = vlc_iconv_open( "UTF-8", psz_encoding );
    if( iconv_handle == (vlc_iconv_t)(-1) )
    {
        /* Invalid character set (e.g. ISO_8859-12) */
        memcpy( psz_outstring, &psz_instring[offset], i_in );
        psz_outstring[i_in] = '\0';
        EnsureUTF8( psz_outstring );
    }
    else
    {
        const char *psz_in = (const char *)&psz_instring[offset];
        char *psz_out = psz_outstring;

        while( vlc_iconv( iconv_handle, &psz_in, &i_in,
                          &psz_out, &i_out ) == (size_t)(-1) )
        {
            /* Nothing left to skip: stop instead of wrapping i_in around
             * and reading past the end of the field. */
            if( i_in == 0 )
                break;

            /* skip naughty byte. This may fail terribly for multibyte stuff,
             * but what can we do anyway? */
            psz_in++;
            i_in--;
            vlc_iconv( iconv_handle, NULL, NULL, NULL, NULL ); /* reset */
        }
        vlc_iconv_close( iconv_handle );

        *psz_out = '\0';
    }
    return psz_outstring;
}
static void EITCallBack( demux_t *p_demux, static void EITCallBack( demux_t *p_demux,
dvbpsi_eit_t *p_eit, bool b_current_following ) dvbpsi_eit_t *p_eit, bool b_current_following )
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment