Commit d10c3fa4 authored by Rémi Denis-Courmont

Factor EIT text conversion in a .h file

Also add support for the Private Use Area characters and check for
valid ISO 8859 character sets.
parent 327a22aa
SOURCES_dvb = \
access.c \
scan.c scan.h \
linux_dvb.c \
scan.c scan.h \
en50221.c en50221.h \
../../demux/dvb-text.h \
http.c \
dvb.h \
$(NULL)
......@@ -54,6 +54,7 @@
#endif
#include "dvb.h"
#include "../../demux/dvb-text.h"
#undef DEBUG_TPDU
#define HLCI_WAIT_CAM_READY 0
......@@ -1594,7 +1595,7 @@ static char *MMIGetText( cam_t *p_cam, uint8_t **pp_apdu, int *pi_size )
*pp_apdu += l + 4;
*pi_size -= l + 4;
return dvbsi_to_utf8((char*)d,l);
return vlc_from_EIT(d,l);
}
/*****************************************************************************
......@@ -2253,110 +2254,3 @@ void en50221_End( cam_t * p_cam )
/* Leave the CAM configured, so that it can be reused in another
* program. */
}
/* FIXME same as EITConvertToUTF8 from TS demux */
/**
 * Converts a DVB SI text item to UTF-8.
 *
 * The first byte of the item selects the source character set; items whose
 * first byte is 0x20 or above carry no selector and are decoded as ISO 6937.
 * An unknown or malformed selector makes the whole buffer (selector
 * included) be treated as UTF-8.
 *
 * @param psz_instring raw text item, including the selector byte(s)
 * @param i_length size of the item in bytes
 * @return a newly allocated nul-terminated string, or NULL if the item is
 *         empty or memory could not be allocated.
 */
char *dvbsi_to_utf8( const char *psz_instring, size_t i_length )
{
    const char *psz_encoding;
    char psz_encbuf[sizeof( "ISO_8859-123" )];
    size_t offset = 1;

    if( i_length < 1 ) return NULL;

    /* Inspect the selector bytes as unsigned values: plain char may be
     * signed, in which case bytes >= 0x80 would compare as negative, miss
     * the ">= 0x20" test and slip through the range check below. */
    const unsigned char *p_sel = (const unsigned char *)psz_instring;

    if( p_sel[0] >= 0x20 )
    {
        psz_encoding = "ISO_6937";
        offset = 0;
    }
    else switch( p_sel[0] )
    {
    case 0x01:
        psz_encoding = "ISO_8859-5";
        break;
    case 0x02:
        psz_encoding = "ISO_8859-6";
        break;
    case 0x03:
        psz_encoding = "ISO_8859-7";
        break;
    case 0x04:
        psz_encoding = "ISO_8859-8";
        break;
    case 0x05:
        psz_encoding = "ISO_8859-9";
        break;
    case 0x06:
        psz_encoding = "ISO_8859-10";
        break;
    case 0x07:
        psz_encoding = "ISO_8859-11";
        break;
    case 0x08:
        psz_encoding = "ISO_8859-12";
        break;
    case 0x09:
        psz_encoding = "ISO_8859-13";
        break;
    case 0x0a:
        psz_encoding = "ISO_8859-14";
        break;
    case 0x0b:
        psz_encoding = "ISO_8859-15";
        break;
    case 0x10:
#warning Is Latin-10 (psz_instring[2] == 16) really illegal?
        if( i_length < 3 || p_sel[1] != 0x00 || p_sel[2] > 15
         || p_sel[2] == 0 )
        {
            psz_encoding = "UTF-8";
            offset = 0;
        }
        else
        {
            /* Bounded write; the value is known to be within 1..15 here. */
            snprintf( psz_encbuf, sizeof( psz_encbuf ), "ISO_8859-%u",
                      (unsigned)p_sel[2] );
            psz_encoding = psz_encbuf;
            offset = 3;
        }
        break;
    case 0x11:
#warning Is there a BOM or do we use a fixed endianess?
        psz_encoding = "UTF-16";
        break;
    case 0x12:
        psz_encoding = "KSC5601-1987";
        break;
    case 0x13:
        psz_encoding = "GB2312"; /* GB-2312-1980 */
        break;
    case 0x14:
        psz_encoding = "BIG-5";
        break;
    case 0x15:
        psz_encoding = "UTF-8";
        break;
    default:
        /* invalid */
        psz_encoding = "UTF-8";
        offset = 0;
    }

    /* Skip the selector byte(s), if any. */
    psz_instring += offset;
    i_length -= offset;

    char *psz = FromCharset( psz_encoding, psz_instring, i_length );
    if( psz == NULL )
    {   /* Invalid character set (e.g. ISO_8859-12): keep the raw bytes and
         * let EnsureUTF8() sanitize them. */
        psz = strndup( (const char *)psz_instring, i_length );
        if( unlikely(psz == NULL) )
            return NULL;
        EnsureUTF8( psz );
    }

    /* Convert EIT-coded CR/LFs (U+008A, i.e. 0xC2 0x8A in UTF-8).
     * The match is overwritten in place, so searching again from p makes
     * progress. */
    for( char *p = strstr( psz, "\xc2\x8a" ); p != NULL;
         p = strstr( p, "\xc2\x8a" ) )
    {
        p[0] = ' ';
        p[1] = '\n';
    }

    return psz;
}
......@@ -139,5 +139,3 @@ int en50221_CloseMMI( cam_t *, unsigned i_slot );
en50221_mmi_object_t *en50221_GetMMIObject( cam_t *, unsigned i_slot );
void en50221_SendMMIObject( cam_t *, unsigned i_slot, en50221_mmi_object_t * );
void en50221_End( cam_t * );
char *dvbsi_to_utf8( const char *psz_instring, size_t i_length );
......@@ -32,6 +32,7 @@
#include <vlc_block.h>
#include <vlc_dialog.h>
#include <vlc_fs.h>
#include <vlc_charset.h>
#include <sys/types.h>
......@@ -50,7 +51,7 @@
#endif
#include "scan.h"
#include "en50221.h" // FIXME
#include "../../demux/dvb-text.h"
typedef enum
{
......@@ -850,7 +851,8 @@ void scan_session_Destroy( scan_t *p_scan, scan_session_t *p_session )
if( s )
{
if( !s->psz_name )
s->psz_name = dvbsi_to_utf8( (const char *)pD->i_service_name, pD->i_service_name_length );
s->psz_name = vlc_from_EIT( pD->i_service_name,
pD->i_service_name_length );
if( s->type == SERVICE_UNKNOWN )
{
......
......@@ -11,7 +11,7 @@ SOURCES_wav = wav.c
SOURCES_live555 = live555.cpp ../access/mms/asf.c ../access/mms/buffer.c
SOURCES_nsv = nsv.c
SOURCES_real = real.c
SOURCES_ts = ts.c ../mux/mpeg/csa.c
SOURCES_ts = ts.c ../mux/mpeg/csa.c dvb-text.h
SOURCES_ps = ps.c ps.h
SOURCES_mod = mod.c dummy.cpp
SOURCES_pva = pva.c
......
/*****************************************************************************
* dvb-text.h:
*****************************************************************************
* Copyright (C) 2007-2011 the VideoLAN team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
/**
 * Converts a DVB SI text item to UTF-8.
 * Refer to ETSI EN 300 468 annex A.
 *
 * The first byte selects the character encoding:
 *  - 0x20 and above: there is no selector, the whole item is ISO 6937;
 *  - 0x01-0x07 and 0x09-0x0B: ISO 8859-(selector + 4);
 *  - 0x10: followed by 0x00 and a table number N (1-11 or 13-15),
 *    meaning ISO 8859-N;
 *  - 0x11: UCS-2 big endian, 0x12: EUC-KR, 0x13: GB2312, 0x14: Big5,
 *    0x15: UTF-8.
 * Any other selector is rejected.
 *
 * @param buf DVB SI text item, including the selector byte(s)
 * @param length size of the item in bytes
 * @return a heap-allocated nul-terminated UTF-8 string or NULL on error.
 */
static char *vlc_from_EIT (const void *buf, size_t length)
{
    if (unlikely(length == 0))
        return NULL;

    char encbuf[12]; /* large enough for "ISO_8859-15" */
    const char *encoding = encbuf;

    const char *in = buf;
    size_t offset = 1;
    unsigned char c = *in;

    if (c >= 0x20)
    {
        offset = 0;
        encoding = "ISO_6937";
    }
    else if ((1u << c) & 0x0EFEu) /* 1-7, 9-11 -> ISO 8859-(c+4) */
    {   /* c < 0x20 here, so the unsigned shift is always well-defined */
        snprintf (encbuf, sizeof (encbuf), "ISO_8859-%u", 4u + c);
    }
    else switch (c)
    {
        case 0x10: /* two more bytes */
            offset = 3;
            if (length < 3 || in[1] != 0x00)
                return NULL;

            c = in[2];
            /* c may be any byte value: check the range before shifting,
             * as shifting by the width of the type or more is undefined. */
            if (c >= 16 || !((1u << c) & 0xEFFEu))
                return NULL; /* only 1-11, 13-15 -> ISO 8859-(c) */
            snprintf (encbuf, sizeof (encbuf), "ISO_8859-%u", (unsigned)c);
            break;
        case 0x11: /* the BMP */
            encoding = "UCS-2BE";
            break;
        case 0x12:
            /* DVB has no clue about Korean. KS X 1001 (a.k.a. KS C 5601) is a
             * character set, not a character encoding... So we assume EUC-KR.
             * It is an encoding of KS X 1001. In practice, I guess nobody uses
             * this in any real DVB system. */
            encoding = "EUC-KR";
            break;
        case 0x13: /* GB-2312-1980 */
            encoding = "GB2312";
            break;
        case 0x14: /* Big5 subset of the BMP */
            encoding = "BIG-5";
            break;
        case 0x15:
            encoding = "UTF-8";
            break;
        /* 0x1F (operator-specific?, would use offset = 2) is not supported
         * and is rejected like any other unknown selector. */
        default:
            return NULL;
    }

    /* Skip the selector byte(s), if any. */
    in += offset;
    length -= offset;

    char *out = FromCharset (encoding, in, length);
    if (out == NULL)
    {   /* Fallback: keep the raw bytes and sanitize them */
        out = strndup (in, length);
        if (unlikely(out == NULL))
            return NULL;
        EnsureUTF8 (out);
    }

    /* Convert control codes */
    for (char *p = strchr (out, '\xC2'); p; p = strchr (p + 1, '\xC2'))
    {
        /* We have valid UTF-8, so 0xC2 is followed by a continuation byte. */
        /* 0x80-0x85,0x88-0x89 are reserved.
         * 0x86-0x87 are identical to Unicode and Latin-1.
         * 0x8A is CR/LF.
         * 0x8B-0x9F are unspecified. */
        if (p[1] == '\x8A')
            memcpy (p, "\r\n", 2);
    }

    /* Private use area */
    for (char *p = strchr (out, '\xEE'); p; p = strchr (p + 1, '\xEE'))
    {
        /* Within UTF-8, 0xEE is followed by two continuation bytes. */
        if (p[1] != '\x82')
            continue;
        if (p[2] == '\x8A')
            memcpy (p, "\r\r\n", 3); /* we need three bytes, so two CRs ;) */
    }

    return out;
}
......@@ -2584,120 +2584,15 @@ static void ValidateDVBMeta( demux_t *p_demux, int i_pid )
p_sys->b_dvb_meta = false;
}
#include "dvb-text.h"
/**
 * Converts a DVB SI text item to UTF-8.
 *
 * The decoding itself is delegated to vlc_from_EIT() so that the character
 * set selection logic is not duplicated here.
 *
 * @param psz_instring raw text item, including any encoding selector byte(s)
 * @param i_length size of the item in bytes
 * @param b_broken if true, bypass the DVB selector rules and decode the
 *                 whole buffer as ISO 8859-1. According to ETSI EN 300 468
 *                 Annex A, unprefixed text should be ISO 6937, but some
 *                 broadcasters use a different charset.
 * @return the converted string, or NULL on error. The string returned by
 *         vlc_from_EIT() is heap-allocated; presumably the same holds for
 *         FromCharset() (verify against vlc_charset.h).
 */
static char *EITConvertToUTF8( const unsigned char *psz_instring,
                               size_t i_length,
                               bool b_broken )
{
    if( b_broken )
        return FromCharset( "ISO_8859-1", psz_instring, i_length );
    return vlc_from_EIT( psz_instring, i_length );
}
static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment