wincp.c 6.97 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
/*****************************************************************************
 * wincp.c: Guessing "local" ANSI code page on Microsoft Windows®
 *****************************************************************************
 *
 * Copyright © 2006-2007 Rémi Denis-Courmont
 * $Id$
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*** We need your help to complete this file!! Look for FIXME ***/

25 26 27 28
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

29
#include <vlc_common.h>
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61

#ifndef WIN32
# include <locale.h>
#else
# include <windows.h>
#endif

#ifdef __APPLE__
#   include <errno.h>
#   include <string.h>
#endif

#include <vlc_charset.h>


#ifndef WIN32 /* should work on Win32, but useless */
/**
 * Tests whether a locale name starts with one of the two-character codes
 * packed in a table.
 *
 * @param tab concatenation of two-character codes (e.g. "he" "iw" "yi"),
 *            nul-terminated
 * @param locale locale name; its first two bytes are compared, so it must
 *               hold at least two characters
 * @return 0 if the first two bytes of the locale equal one of the codes,
 *         1 otherwise (beware: 0 means "match", as with strcmp())
 */
static inline int locale_match (const char *tab, const char *locale)
{
    const char *code = tab;

    /* Walk the table one two-character entry at a time. */
    while (*code != '\0')
    {
        if (!memcmp (code, locale, 2))
            return 0;
        code += 2;
    }

    return 1;
}


/**
 * Picks a legacy (non-Unicode) character encoding for a locale name.
 *
 * Only the leading two-letter language code of the locale is examined,
 * except for Chinese where the territory code is also taken into account.
 *
 * @param locale locale name (e.g. "fr_FR"), may be NULL
 * @return a fallback characters encoding to be used, given a locale.
 *         This is always a pointer to a string literal, never NULL.
 */
static const char *FindFallbackEncoding (const char *locale)
{
    /* No usable language code (unset, too short for locale_match(), or the
     * POSIX locale): fall back to the Western default. */
    if ((locale == NULL) || (strlen (locale) < 2)
     || !strcasecmp (locale, "POSIX"))
        return "CP1252"; /* Yeah, this is totally western-biased */


    /*** The ISO-8859 series (anything but Asia) ***/
    // Latin-1 Western-European languages (ISO-8859-1)
    static const char western[] =
        "aa" "af" "an" "br" "ca" "da" "de" "en" "es" "et" "eu" "fi" "fo" "fr"
        "ga" "gd" "gl" "gv" "id" "is" "it" "kl" "kw" "mg" "ms" "nb" "nl" "nn"
        "no" "oc" "om" "pt" "so" "sq" "st" "sv" "tl" "uz" "wa" "xh" "zu"
        "eo" "mt" "cy";
    if (!locale_match (western, locale))
        return "CP1252"; // Compatible Microsoft superset

    // Latin-2 Slavic languages (ISO-8859-2)
    static const char slavic[] = "bs" "cs" "hr" "hu" "pl" "ro" "sk" "sl";
    if (!locale_match (slavic, locale))
        return "CP1250"; // CP1250 is more common, but incompatible

    // Latin-3 Southern European languages (ISO-8859-3)
    // "eo" and "mt" -> Latin-1 instead, I presume(?).
    // "tr" -> ISO-8859-9 instead

    // Latin-4 North-European languages (ISO-8859-4)
    // -> Latin-1 instead

    /* Cyrillic alphabet languages (ISO-8859-5) */
    static const char cyrillic[] = "be" "bg" "mk" "ru" "sr";
    if (!locale_match (cyrillic, locale))
        return "CP1251"; // KOI8, ISO-8859-5 and CP1251 are incompatible(?)

    /* Arabic (ISO-8859-6) */
    if (!locale_match ("ar", locale))
        // FIXME: someone check if we should return CP1256 or ISO-8859-6
        return "CP1256"; // CP1256 is(?) more common, but incompatible(?)

    /* Greek (ISO-8859-7) */
    if (!locale_match ("el", locale))
        // FIXME: someone check if we should return CP1253 or ISO-8859-7
        return "CP1253"; // CP1253 is(?) more common and less incompatible

    /* Hebrew (ISO-8859-8) */
    if (!locale_match ("he" "iw" "yi", locale))
        return "ISO-8859-8"; // CP1255 is reportedly screwed up

    /* Latin-5 Turkish (ISO-8859-9) */
    if (!locale_match ("tr" "ku", locale))
        return "CP1254"; // Compatible Microsoft superset

    /* Latin-6 “North-European” languages (ISO-8859-10) */
    /* It is so much north European that glibc only uses that for Luganda
     * which is spoken in Uganda... unless someone complains, I'm not
     * using this one; let's fallback to CP1252 here. */

    // ISO-8859-11 does arguably not exist. Thai is handled below.

    // ISO-8859-12 really doesn't exist.

    // Latin-7 Baltic languages (ISO-8859-13)
    if (!locale_match ("lt" "lv" "mi", locale))
        // FIXME: mi = New Zealand, doesn't sound baltic!
        return "CP1257"; // Compatible Microsoft superset

    // Latin-8 Celtic languages (ISO-8859-14)
    // "cy" -> use Latin-1 instead (most likely English or French)

    // Latin-9 (ISO-8859-15) -> see Latin-1

    // Latin-10 (ISO-8859-16) does not seem to be used

    /*** KOI series ***/
    // For Russian, we use CP1251
    if (!locale_match ("uk", locale))
        return "KOI8-U";

    if (!locale_match ("tg", locale))
        return "KOI8-T";

    /*** Asia ***/
    // Japanese: the ISO 639-1 language code is "ja" ("jp" is the country
    // code; it is still accepted since earlier versions matched it).
    if (!locale_match ("ja" "jp", locale))
        return "SHIFT-JIS"; // Shift-JIS is way more common than EUC-JP

    // Korean
    if (!locale_match ("ko", locale))
        return "CP949"; // Microsoft non-standard superset of EUC-KR

    // Thai
    if (!locale_match ("th", locale))
        return "TIS-620";

    // Vietnamese (FIXME: more infos needed)
    // The ISO 639-1 language code is "vi"; "vt" is kept only because
    // earlier versions matched it.
    if (!locale_match ("vi" "vt", locale))
        /* VISCII is probably a bad idea as it is not extended ASCII */
        /* glibc has TCVN5712-1 */
        return "CP1258";

    /* Kazakh (FIXME: more infos needed) */
    if (!locale_match ("kk", locale))
        return "PT154";

    // Chinese. The politically incompatible character sets.
    if (!locale_match ("zh", locale))
    {
        /* Skip the language code and its separator ("zh_TW" or "zh-TW") so
         * that the territory code can be matched. The length check makes
         * sure two characters follow the separator. */
        if ((strlen (locale) >= 5)
         && ((locale[2] == '_') || (locale[2] == '-')))
            locale += 3;

        // Hong Kong
        if (!locale_match ("HK", locale))
            return "BIG5-HKSCS"; /* FIXME: use something else? */

        // Taiwan island
        if (!locale_match ("TW", locale))
            return "BIG5";

        // People's Republic of China and Singapore
        /*
         * GB18030 can represent any Unicode code point
         * (like UTF-8), while remaining compatible with GBK
         * FIXME: is it compatible with GB2312? if not, should we
         * use GB2312 instead?
         */
        return "GB18030";
    }

    /* Unknown language */
    return "ASCII";
}
#endif

/**
 * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
 * text files according to the system's locale settings. It is only a best
 * guess.
 *
 * On non-Windows builds, the locale name is read from the LC_ALL, LC_CTYPE
 * and LANG environment variables (first one that is set and non-empty) and
 * mapped with FindFallbackEncoding(). On Windows, the name is derived from
 * the ANSI code page number and cached in a static buffer.
 *
 * @return the name of the suggested encoding; the string must not be
 *         modified nor freed by the caller.
 */
const char *GetFallbackEncoding( void )
{
#ifndef WIN32
    const char *psz_lang;

    psz_lang = getenv ("LC_ALL");
    if ((psz_lang == NULL) || !*psz_lang)
    {
        psz_lang = getenv ("LC_CTYPE");
        if ((psz_lang == NULL) || !*psz_lang)
            psz_lang = getenv ("LANG");
    }

    /* psz_lang may still be NULL here; FindFallbackEncoding() copes. */
    return FindFallbackEncoding (psz_lang);
#else
    /* Lazily-filled cache; an empty string means "not computed yet".
     * NOTE: it is filled without any locking. */
    static char buf[16] = "";

    if (buf[0] == 0)
    {
        /* GetACP() returns an unsigned value; keep it unsigned so that it
         * matches the %u conversion below. */
        unsigned cp = GetACP ();

        switch (cp)
        {
            case 1255: // Hebrew, CP1255 screws up somewhat
                strcpy (buf, "ISO-8859-8");
                break;
            default:
                snprintf (buf, sizeof (buf), "CP%u", cp);
                break;
        }
    }
    return buf;
#endif
}