C++ 字符集转换

转码整理, 资料来源于网络

charset.h

#pragma once

#include <iostream>
#include <string>

// Wide-character string <-> multi-byte string in the ANSI code page.
std::string  UnicodeToAnsi(const std::wstring& unicode);
std::wstring AnsiToUnicode(const std::string& ansi);

// ANSI code page <-> UTF-8 (both held in std::string).
std::string  AnsiToUtf8(const std::string& strSrc);
std::string  Utf8ToAnsi(const std::string& strSrc);

// Wide-character string <-> UTF-8.
std::string  UnicodeToUtf8(const std::wstring& wstrSrc);
std::wstring Utf8ToUnicode(const std::string& strSrc);

// GBK <-> UTF-8.
std::string  GBKToUtf8(const std::string& gbk);
std::string  Utf8ToGBK(const std::string& utf8);

// GB2312 (Windows code page 936) <-> wide-character string.
std::wstring GB2312ToUnicode(const std::string& gb2312);
std::string  UnicodeToGB2312(const std::wstring& unicode);

// BIG5 (Windows code page 950) <-> wide-character string.
std::wstring BIG5ToUnicode(const std::string& big5);
std::string  UnicodeToBIG5(const std::wstring& unicode);

// Traditional Chinese in BIG5 <-> Simplified Chinese in GB2312.
// NOTE(review): GB2312ToFBIG5 takes its argument by value, unlike its
// siblings; the definition uses the same signature, so both must change together.
std::string  FBIG5ToGB2312(const std::string& big5);
std::string  GB2312ToFBIG5(const std::string gb2312);

// Reports whether the first `size` bytes at pBuffer look like UTF-8 text.
bool IsUTF8(const void* pBuffer, long size);

main.cpp

#include "charset.h"

// Writes the first `len` bytes at `bytes` to stdout, each as two lowercase
// hex digits followed by a single space. No newline is emitted.
void showHex(const char* bytes, int len) {
    const char* cursor = bytes;
    for (int left = len; left > 0; --left, ++cursor) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended.
        printf("%02x ", (unsigned char)*cursor);
    }
}

void showHex(std::string charset, std::string str) {
    printf("%10s: ", charset.data());
    showHex(str.data(), str.size());
    printf("\n");
}

void showHex(std::string charset, std::wstring str) {
    printf("%10s: ", charset.data());
    showHex((char*)str.data(), 2 * str.size());
    printf("\n");
}

int main(int argc, char* argv[])
{
    std::wstring wstr(L"中abc国");
    std::string str("中abc国");

    std::string ansi;
    std::string utf8;
    std::string gbk;
    std::wstring unicode;

    showHex("unicode", wstr);
    showHex("ansi", str);

    ansi = UnicodeToAnsi(wstr); showHex("ansi", ansi);

    unicode = AnsiToUnicode(ansi); showHex("unicode", unicode);

    utf8 = AnsiToUtf8(str); showHex("utf8", utf8);
    ansi = Utf8ToAnsi(utf8); showHex("ansi", ansi);

    utf8 = UnicodeToUtf8(wstr); showHex("utf8", utf8);
    unicode = Utf8ToUnicode(utf8); showHex("unicode", unicode);

    gbk = Utf8ToGBK(utf8); showHex("gbk", gbk);
    utf8 = GBKToUtf8(gbk); showHex("utf8", utf8);

    getchar();
    return 0;
}

charset.cpp

#include "charset.h"
#include <Windows.h>


/**
 * Converts a wide-character (UTF-16) string to a multi-byte string in the
 * calling thread's ANSI code page (CP_THREAD_ACP).
 *
 * @param unicode Wide string to convert. Its full length is converted, so
 *                embedded L'\0' characters do not cut the input short.
 * @return The converted bytes, sized exactly to the conversion output (no
 *         terminating '\0' element). Empty when the input is empty or the
 *         conversion fails.
 *
 * NOTE(review): AnsiToUnicode in this file uses CP_ACP, not CP_THREAD_ACP;
 * the two differ once the thread locale has been changed. Confirm which one
 * is intended. Inputs longer than INT_MAX characters are not supported.
 */
std::string UnicodeToAnsi(const std::wstring& unicode)
{
    if (unicode.empty()) {
        // WideCharToMultiByte fails on a zero-length source; answer directly.
        return std::string();
    }

    // Pass the explicit length instead of -1: with -1 the API counts and
    // writes the terminator, which would become part of the std::string.
    const int srcLen = static_cast<int>(unicode.size());
    const int size = WideCharToMultiByte(CP_THREAD_ACP, 0, unicode.data(), srcLen, NULL, 0, NULL, NULL);
    if (size <= 0) {
        return std::string();
    }

    std::string strRet(static_cast<size_t>(size), 0);
    const int len = WideCharToMultiByte(CP_THREAD_ACP, 0, unicode.data(), srcLen, &strRet[0], size, NULL, NULL);

    // Keep only what was actually written (0 on failure).
    strRet.resize(len > 0 ? static_cast<size_t>(len) : 0);
    return strRet;
}

/**
 * Converts a multi-byte string in the system ANSI code page (CP_ACP) to a
 * wide-character (UTF-16) string.
 *
 * @param ansi Bytes to convert. The full length is converted, so embedded
 *             '\0' bytes do not cut the input short.
 * @return The converted wide string, sized exactly to the conversion output
 *         (no terminating L'\0' element). Empty when the input is empty or
 *         the conversion fails. Inputs longer than INT_MAX bytes are not
 *         supported.
 */
std::wstring AnsiToUnicode(const std::string& ansi)
{
    if (ansi.empty()) {
        // MultiByteToWideChar fails on a zero-length source; answer directly.
        return std::wstring();
    }

    // Pass the explicit length instead of -1: with -1 the API counts and
    // writes the terminator, which would become part of the std::wstring.
    const int srcLen = static_cast<int>(ansi.size());
    const int size = MultiByteToWideChar(CP_ACP, 0, ansi.data(), srcLen, NULL, 0);
    if (size <= 0) {
        return std::wstring();
    }

    std::wstring wstrRet(static_cast<size_t>(size), 0);
    const int len = MultiByteToWideChar(CP_ACP, 0, ansi.data(), srcLen, &wstrRet[0], size);

    // Keep only what was actually written (0 on failure).
    wstrRet.resize(len > 0 ? static_cast<size_t>(len) : 0);
    return wstrRet;
}

/**
 * Converts a multi-byte string in the system ANSI code page (CP_ACP) to
 * UTF-8, going through a UTF-16 intermediate.
 *
 * @param ansi Bytes in the ANSI code page.
 * @return Whatever UnicodeToUtf8 produces for the UTF-16 intermediate.
 *         An empty input or a failed ANSI decode yields an empty
 *         intermediate.
 *
 * The ANSI -> UTF-16 step passes the explicit source length, so this
 * function does not put a terminating L'\0' into the intermediate string.
 * Inputs longer than INT_MAX bytes are not supported.
 */
std::string AnsiToUtf8(const std::string& ansi)
{
    std::wstring wstrTemp;

    if (!ansi.empty()) {
        const int srcLen = static_cast<int>(ansi.size());
        const int size = MultiByteToWideChar(CP_ACP, 0, ansi.data(), srcLen, NULL, 0);
        if (size > 0) {
            wstrTemp.resize(static_cast<size_t>(size));
            const int len = MultiByteToWideChar(CP_ACP, 0, ansi.data(), srcLen, &wstrTemp[0], size);
            // Keep only what was actually written (0 on failure).
            wstrTemp.resize(len > 0 ? static_cast<size_t>(len) : 0);
        }
    }

    return UnicodeToUtf8(wstrTemp);
}

/**
 * Converts a UTF-8 string to the system ANSI code page (CP_ACP), going
 * through the UTF-16 string returned by Utf8ToUnicode.
 *
 * @param utf8 UTF-8 encoded bytes.
 * @return The ANSI encoding of every character Utf8ToUnicode returned.
 *         Empty when that intermediate is empty or the conversion fails.
 *
 * The UTF-16 -> ANSI step passes the explicit source length, so this
 * function adds no terminating '\0' of its own to the result.
 */
std::string Utf8ToAnsi(const std::string& utf8)
{
    const std::wstring wstrTemp = Utf8ToUnicode(utf8);
    if (wstrTemp.empty()) {
        // WideCharToMultiByte fails on a zero-length source; answer directly.
        return std::string();
    }

    const int srcLen = static_cast<int>(wstrTemp.size());
    const int size = WideCharToMultiByte(CP_ACP, 0, wstrTemp.data(), srcLen, NULL, 0, NULL, NULL);
    if (size <= 0) {
        return std::string();
    }

    std::string strRet(static_cast<size_t>(size), 0);
    const int len = WideCharToMultiByte(CP_ACP, 0, wstrTemp.data(), srcLen, &strRet[0], size, NULL, NULL);

    // Keep only what was actually written (0 on failure).
    strRet.resize(len > 0 ? static_cast<size_t>(len) : 0);
    return strRet;
}

/**
 * Encodes a wide-character (UTF-16) string as UTF-8.
 *
 * @param unicode Wide string to encode. Its full length is converted, so
 *                embedded L'\0' characters do not cut the input short.
 * @return The UTF-8 bytes, sized exactly to the conversion output (no
 *         terminating '\0' element). Empty when the input is empty or the
 *         conversion fails. Inputs longer than INT_MAX characters are not
 *         supported.
 */
std::string UnicodeToUtf8(const std::wstring& unicode)
{
    if (unicode.empty()) {
        // WideCharToMultiByte fails on a zero-length source; answer directly.
        return std::string();
    }

    // Ask the API for the exact output size rather than guessing a bound.
    // The explicit length (not -1) keeps the terminator out of the result.
    const int srcLen = static_cast<int>(unicode.size());
    const int size = WideCharToMultiByte(CP_UTF8, 0, unicode.data(), srcLen, NULL, 0, NULL, NULL);
    if (size <= 0) {
        return std::string();
    }

    std::string strRet(static_cast<size_t>(size), 0);
    const int len = WideCharToMultiByte(CP_UTF8, 0, unicode.data(), srcLen, &strRet[0], size, NULL, NULL);

    // Keep only what was actually written (0 on failure).
    strRet.resize(len > 0 ? static_cast<size_t>(len) : 0);
    return strRet;
}

/**
 * Decodes a UTF-8 string into a wide-character (UTF-16) string.
 *
 * @param utf8 UTF-8 bytes to decode. The full length is converted, so
 *             embedded '\0' bytes do not cut the input short.
 * @return The decoded wide string, sized exactly to the conversion output
 *         (no terminating L'\0' element). Empty when the input is empty or
 *         the conversion fails. Inputs longer than INT_MAX bytes are not
 *         supported.
 */
std::wstring Utf8ToUnicode(const std::string& utf8)
{
    if (utf8.empty()) {
        // MultiByteToWideChar fails on a zero-length source; answer directly.
        return std::wstring();
    }

    // The explicit length (not -1) keeps the terminator out of the result.
    const int srcLen = static_cast<int>(utf8.size());
    const int size = MultiByteToWideChar(CP_UTF8, 0, utf8.data(), srcLen, NULL, 0);
    if (size <= 0) {
        return std::wstring();
    }

    std::wstring wstrRet(static_cast<size_t>(size), 0);
    const int len = MultiByteToWideChar(CP_UTF8, 0, utf8.data(), srcLen, &wstrRet[0], size);

    // Keep only what was actually written (0 on failure).
    wstrRet.resize(len > 0 ? static_cast<size_t>(len) : 0);
    return wstrRet;
}


std::string GBKToUtf8(const std::string& gbk)
{
    return AnsiToUtf8(gbk);
}

std::string Utf8ToGBK(const std::string& utf8)
{
    return Utf8ToAnsi(utf8);
}

/**
 * Structural check of whether a byte buffer is UTF-8.
 *
 * Each lead byte selects a sequence length (1 to 4 bytes) and every
 * following byte of the sequence must be a continuation byte (10xxxxxx).
 * Four-byte sequences (lead bytes 0xF0..0xF4, i.e. code points from U+10000
 * up) are accepted.
 *
 * This is a shape check only: overlong encodings and surrogate code points
 * are not rejected.
 *
 * @param pBuffer Start of the bytes to inspect.
 * @param size    Number of bytes at pBuffer.
 * @return false as soon as a stray continuation byte, an invalid lead byte
 *         or a bad continuation byte is found; true otherwise. A multi-byte
 *         sequence cut off by the end of the buffer is tolerated (true), so
 *         a buffer that is a prefix of a larger text still passes. A null
 *         or empty buffer returns true.
 */
bool IsUTF8(const void* pBuffer, long size)
{
    if (pBuffer == NULL || size <= 0) {
        return true;
    }

    const unsigned char* cur = static_cast<const unsigned char*>(pBuffer);
    long remaining = size;

    while (remaining > 0)
    {
        const unsigned char lead = *cur;
        long seqLen = 0;

        if (lead < 0x80) {          /* 0xxxxxxx: ASCII */
            seqLen = 1;
        }
        else if (lead < 0xC0) {     /* 10xxxxxx: continuation byte with no lead */
            return false;
        }
        else if (lead < 0xE0) {     /* 110xxxxx: 2-byte sequence */
            seqLen = 2;
        }
        else if (lead < 0xF0) {     /* 1110xxxx: 3-byte sequence */
            seqLen = 3;
        }
        else if (lead < 0xF5) {     /* 11110xxx up to 0xF4: 4-byte sequence */
            seqLen = 4;
        }
        else {                      /* 0xF5..0xFF never appear in UTF-8 */
            return false;
        }

        // Compare lengths instead of forming pointers like `end - 2`, which
        // would point before the buffer when fewer bytes remain.
        if (seqLen > remaining) {
            break;                  /* truncated tail: tolerated */
        }

        for (long i = 1; i < seqLen; ++i) {
            if ((cur[i] & 0xC0) != 0x80) {
                return false;
            }
        }

        cur += seqLen;
        remaining -= seqLen;
    }

    return true;
}



//GB2312 转换成 Unicode
std::wstring GB2312ToUnicode(const std::string& gb2312)
{
    UINT nCodePage = 936; //GB2312
    int size = MultiByteToWideChar(nCodePage, 0, gb2312.c_str(), -1, NULL, 0);

    std::wstring wstrRet(size, 0);
    MultiByteToWideChar(nCodePage, 0, gb2312.c_str(), -1, (LPWSTR)wstrRet.c_str(), size);

    return wstrRet;
}

//BIG5 转换成 Unicode
std::wstring BIG5ToUnicode(const std::string& big5)
{
    UINT nCodePage = 950; //BIG5
    int size = MultiByteToWideChar(nCodePage, 0, big5.c_str(), -1, NULL, 0);

    std::wstring wstrRet(size, 0);
    MultiByteToWideChar(nCodePage, 0, big5.c_str(), -1, (LPWSTR)wstrRet.c_str(), size);

    return wstrRet;
}

//Unicode 转换成 GB2312
std::string UnicodeToGB2312(const std::wstring& unicode)
{
    UINT nCodePage = 936; //GB2312
    int size = WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, NULL, 0, NULL, NULL);

    std::string strRet(size, 0);
    WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, (LPSTR)strRet.c_str(), size, NULL, NULL);

    return strRet;
}

//Unicode 转换成 BIG5
std::string UnicodeToBIG5(const std::wstring& unicode)
{
    UINT nCodePage = 950; //BIG5
    int size = WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, NULL, 0, NULL, NULL);

    std::string strRet(size, 0);
    WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, (LPSTR)strRet.c_str(), size, NULL, NULL);

    return strRet;
}

//繁体中文BIG5 转换成 简体中文 GB2312
std::string FBIG5ToGB2312(const std::string& big5)
{
    LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED), SORT_CHINESE_PRC);
    std::wstring unicode = BIG5ToUnicode(big5);

    std::string gb2312 = UnicodeToGB2312(unicode);
    int size = LCMapStringA(lcid, LCMAP_SIMPLIFIED_CHINESE, gb2312.c_str(), -1, NULL, 0);

    std::string strRet(size, 0);
    LCMapStringA(0x0804, LCMAP_SIMPLIFIED_CHINESE, gb2312.c_str(), -1, (LPSTR)strRet.c_str(), size);

    return strRet;
}

//简体中文 GB2312 转换成 繁体中文BIG5
std::string GB2312ToFBIG5(const std::string gb2312)
{
    LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED), SORT_CHINESE_PRC);
    int size = LCMapStringA(lcid, LCMAP_TRADITIONAL_CHINESE, gb2312.c_str(), -1, NULL, 0);

    std::string strRet(size, 0);
    LCMapStringA(lcid, LCMAP_TRADITIONAL_CHINESE, gb2312.c_str(), -1, (LPSTR)strRet.c_str(), size);

    std::wstring unicode = GB2312ToUnicode(strRet);
    std::string big5 = UnicodeToBIG5(unicode);

    return big5;
}