C#将汉字转换为拼音首字母

关于这个话题, 我以前曾长期使用过一个简便的算法, 代码如下:

/// <summary>
/// Maps a single Chinese character to the lower-case first letter of its pinyin
/// by locating it between boundary characters in string sort order.
/// </summary>
/// <param name="str">The text to classify (intended to be one character).</param>
/// <returns>
/// A one-letter string "a".."z" (there are no "i", "u" or "v" groups), or
/// <paramref name="str"/> unchanged when it sorts before "吖" or does not sort
/// before the last boundary "咗".
/// </returns>
/// <remarks>
/// The lookup relies on <c>String.CompareTo(string)</c>, so the outcome follows
/// whatever ordering that comparison applies at run time.
/// </remarks>
private string ToPinyinSingle(string str)
{
    // Each pair is { exclusive upper boundary, letter returned for anything below it }.
    // The pairs are probed in order, so the first boundary that str sorts before wins.
    string[][] groups =
    {
        new[] { "八", "a" },
        new[] { "嚓", "b" },
        new[] { "咑", "c" },
        new[] { "妸", "d" },
        new[] { "发", "e" },
        new[] { "旮", "f" },
        new[] { "铪", "g" },
        new[] { "讥", "h" },
        new[] { "咔", "j" },
        new[] { "垃", "k" },
        new[] { "嘸", "l" },
        new[] { "拏", "m" },
        new[] { "噢", "n" },
        new[] { "妑", "o" },
        new[] { "七", "p" },
        new[] { "亽", "q" },
        new[] { "仨", "r" },
        new[] { "他", "s" },
        new[] { "哇", "t" },
        new[] { "夕", "w" },
        new[] { "丫", "x" },
        new[] { "帀", "y" },
        new[] { "咗", "z" }
    };

    // Anything sorting before the first "a" character is passed through untouched.
    if (str.CompareTo("吖") < 0)
    {
        return str;
    }

    foreach (string[] group in groups)
    {
        string upperBound = group[0];
        if (str.CompareTo(upperBound) < 0)
        {
            return group[1];
        }
    }

    // Sorts at or after the last boundary: not covered by the table.
    return str;
}

这个函数只处理单个汉字, 简单地加个循环就可以让它处理文字串了.

在.net 3.5下, 它一直工作得很好, 虽然偶尔也有出错的时候, 但是概率极低, 基本上可以忽略不计.

然而后来我把项目升级到.net 4.0以后, 发现出错的几率直线上升, 已经高到无法容忍的程度了(例如, "梅" 会返回"L"), 简单查了一下, 没找到微软关于String.CompareTo函数有什么变化的说明, 束手无策, 于是换用另一个也很简单的算法(http://topic.csdn.net/u/20090219/12/61745e3a-a39e-4f4d-8985-67d124236694.html):

/// <summary>
/// Returns the upper-case pinyin initial of a GB2312 level-1 Chinese character.
/// </summary>
/// <param name="cn">
/// The text to convert. Only the first two bytes of its GB2312 encoding are examined,
/// so it is meant to hold a single character.
/// </param>
/// <returns>
/// "A".."Z" when the first two bytes form a level-1 hanzi code (45217 up to, but not
/// including, 55290); "?" when the text encodes to two or more bytes that are not such
/// a code (this includes level-2 hanzi); <paramref name="cn"/> itself when it encodes
/// to fewer than two bytes.
/// </returns>
public static string getSpell(string cn)
{
    // The lookup table below is expressed in GB2312 byte values, so the bytes must be
    // produced with that encoding explicitly rather than with the machine default.
    byte[] arrCN = System.Text.Encoding.GetEncoding("GB2312").GetBytes(cn);
    if (arrCN.Length <= 1)
    {
        return cn;
    }

    int area = arrCN[0];
    int pos = arrCN[1];

    // GB2312 trail bytes are 0xA1-0xFE. A pair with any other trail byte is not a
    // GB2312 hanzi and must not be matched against the ranges by accident.
    if (pos < 0xA1 || pos > 0xFE)
    {
        return "?";
    }

    int code = (area << 8) + pos;

    // areacode[i] is the first code whose initial is the i-th letter of the alphabet.
    // Letters with no characters (I, U, V) repeat the next start, giving an empty range.
    int[] areacode = {45217,45253,45761,46318,46826,47010,47297,47614,48119,48119,49062,49324,49896,50371,50614,50622,50906,51387,51446,52218,52698,52698,52698,52980,53689,54481};

    // One past the last level-1 hanzi code; closes the range for 'Z'.
    const int levelOneEnd = 55290;

    for (int i = 0; i < areacode.Length; i++)
    {
        int max = (i + 1 < areacode.Length) ? areacode[i + 1] : levelOneEnd;
        if (areacode[i] <= code && code < max)
        {
            return ((char)('A' + i)).ToString();
        }
    }

    return "?";
}

但是这个函数出错的概率也很高, 例如"闫""窦""圳" 等都无法识别, 追查了一下原因, 发现原来对GB2312编码来说, 存放规定是这样的:

01-09区为特殊符号。

16-55区为一级汉字,按拼音排序。

56-87区为二级汉字,按部首/笔画排序。

每个汉字及符号以两个字节来表示。第一个字节称为“高位字节”,第二个字节称为“低位字节”。

“高位字节”使用了0xA1-0xF7(把01-87区的区号加上0xA0),“低位字节”使用了0xA1-0xFE(把01-94加上0xA0)。

例如“啊”字在大多数程序中,会以0xB0A1储存。(与区位码对比:0xB0=0xA0+16,0xA1=0xA0+1)

上述几个字位置码都大于55290, 显然是二级汉字, 这个算法就处理不了了, 换言之, 这种写法只能用于处理一级汉字. 这当然是不可接受的.

后来翻查良久, 终于找到一个用C++写的算法, 可以同时处理一级汉字和二级汉字(http://download.csdn.net/detail/ronjay/1955072), 我把它改写成了C#, 代码如下:

/// <summary>
/// Converts a Chinese character to the upper-case first letter of its pinyin using
/// its GB2312 code. Level-1 hanzi are resolved through code ranges; level-2 hanzi
/// (regions 56-87) are resolved through a per-region lookup table.
/// </summary>
public class ChineseToPinYin
{
    #region " Static data "

    // One row per level-2 region (row 0 = region 56 ... row 31 = region 87).
    // Column k of a row holds the initial for trail byte 0xA1 + k.
    private static readonly string[] _regionChar = new string[32]
    {
        "CJWGNSPGCGNESYPBTYYZDXYKYGTDJNNJQMBSGZSCYJSYYQPGKBZGYCYWJKGKLJSWKPJQHYTWDDZLSGMRYPYWWCCKZNKYDG",
        "TTNGJEYKKZYTCJNMCYLQLYPYQFQRPZSLWBTGKJFYXJWZLTBNCXJJJJZXDTTSQZYCDXXHGCKBPHFFSSWYBGMXLPBYLLLHLX",
        "SPZMYJHSOJNGHDZQYKLGJHSGQZHXQGKEZZWYSCSCJXYEYXADZPMDSSMZJZQJYZCDJZWQJBDZBXGZNZCPWHKXHQKMWFBPBY",
        "DTJZZKQHYLYGXFPTYJYYZPSZLFCHMQSHGMXXSXJJSDCSBBQBEFSJYHWWGZKPYLQBGLDLCCTNMAYDDKSSNGYCSGXLYZAYBN",
        "PTSDKDYLHGYMYLCXPYCJNDQJWXQXFYYFJLEJBZRXCCQWQQSBNKYMGPLBMJRQCFLNYMYQMSQTRBCJTHZTQFRXQHXMJJCJLX",
        "QGJMSHZKBSWYEMYLTXFSYDSGLYCJQXSJNQBSCTYHBFTDCYZDJWYGHQFRXWCKQKXEBPTLPXJZSRMEBWHJLBJSLYYSMDXLCL",
        "QKXLHXJRZJMFQHXHWYWSBHTRXXGLHQHFNMNYKLDYXZPWLGGTMTCFPAJJZYLJTYANJGBJPLQGDZYQYAXBKYSECJSZNSLYZH",
        "ZXLZCGHPXZHZNYTDSBCJKDLZAYFMYDLEBBGQYZKXGLDNDNYSKJSHDLYXBCGHXYPKDJMMZNGMMCLGWZSZXZJFZNMLZZTHCS",
        "YDBDLLSCDDNLKJYKJSYCJLKOHQASDKNHCSGANHDAASHTCPLCPQYBSDMPJLPCJOQLCDHJJYSPRCHNWJNLHLYYQYYWZPTCZG",
        "WWMZFFJQQQQYXACLBHKDJXDGMMYDJXZLLSYGXGKJRYWZWYCLZMSSJZLDBYDCFCXYHLXCHYZJQSFQAGMNYXPFRKSSBJLYXY",
        "SYGLNSCMHCWWMNZJJLXXHCHSYDSTTXRYCYXBYHCSMXJSZNPWGPXXTAYBGAJCXLYSDCCWZOCWKCCSBNHCPDYZNFCYYTYCKX",
        "KYBSQKKYTQQXFCWCHCYKELZQBSQYJQCCLMTHSYWHMKTLKJLYCXWHEQQHTQHZPQSQSCFYMMDMGBWHWLGSSLYSDLMLXPTHMJ",
        "HWLJZYHZJXHTXJLHXRSWLWZJCBXMHZQXSDZPMGFCSGLSXYMJSHXPJXWMYQKSMYPLRTHBXFTPMHYXLCHLHLZYLXGSSSSTCL",
        "SLDCLRPBHZHXYYFHBBGDMYCNQQWLQHJJZYWJZYEJJDHPBLQXTQKWHLCHQXAGTLXLJXMSLXHTZKZJECXJCJNMFBYCSFYWYB",
        "JZGNYSDZSQYRSLJPCLPWXSDWEJBJCBCNAYTWGMPAPCLYQPCLZXSBNMSGGFNZJJBZSFZYNDXHPLQKZCZWALSBCCJXJYZGWK",
        "YPSGXFZFCDKHJGXDLQFSGDSLQWZKXTMHSBGZMJZRGLYJBPMLMSXLZJQQHZYJCZYDJWBMJKLDDPMJEGXYHYLXHLQYQHKYCW",
        "CJMYYXNATJHYCCXZPCQLBZWWYTWBQCMLPMYRJCCCXFPZNZZLJPLXXYZTZLGDLDCKLYRZZGQTGJHHHJLJAXFGFJZSLCFDQZ",
        "LCLGJDJCSNCLLJPJQDCCLCJXMYZFTSXGCGSBRZXJQQCTZHGYQTJQQLZXJYLYLBCYAMCSTYLPDJBYREGKLZYZHLYSZQLZNW",
        "CZCLLWJQJJJKDGJZOLBBZPPGLGHTGZXYGHZMYCNQSYCYHBHGXKAMTXYXNBSKYZZGJZLQJDFCJXDYGJQJJPMGWGJJJPKQSB",
        "GBMMCJSSCLPQPDXCDYYKYFCJDDYYGYWRHJRTGZNYQLDKLJSZZGZQZJGDYKSHPZMTLCPWNJAFYZDJCNMWESCYGLBTZCGMSS",
        "LLYXQSXSBSJSBBSGGHFJLWPMZJNLYYWDQSHZXTYYWHMCYHYWDBXBTLMSYYYFSXJCSDXXLHJHFSSXZQHFZMZCZTQCXZXRTT",
        "DJHNNYZQQMNQDMMGYYDXMJGDHCDYZBFFALLZTDLTFXMXQZDNGWQDBDCZJDXBZGSQQDDJCMBKZFFXMKDMDSYYSZCMLJDSYN",
        "SPRSKMKMPCKLGDBQTFZSWTFGGLYPLLJZHGJJGYPZLTCSMCNBTJBQFKTHBYZGKPBBYMTTSSXTBNPDKLEYCJNYCDYKZDDHQH",
        "SDZSCTARLLTKZLGECLLKJLQJAQNBDKKGHPJTZQKSECSHALQFMMGJNLYJBBTMLYZXDCJPLDLPCQDHZYCBZSCZBZMSLJFLKR",
        "ZJSNFRGJHXPDHYJYBZGDLQCSEZGXLBLGYXTWMABCHECMWYJYZLLJJYHLGBDJLSLYGKDZPZXJYYZLWCXSZFGWYYDLYHCLJS",
        "CMBJHBLYZLYCBLYDPDQYSXQZBYTDKYXJYYCNRJMPDJGKLCLJBCTBJDDBBLBLCZQRPPXJCGLZCSHLTOLJNMDDDLNGKAQHQH",
        "JGYKHEZNMSHRPHQQJCHGMFPRXHJGDYCHGHLYRZQLCYQJNZSQTKQJYMSZSWLCFQQQXYFGGYPTQWLMCRNFKKFSYYLQBMQAMM",
        "MYXCTPSHCPTXXZZSMPHPSHMCLMLDQFYQXSZYJDJJZZHQPDSZGLSTJBCKBXYQZJSGPSXQZQZRQTBDKYXZKHHGFLBCSMDLDG",
        "DZDBLZYYCXNNCSYBZBFGLZZXSWMSCCMQNJQSBDQSJTXXMBLTXZCLZSHZCXRQJGJYLXZFJPHYMZQQYDFQJJLZZNZJCDGZYG",
        "CTXMZYSCTLKPHTXHTLBJXJLXSCDQXCBBTJFQZFSLTJBTKQBXXJJLJCHCZDBZJDCZJDCPRNPQCJPFCZLCLZXZDMXMPHJSGZ",
        "GSZZQJYLWTJPFSYASMCJBTZKYCWMYTCSJJLJCQLWZMALBXYFBPNLSFHTGJWEJJXXGLLJSTGSHJQLZFKCGNNDSZFDEQFHBS",
        "AQTGLLBXMMYGSZLDYDQMJJRGBJTKGDHGKBLQKBDMBYLXWCXYTTYBKMRTJZXQJBHLMHMJJZMQASLDCYXYQDLQCAFYWYXQHZ"
    };

    // All lookups are expressed in GB2312 byte values, so the encoding is fixed explicitly.
    private static readonly System.Text.Encoding _encoding = System.Text.Encoding.GetEncoding("GB2312");

    #endregion

    /// <summary>
    /// Tests whether <paramref name="value"/> lies in the closed interval
    /// [<paramref name="lp"/>, <paramref name="hp"/>].
    /// </summary>
    private static bool In(int lp, int hp, int value)
    {
        return ((value <= hp) && (value >= lp));
    }

    /// <summary>
    /// Returns the upper-case pinyin initial of a single Chinese character.
    /// </summary>
    /// <param name="chineseChar">
    /// The text to convert; it is expected to contain exactly one character.
    /// An empty string makes the fallback <c>chineseChar[0]</c> fail with an
    /// index-out-of-range error.
    /// </param>
    /// <returns>
    /// 'A'..'Z' when the text encodes to exactly two bytes that form a GB2312 level-1
    /// or level-2 hanzi code; otherwise the first character of the text unchanged.
    /// </returns>
    public static char GetFirstChar(string chineseChar)
    {
        var bytes = _encoding.GetBytes(chineseChar);

        // Anything that is not a single double-byte code is passed through as-is.
        if (bytes.Length != 2)
            return chineseChar[0];

        return GetChar(bytes[0], bytes[1], chineseChar);
    }

    /// <summary>
    /// Resolves a (lead byte, trail byte) pair to a pinyin initial.
    /// </summary>
    /// <param name="c1">Lead (high) byte of the encoded character.</param>
    /// <param name="c2">Trail (low) byte of the encoded character.</param>
    /// <param name="originChar">Original text; its first character is the fallback result.</param>
    /// <returns>The initial letter, or <c>originChar[0]</c> when the pair is not a known hanzi code.</returns>
    private static char GetChar(byte c1, byte c2, string originChar)
    {
        // GB2312 trail bytes are 0xA1-0xFE. Pairs with any other trail byte are not
        // GB2312 hanzi; without this guard they would fall inside the numeric ranges
        // below (or index the level-2 table) and receive an unrelated letter.
        if (c2 < 0xA1 || c2 > 0xFE)
            return originChar[0];

        int n = (c1 << 8) + c2;

        if (n <= 0xD7F9)
        {
            // Level-1 hanzi are stored in pinyin order, so each initial is one
            // contiguous code range. There are no ranges for I, U and V.
            if (In(0xB0A1, 0xB0C4, n)) return 'A';
            if (In(0xB0C5, 0xB2C0, n)) return 'B';
            if (In(0xB2C1, 0xB4ED, n)) return 'C';
            if (In(0xB4EE, 0xB6E9, n)) return 'D';
            if (In(0xB6EA, 0xB7A1, n)) return 'E';
            if (In(0xB7A2, 0xB8C0, n)) return 'F';
            if (In(0xB8C1, 0xB9FD, n)) return 'G';
            if (In(0xB9FE, 0xBBF6, n)) return 'H';
            if (In(0xBBF7, 0xBFA5, n)) return 'J';
            if (In(0xBFA6, 0xC0AB, n)) return 'K';
            if (In(0xC0AC, 0xC2E7, n)) return 'L';
            if (In(0xC2E8, 0xC4C2, n)) return 'M';
            if (In(0xC4C3, 0xC5B5, n)) return 'N';
            if (In(0xC5B6, 0xC5BD, n)) return 'O';
            if (In(0xC5BE, 0xC6D9, n)) return 'P';
            // Q starts right after P ends (0xC6D9); the ranges must not overlap.
            if (In(0xC6DA, 0xC8BA, n)) return 'Q';
            if (In(0xC8BB, 0xC8F5, n)) return 'R';
            if (In(0xC8F6, 0xCBF9, n)) return 'S';
            if (In(0xCBFA, 0xCDD9, n)) return 'T';
            if (In(0xCDDA, 0xCEF3, n)) return 'W';
            if (In(0xCEF4, 0xD1B8, n)) return 'X';
            if (In(0xD1B9, 0xD4D0, n)) return 'Y';
            if (In(0xD4D1, 0xD7F9, n)) return 'Z';

            return originChar[0];
        }

        // Level-2 hanzi are not in pinyin order, so look the letter up by position:
        // lead byte 0xD8 is region 56 (row 0), trail byte 0xA1 is column 0.
        int row = c1 - 0xD8;
        int column = c2 - 0xA1;

        if (row >= 0 && row < _regionChar.Length && column < _regionChar[row].Length)
        {
            return _regionChar[row][column];
        }

        return originChar[0];
    }
}

这个算法目前还没有发现哪个汉字会出错.