C++读取文本文件 ANSI,UNICODE,UTF-8

转自http://blog.csdn.net/lightlater/article/details/6326338

关于文本文件的文件头

第一 ANSI文件的文件头为空,不需要处理;

第二 UNICODE文件(此处指Windows下的UTF-16 LE,即小端序编码)的文件头为0xFF,0xFE共计两个字节,读取时需要偏移两个字节后再行读取;

第三 UTF-8文件的文件头为0xEF,0xBB,0xBF共计三个字节,读取时需要偏移三个字节后再行读取;

关于文本文件类型的判断

根据文本文件的文件头,就可以判断文本文件的类型了。

假设有如下文件类型定义:

// Encodings a text file can be classified as, identified by its file header.
// NOTE(review): UNICODE is also a preprocessor symbol used by Windows builds;
// if the project defines it, this enumerator will not compile -- confirm the
// build settings.
enum FileType
{
    ANSI    = 0,  // no file header
    UNICODE = 1,  // file header 0xFF 0xFE
    UTF8    = 2   // file header 0xEF 0xBB 0xBF
};

typedef enum FileType FILETYPE;

我们就可以根据上述特性,来判断文本文件的类型了,下面是一段示例代码:

// Detects the encoding of a text file from its leading byte-order mark (BOM).
//
// strFileName: path of the file to inspect.
//
// Returns UNICODE when the file starts with 0xFF 0xFE, UTF8 when it starts
// with 0xEF 0xBB 0xBF, and ANSI in every other case (including when the file
// cannot be opened or is too short to hold the header being tested).
FILETYPE GetTextFileType(const std::string & strFileName)
{
    FILETYPE fileType = ANSI;

    // Binary mode: the header bytes must be delivered exactly as stored.
    std::ifstream file(strFileName.c_str(),
                       std::ios_base::in | std::ios_base::binary);

    if (file.good())
    {
        unsigned char szFlag[3] = {0};
        file.read(reinterpret_cast<char *>(szFlag), sizeof(szFlag));

        // A short file makes read() fail, but gcount() still reports how
        // many header bytes were actually stored in szFlag.
        const std::streamsize nRead = file.gcount();

        if (nRead >= 2 && szFlag[0] == 0xFF && szFlag[1] == 0xFE)
        {
            fileType = UNICODE;
        }
        else if (nRead >= 3
            && szFlag[0] == 0xEF
            && szFlag[1] == 0xBB
            && szFlag[2] == 0xBF)
        {
            fileType = UTF8;
        }
    }

    // The ifstream destructor closes the file.
    return fileType;
}

ANSI文本文件的读取

ANSI文本文件不需要进行文件头的处理,可以直接读取。

下面是简单示例:

char szBuf[FBLOCK_MAX_BYTES];

memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);

std::string strMessage;

FILE * fp = NULL;

fp = fopen(strFileName.c_str(), "rb");

if (fp != NULL)

{

// common file do not offset.

while(fread(szBuf, sizeof(char), FBLOCK_MAX_BYTES, fp) > 0)

{

strMessage += szBuf;

memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);

}

}

std::cout << strMessage << std::endl;

fclose(fp);

UNICODE文本文件读取

由于UNICODE(UTF-16)普遍采用双字节来表示字符,因此读取时应当使用wchar_t类型(Windows下为2字节)作为基本单位,并使用fopen、fread来进行操作。

下面是简单示例:

wchar_t szBuf[FBLOCK_MAX_BYTES];

memset(szBuf, 0, sizeof(wchar_t) * FBLOCK_MAX_BYTES);

std::string strMessage;

FILE * fp = NULL;

fp = fopen(strFileName.c_str(), "rb");

if (fp != NULL)

{

// Unicode file should offset wchar_t bits(2 byte) from start.

fseek(fp, sizeof(wchar_t), 0);

while(fread(szBuf, sizeof(wchar_t), FBLOCK_MAX_BYTES, fp) > 0)

{

char szTemp[FBLOCK_MAX_BYTES] = {0};

UnicodeToANSI(szTemp, szBuf);

strMessage += szTemp;

memset(szBuf, 0, sizeof(wchar_t) * FBLOCK_MAX_BYTES);

}

}

std::cout << strMessage << std::endl;

fclose(fp);

UTF8文本文件的读取

UTF8是可变字节,使用单一字节读取比较合理,所以读取时,使用char作为基本类型。

下面是简单示例代码:

char szBuf[FBLOCK_MAX_BYTES];

memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);

std::string strMessage;

FILE * fp = NULL;

fp = fopen(strFileName.c_str(), "rb");

if (fp != NULL)

{

// UTF-8 file should offset 3 byte from start position.

fseek(fp, sizeof(char) * 3, 0);

while(fread(szBuf, sizeof(char), FBLOCK_MAX_BYTES, fp) > 0)

{

strMessage += szBuf;

memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);

}

}

std::cout << strMessage << std::endl;

fclose(fp);

  1 #include <assert.h>
  2 #include <windows.h>
  3 #include <iostream>
  4 #include <fstream>
  5 #include <string>
  6 
  7 const int FBLOCK_MAX_BYTES = 256;
  8 
  9 // File Type.
 10 typedef enum FileType
 11 {
 12     ANSI = 0,
 13     UNICODE,
 14     UTF8,
 15 }FILETYPE;
 16 
 17 FILETYPE GetTextFileType(const std::string & strFileName);
 18 
 19 int UnicodeToANSI(char * pDes, const wchar_t * pSrc);
 20 
 21 void main()
 22 {
 23     // file test.
 24     std::string strFileANSI = "C://Hello_ANSI.txt";
 25     std::string strFileUNICODE = "C://Hello_UNICODE.txt";
 26     std::string strFileUTF8 = "C://Hello_UTF8.txt";
 27 
 28     // please change the file name to test.
 29     std::string strFileName = strFileUTF8;
 30 
 31     FILETYPE fileType = GetTextFileType(strFileName);
 32 
 33     if (UNICODE == fileType)
 34     {
 35         wchar_t szBuf[FBLOCK_MAX_BYTES];
 36         memset(szBuf, 0, sizeof(wchar_t) * FBLOCK_MAX_BYTES);
 37 
 38         std::string strMessage;
 39 
 40         FILE * fp = NULL;
 41         fp = fopen(strFileName.c_str(), "rb");
 42         if (fp != NULL)
 43         {
 44             // Unicode file should offset wchar_t bits(2 byte) from start.
 45             fseek(fp, sizeof(wchar_t), 0);
 46             while(fread(szBuf, sizeof(wchar_t), FBLOCK_MAX_BYTES, fp) > 0)
 47             {
 48                 char szTemp[FBLOCK_MAX_BYTES] = {0};
 49 
 50                 UnicodeToANSI(szTemp, szBuf);
 51                 strMessage += szTemp;
 52                 memset(szBuf, 0, sizeof(wchar_t) * FBLOCK_MAX_BYTES);
 53             }
 54         }
 55 
 56         std::cout << strMessage << std::endl;
 57 
 58         fclose(fp);
 59     }
 60     else if (UTF8 == fileType)
 61     {
 62         char szBuf[FBLOCK_MAX_BYTES];
 63         memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);
 64         
 65         std::string strMessage;
 66         
 67         FILE * fp = NULL;
 68         fp = fopen(strFileName.c_str(), "rb");
 69         if (fp != NULL)
 70         {
 71             // UTF-8 file should offset 3 byte from start position.
 72             fseek(fp, sizeof(char) * 3, 0);
 73             while(fread(szBuf, sizeof(char), FBLOCK_MAX_BYTES, fp) > 0)
 74             {
 75                 strMessage += szBuf;
 76                 memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);
 77             }
 78         }
 79         
 80         std::cout << strMessage << std::endl;
 81         
 82         fclose(fp);
 83     }
 84     else
 85     {
 86         char szBuf[FBLOCK_MAX_BYTES];
 87         memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);
 88         
 89         std::string strMessage;
 90         
 91         FILE * fp = NULL;
 92         fp = fopen(strFileName.c_str(), "rb");
 93         if (fp != NULL)
 94         {
 95             // common file do not offset.
 96             while(fread(szBuf, sizeof(char), FBLOCK_MAX_BYTES, fp) > 0)
 97             {
 98                 strMessage += szBuf;
 99                 memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);
100             }
101         }
102 
103         std::cout << strMessage << std::endl;
104         
105         fclose(fp);
106     }
107 
108 #ifdef _DEBUG
109     getchar();
110 #endif
111 }
112 
113 FILETYPE GetTextFileType(const std::string & strFileName)
114 {
115     FILETYPE fileType = ANSI;
116     std::ifstream file;
117     file.open(strFileName.c_str(), std::ios_base::in);
118     
119     bool bUnicodeFile = false;
120     if (file.good())
121     {
122         char szFlag[3] = {0};
123         file.read(szFlag, sizeof(char) * 3);
124         if ((unsigned char)szFlag[0] == 0xFF 
125             && (unsigned char)szFlag[1] == 0xFE)
126         {
127             fileType = UNICODE;
128         }
129         else if ((unsigned char)szFlag[0] == 0xEF 
130             && (unsigned char)szFlag[1] == 0xBB 
131             && (unsigned char)szFlag[2] == 0xBF)
132         {
133             fileType = UTF8;
134         }
135     }
136 
137     file.close();
138 
139     return fileType;
140 }
141 
142 int UnicodeToANSI(char * pDes, const wchar_t * pSrc)
143 {
144     assert(pDes != NULL);
145     assert(pSrc != NULL);
146 
147     int nLen = ::WideCharToMultiByte(CP_ACP, 0, pSrc, -1, NULL, 0, NULL, NULL);
148     if (nLen == 0) 
149     {
150         return -1;
151     }
152 
153     return ::WideCharToMultiByte(CP_ACP, 0, pSrc, -1, pDes, nLen, NULL, NULL);
154 }