按行读取ANSI、UNICODE、UNICODE big endian、UTF-8四种文本文件

最新推荐文章于 2023-03-11 20:51:26 发布

转载最新推荐文章于 2023-03-11 20:51:26 发布 · 4.1k 阅读

文章标签：

#mfc #unicode

学习点滴专栏收录该内容

75 篇文章

订阅专栏

本文介绍了MFC文件类CStdioFile的一个扩展类CStdioFileEx，该类能够识别并按行读取ANSI、UNICODE、UNICODE big endian及UTF-8编码的文本文件。它通过检测文件头(BOM)来确定文件编码类型，并根据类型选择合适的读取方法。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

代码简介

1.问题提出
MFC提供的文件类CStdioFile，其中一个函数ReadString实现了文件的按行读取，但是不能满足不同类型的文本文件的按行读取，为了解决这一问题，笔者初步研究了一些编码知识，参考了网上的一些资料，实现了CStdioFile类的扩展类CStdioFileEx,完成了常见文本文件的按行读取（注明：不包括DOC、PDF等其他形式的文档）.
在此对网上分享编码经验的网友表示感谢，同时由于我编写的类还未经过严格测试，如有错误或方法过于复杂敬请各位指正。
2.问题解决
(1)四种常见文本文件编码方式研究
ANSI、UNICODE 、UNICODE big endian、UTF-8四种格式编码存在差别，简要介绍如下:
ANSI编码:
无文件头(文件编码开头标志性字节)
ANSI编码字母数字占一个字节，汉字占两个字节，
回车换行符单字节   十六进制表示为0d 0a

UNICODE编码:
文件头，十六进制表示为FF FE
每一个字符都用两个字节编码
回车换行符各占两个字节，文件中的字节序列为 0D 00 0A 00（低字节在前，对应的16位值为 000D 000A）

Unicode big endian编码:
文件头十六进制表示为FE FF ，
后面编码是把字符的高位放在前面，低位放在后面，正好和Unicode编码颠倒。
回车换行符各占两个字节，文件中的字节序列为 00 0D 00 0A（高字节在前）

UTF-8 编码:
文件头，十六进制表示为EF BB BF。
UTF-8是Unicode的一种变长字符编码，数字、字母、回车、换行都用一个字节表示,汉字占3个字节.
回车换行符，单字节，十六进制表示为0d 0a

以中文"你好"二字为例，各种类型的编码对应的十六进制格式(可由EditPlus查看)如下图所示:

由此可见上述的探讨是正确的。
(2)按行读取上述四种格式文本文件的解决方案
针对不同文件编码的特点，通过先检测文件头判断文件编码类型，然后根据文件类型分别调用不同的读取函数实现文件的按行读取。按行读取过程如下图所示:

实现过程中，编写CStdioFileEx类，该类继承自CStdioFile类，覆盖了CStdioFile类的BOOL ReadString(CString& rString)方法，从而实现了文件按行读取。

代码片段

// StdioFileEx.h: interface for the CStdioFileEx class.
//
//////////////////////////////////////////////////////////////////////

#if !defined(AFX_STDIOFILEEX_H__C1F1F96B_9417_4388_8D24_892EDFA2A616__INCLUDED_)
#define AFX_STDIOFILEEX_H__C1F1F96B_9417_4388_8D24_892EDFA2A616__INCLUDED_

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
// --------------------------------------------------------------------------------------------
//程序用途：按行读取常见(包括ANSI、UNICODE、UNICODE  big endian、UTF-8)格式的文本文件    
//程序作者：湖北师范学院计算机科学与技术学院  王定桥                                 
//核心算法：CStdioFileEx继承自CStdioFile， 覆盖CStdioFile的 BOOL ReadString(CString& rString)方法, 
//          根据不同文件编码特征，寻找文件回车换行符判断读取行结束，文件结束符判断文件结束                               	  
//          检测不同文件编码头部，获取文件类型后调用不同的读取函数
//测试结果:在Windows7 VC6.0环境下测试上述四种格式的txt文件通过
//尚未完成:未重载CStdioFile的 virtual LPTSTR ReadString( LPTSTR lpsz, UINT nMax )方法
//		   未完成WriteString方法,未在VC UNICODE 环境下的测试         	  					          				  
//制作时间：2012-04-19											  
//代码版权:代码公开供学习交流使用  欢迎指正错误  改善算法								           		
// --------------------------------------------------------------------------------------------
#include "stdafx.h"
// Encoding of a text file, as identified by the byte-order mark (BOM) at its start.
// NOTE(review): the enumerator name UNICODE collides with the UNICODE
// preprocessor macro when that macro is defined — confirm before enabling a
// Unicode build.
enum TextCodeType
{
	UTF8             = 0,	// file starts with EF BB BF
	UNICODE          = 1,	// file starts with FF FE
	UNICODEBIGENDIAN = 2,	// file starts with FE FF
	ANSI             = 3,	// no recognised BOM
	FILEERROR        = 4	// the file could not be opened for detection
};
typedef TextCodeType TextCode;
// CStdioFile extension whose ReadString(CString&) chooses a reader according
// to the text encoding stored in m_FileType.
class CStdioFileEx : public CStdioFile
{
public:
	CStdioFileEx();
	CStdioFileEx(FILE* pOpenStream);
	CStdioFileEx(LPCTSTR lpszFileName, UINT nOpenFlags);
	virtual ~CStdioFileEx();
	virtual BOOL Open(LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError = NULL);

	// Returns the stored file type as a display string.
	CString FileTypeToString();
	// Returns the stored file type.
	TextCode GetFileType();
	// Reads the next line of the file into rString.
	BOOL ReadString(CString& rString);
	// Static: detects the type of the file named lpszFileName.
	static TextCode GetFileType(LPCTSTR lpszFileName);

protected:
	// Encoding detected for the current file.
	TextCode m_FileType;
	// Element count used to size the per-line read buffers.
	static const int PREDEFINEDSIZE;

	// Reads one line from a UTF-8 file.
	BOOL ReadStringFromUTF8File(CString& rString);
	// Reads one line from an ANSI file.
	BOOL ReadStringFromAnsiFile(CString& rString);
	// Reads one line from a UNICODE or UNICODE big endian file.
	BOOL ReadStringFromUnicodeFile(CString& rString);
	// Converts a NUL-terminated UTF-8 byte string to a CString.
	CString UTF8ToUnicode(byte* szUTF8);
	// Detects the file type into tc and adjusts nOpenFlags accordingly.
	UINT ProcessFlags(LPCTSTR lpszFileName, UINT& nOpenFlags, TextCode& tc);
};

#endif // !defined(AFX_STDIOFILEEX_H__C1F1F96B_9417_4388_8D24_892EDFA2A616__INCLUDED_)

代码片段

// StdioFileEx.cpp: implementation of the CStdioFileEx class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "StdioFileEx.h"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Element count used to size the per-line read buffers.
/*static*/ const int CStdioFileEx::PREDEFINEDSIZE = 1024;

// Default constructor: no file is attached, so the type starts out as ANSI.
CStdioFileEx::CStdioFileEx()
	: CStdioFile()
	, m_FileType(ANSI)
{
}
// Attaches to an already opened C stream and detects the encoding from the
// stream's own leading bytes.
//
// A FILE* carries no reliable path of the file it was opened on (the
// _tmpfname field previously read here is not the opened file's name and is
// unavailable when FILE is opaque), so the BOM is read directly from the
// stream and the original stream position is restored afterwards.
//
// pOpenStream: stream to attach to. A NULL stream yields FILEERROR; a stream
//              that cannot be repositioned keeps the ANSI default.
CStdioFileEx::CStdioFileEx(FILE* pOpenStream):CStdioFile(pOpenStream)
{
	m_FileType=ANSI;
	if(pOpenStream==NULL)
	{
		m_FileType=FILEERROR;
		return;
	}

	// Remember where the caller left the stream so detection is transparent.
	long savedpos=ftell(pOpenStream);
	if(savedpos<0 || fseek(pOpenStream,0L,SEEK_SET)!=0)
		return;	// not seekable (pipe, console): keep the ANSI default

	byte   buf[3]={0,0,0};
	size_t got=fread(buf,sizeof(byte),3,pOpenStream);
	if(got>=3 && buf[0]==0xEF && buf[1]==0xBB && buf[2]==0xBF)
		m_FileType=UTF8;
	else if(got>=2 && buf[0]==0xFF && buf[1]==0xFE)
		m_FileType=UNICODE ;
	else if(got>=2 && buf[0]==0xFE && buf[1]==0xFF)
		m_FileType=UNICODEBIGENDIAN;

	// A short read leaves the EOF/error indicator set; clear it so the
	// caller's first real read is not affected by the probe.
	if(got<3)
		clearerr(pOpenStream);
	fseek(pOpenStream,savedpos,SEEK_SET);
}
// Opens lpszFileName through the CStdioFile constructor. ProcessFlags is
// evaluated as part of the base-class argument list: it writes the detected
// encoding into m_FileType and its return value is used as the open flags.
// NOTE(review): ProcessFlags is a non-static member that runs before the base
// class has been constructed — confirm this ordering is acceptable.
CStdioFileEx::CStdioFileEx(LPCTSTR lpszFileName, UINT nOpenFlags):CStdioFile(lpszFileName,ProcessFlags(lpszFileName, nOpenFlags,m_FileType) )
{
}
// Destructor; the body is empty because this class adds no cleanup of its own.
CStdioFileEx::~CStdioFileEx()
{

}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::GetFileType (static): detects the encoding of a text file
// from the byte-order mark at its start.
//
// lpszFileName: path of the file to examine.
// Returns UTF8 (EF BB BF), UNICODE (FF FE), UNICODEBIGENDIAN (FE FF), ANSI
// (anything else, including files too short to hold a BOM), or FILEERROR when
// the file cannot be opened or reading it raises a file exception.
// --------------------------------------------------------------------------------------------
/*static */ TextCode CStdioFileEx::GetFileType(LPCTSTR lpszFileName)
{
	// Start from the failure value so every exit path returns something defined.
	TextCode tc=FILEERROR;
	try
	{
		CFile file;
		if(file.Open(lpszFileName,CFile::modeRead|CFile::shareDenyNone|CFile::typeBinary))
		{
			byte buf[3]={0,0,0};
			// Only compare the bytes that were actually read.
			UINT got=file.Read(buf,3);
			if(got>=3 && buf[0]==0xEF && buf[1]==0xBB && buf[2]==0xBF)
				tc=UTF8;
			else if(got>=2 && buf[0]==0xFF && buf[1]==0xFE)
				tc=UNICODE ;
			else if(got>=2 && buf[0]==0xFE && buf[1]==0xFF)
				tc=UNICODEBIGENDIAN;
			else
				tc=ANSI;
		}
	}
	// MFC throws file exceptions as heap-allocated pointers; catching by value
	// would never match, and the object must be released with Delete().
	catch (CFileException* pEx)
	{
		CString errormsg;
		errormsg.Format(_T("操作文件%s时发生异常!"),(LPCTSTR)pEx->m_strFileName);
		AfxMessageBox(errormsg);
		pEx->Delete();
		tc=FILEERROR;
	}
	return tc;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::ReadString: reads the next line of the file into rString by
// forwarding to the reader that matches m_FileType.
// Returns the reader's result; FALSE for FILEERROR or any other value.
// --------------------------------------------------------------------------------------------
BOOL CStdioFileEx::ReadString(CString& rString)
{
	if(m_FileType==ANSI)
		return ReadStringFromAnsiFile(rString);

	// Both UTF-16 byte orders share one reader.
	if(m_FileType==UNICODE || m_FileType==UNICODEBIGENDIAN)
		return ReadStringFromUnicodeFile(rString);

	if(m_FileType==UTF8)
		return ReadStringFromUTF8File(rString);

	// FILEERROR and unknown values: nothing can be read.
	return FALSE;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::ReadStringFromAnsiFile: reads one line from an ANSI file
// through CStdioFile::ReadString and appends "\r\n" to it.
//
// rString: receives the line; the terminator is appended only when a line was
//          actually read.
// Returns the result of CStdioFile::ReadString, or FALSE if a file exception
// was raised.
// --------------------------------------------------------------------------------------------
BOOL CStdioFileEx::ReadStringFromAnsiFile(CString& rString)
{
	// Defined up front so the value returned after an exception is not garbage.
	BOOL flag=FALSE;
	try
	{
		flag=CStdioFile::ReadString(rString);
		// Do not hand back a bare line terminator when the read failed.
		if(flag)
			rString+=_T("\r\n");
	}
	// MFC throws CFileException* pointers; release the object with Delete().
	catch(CFileException* pEx)
	{
		CString errormsg;
		errormsg.Format(_T("操作文件%s时发生异常!"),(LPCTSTR)pEx->m_strFileName);
		AfxMessageBox(errormsg);
		pEx->Delete();
		flag=FALSE;
	}
	return flag;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::ReadStringFromUTF8File: reads one line from a UTF-8 file.
//
// Bytes are read one at a time up to and including the next line feed (0x0A),
// so a CR/LF pair is recognised wherever it falls. The line buffer grows on
// demand, so long lines are not cut in the middle of a multi-byte character.
//
// rString: receives the decoded line, including its line terminator.
// Returns TRUE when at least one byte was read, FALSE at end of file or when
// a file exception was raised (rString is then left unchanged).
// --------------------------------------------------------------------------------------------
BOOL  CStdioFileEx::ReadStringFromUTF8File(CString& rString)
{
	const byte lf=0x0a;
	BOOL  flag=FALSE;
	UINT  capacity=PREDEFINEDSIZE;
	UINT  length=0;
	byte  *pbuf=new byte[capacity+1];	// +1 keeps room for the terminating NUL
	try
	{
		// Skip the 3-byte BOM when reading starts at the beginning of the file.
		if (m_pStream && ( GetPosition() == 0))
		{
			CStdioFile::Seek(3*sizeof(byte),CFile::begin);
		}
		byte ch=0;
		// CFile::Read is called explicitly, as the original code did, so the
		// bytes arrive untranslated.
		while(CFile::Read(&ch,1)==1)
		{
			if(length==capacity)
			{
				// Double the buffer instead of truncating the line.
				UINT  newcapacity=capacity*2;
				byte *pbigger=new byte[newcapacity+1];
				memcpy(pbigger,pbuf,length);
				delete[] pbuf;
				pbuf=pbigger;
				capacity=newcapacity;
			}
			pbuf[length++]=ch;
			if(ch==lf)
				break;
		}
		if(length>0)
		{
			pbuf[length]=0;
			rString=UTF8ToUnicode(pbuf);
			flag=TRUE;
		}
	}
	// MFC throws CFileException* pointers; release the object with Delete().
	catch (CFileException* pEx)
	{
		CString errormsg;
		errormsg.Format(_T("操作文件%s时发生异常!"),(LPCTSTR)pEx->m_strFileName);
		AfxMessageBox(errormsg);
		pEx->Delete();
		flag=FALSE;
	}
	// Single exit point: the buffer is released on every path.
	delete[] pbuf;
	return flag;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::ReadStringFromUnicodeFile: reads one line from a UNICODE or
// UNICODE big endian file.
//
// One 16-bit code unit (two bytes) is read at a time and assembled according
// to m_FileType, up to and including the next line feed (0x000A). Reading
// unit by unit finds CR/LF at any offset and never appends a padding
// character when the file ends. The line buffer grows on demand.
//
// rString: receives the line including its terminator; emptied when nothing
//          could be read.
// Returns TRUE when at least one code unit was read, FALSE at end of file or
// when a file exception was raised.
// --------------------------------------------------------------------------------------------
BOOL  CStdioFileEx::ReadStringFromUnicodeFile(CString& rString)
{
	const wchar_t wchlf=MAKEWORD(0x0a,0x00);	// MAKEWORD(low byte, high byte)
	BOOL    flag=FALSE;
	UINT    capacity=PREDEFINEDSIZE;
	UINT    length=0;
	wchar_t *wChLine=new wchar_t[capacity+1];	// +1 keeps room for the terminating NUL
	try
	{
		// Skip the 2-byte BOM when reading starts at the beginning of the file.
		if (m_pStream && ( GetPosition() ==0))
		{
			Seek(2*sizeof(byte),CFile::begin);
		}
		byte raw[2]={0,0};
		// A trailing odd byte cannot form a code unit and ends the loop.
		while(CFile::Read(raw,2)==2)
		{
			// Build the code unit from the byte order of the file.
			wchar_t wch;
			if(UNICODEBIGENDIAN==m_FileType)
				wch=(wchar_t)MAKEWORD(raw[1],raw[0]);
			else
				wch=(wchar_t)MAKEWORD(raw[0],raw[1]);

			if(length==capacity)
			{
				// Double the buffer instead of truncating the line.
				UINT     newcapacity=capacity*2;
				wchar_t *pbigger=new wchar_t[newcapacity+1];
				memcpy(pbigger,wChLine,length*sizeof(wchar_t));
				delete[] wChLine;
				wChLine=pbigger;
				capacity=newcapacity;
			}
			wChLine[length++]=wch;
			if(wch==wchlf)
				break;
		}
		if(length>0)
		{
			wChLine[length]=0;
			CString strtext(wChLine,(int)length);
			rString=strtext;
			flag=TRUE;
		}
		else
		{
			rString.Empty();
		}
	}
	// MFC throws CFileException* pointers; release the object with Delete().
	catch (CFileException* pEx)
	{
		CString errormsg;
		errormsg.Format(_T("操作文件%s时发生异常!"),(LPCTSTR)pEx->m_strFileName);
		AfxMessageBox(errormsg);
		pEx->Delete();
		flag=FALSE;
	}
	delete[] wChLine;
	return flag;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::UTF8ToUnicode: converts a NUL-terminated UTF-8 byte string to
// wide characters with MultiByteToWideChar and returns it as a CString.
// Returns an empty string when szUTF8 is NULL.
// --------------------------------------------------------------------------------------------
CString CStdioFileEx::UTF8ToUnicode(byte  *szUTF8)
{
	CString empty;
	empty=_T("");
	if(szUTF8==NULL)
		return empty;

	// Length of the source in bytes, excluding the terminating NUL.
	const int srclen=strlen((char*)szUTF8);

	// First call: ask how many wide characters the conversion produces.
	const int required=MultiByteToWideChar(CP_UTF8,0,(LPSTR)szUTF8,srclen,NULL,0);

	LPWSTR wide=new WCHAR[required+1];
	if(wide==NULL)
		return empty;
	// Zero-filling provides the terminator the conversion itself does not write.
	memset(wide,0,(required+1)*sizeof(wchar_t));

	// Second call: perform the conversion into the buffer.
	MultiByteToWideChar(CP_UTF8,0,(LPSTR)szUTF8,srclen,(LPWSTR)wide,required);

	CString converted(wide);
	delete[] wide;
	return converted;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::GetFileType: returns the file type stored in m_FileType.
// --------------------------------------------------------------------------------------------
TextCode CStdioFileEx::GetFileType()
{   
	return m_FileType;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::FileTypeToString: returns the display name of m_FileType, or
// an empty string when the value is not one of the known enumerators.
// --------------------------------------------------------------------------------------------
CString CStdioFileEx::FileTypeToString()
{
	// Select the name first, then format it once.
	LPCTSTR label=NULL;
	switch(m_FileType)
	{
	case ANSI:             label=_T("ANSI");               break;
	case UTF8:             label=_T("UTF8");               break;
	case UNICODE:          label=_T("UNICODE");            break;
	case UNICODEBIGENDIAN: label=_T("UNICODE big endian"); break;
	case FILEERROR:        label=_T("FILEERROR");          break;
	default:                                               break;
	}

	CString strtype;
	if(label!=NULL)
		strtype.Format("%s",label);
	return strtype;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::Open: runs ProcessFlags on the file name and flags (which
// stores the detected type in m_FileType and may modify nOpenFlags), then
// opens the file through CStdioFile::Open and returns its result.
// --------------------------------------------------------------------------------------------
BOOL CStdioFileEx::Open( LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError)
{
	// nOpenFlags is passed by reference and comes back adjusted.
	ProcessFlags(lpszFileName,nOpenFlags,m_FileType);

	const BOOL opened=CStdioFile::Open(lpszFileName, nOpenFlags,pError);
	return opened;
}
// --------------------------------------------------------------------------------------------
// CStdioFileEx::ProcessFlags: detects the file type and adjusts the open flags.
//
// lpszFileName: file whose encoding is detected with GetFileType.
// nOpenFlags:   in/out; when the file is opened for reading, ANSI files get
//               typeText and UTF-8 / UNICODE / UNICODE big endian files get
//               typeBinary. shareDenyNone is added only when the caller did
//               not request a share mode.
// tc:           receives the detected file type.
// Returns the adjusted value of nOpenFlags.
// --------------------------------------------------------------------------------------------
UINT CStdioFileEx::ProcessFlags(LPCTSTR lpszFileName, UINT& nOpenFlags,TextCode &tc)
{
	tc=CStdioFileEx::GetFileType(lpszFileName);

	// CFile::modeRead is 0, so it cannot be tested with '&'. Isolate the
	// access-mode bits and compare them instead.
	const UINT accessmode=nOpenFlags & 0x3;
	if(accessmode==(UINT)CFile::modeRead || accessmode==(UINT)CFile::modeReadWrite)
	{
		switch(tc)
		{
		case ANSI:
			nOpenFlags|= CFile::typeText;
			nOpenFlags&=~CFile::typeBinary;
			break;
		case UTF8:
		case UNICODE:
		case UNICODEBIGENDIAN:
			nOpenFlags |= CFile::typeBinary;
			nOpenFlags&= ~CFile::typeText;
			break;
		case FILEERROR:
		default:
			break;
		}
	}

	// Share modes are values within bits 0x70, not independent bits: OR-ing
	// shareDenyNone onto another share mode would produce an invalid value.
	if((nOpenFlags & 0x70)==(UINT)CFile::shareCompat)
		nOpenFlags|=CFile::shareDenyNone;
	return nOpenFlags;
}

代码片段

// Handler for the "open" button: lets the user pick a file, clears the edit
// control, opens the file with m_stdiofileex and shows the detected type.
// A file left open by an earlier click is closed first, so the handler can be
// used repeatedly without leaking the previous stream.
void CReadStringDlg::OnBtnOpen() 
{
	char szFilter[] = "Text Files (*.txt)|*.txt|All Files (*.*)|*.*||";
	CFileDialog  filedlg(TRUE , "txt", NULL, OFN_HIDEREADONLY | OFN_OVERWRITEPROMPT, szFilter,this);
	if(IDOK==filedlg.DoModal())
	{
		m_strPath=filedlg.GetPathName();
		UpdateData(FALSE);
		m_ctrlEdit.SetSel(0,-1);
		m_ctrlEdit.Clear();

		// Release the previously opened file before opening another one.
		if(m_stdiofileex.m_pStream)
			m_stdiofileex.Close();

		if(m_stdiofileex.Open(m_strPath,CFile::modeRead))
		{
			m_strFileType=m_stdiofileex.FileTypeToString();
			UpdateData(FALSE);
		}
		else
		{
			MessageBox(_T("读取文件失败!"));
		}
	}
}
//读取文件
void CReadStringDlg::OnBtnRead() 
{
	// TODO: Add your control notification handler code here
	if(!ValidateInput())
		return;
	CString      strread,strtemp;
	m_ctrlEdit.GetWindowText(strread);
	m_ctrlEdit.SetSel(0,-1);
	m_ctrlEdit.Clear();
	if(m_stdiofileex.m_pStream)
	{   
		int cnt=0;
		strread+="\r\n";
		while(cnt<m_lLineCnt)
		{
			if(m_stdiofileex.ReadString(strtemp))
			{   
				strread+=strtemp;
				cnt++;
			}
			else
			{   
				AfxMessageBox(_T("读取已经到达文件末尾!"));
				break;
			}
		}
		m_ctrlEdit.SetSel(0,-1);
		m_ctrlEdit.ReplaceSel(strread);
	}
	else
	{
		MessageBox(_T("读取文件失败!"));
	}
}
// Validates the dialog input after UpdateData(): the path must not be empty
// (a message box is shown if it is) and the requested line count must be
// positive. Returns TRUE only when both checks pass.
BOOL CReadStringDlg::ValidateInput()
{
	UpdateData();

	if(m_strPath.IsEmpty())
	{
		MessageBox("文件路径为空，请填写!");
		return FALSE;
	}

	return (m_lLineCnt>0) ? TRUE : FALSE;
}

代码片段

// Returns a newly allocated, NUL-terminated TCHAR copy of the text in pBuffer,
// converted to the character type of the current build.
//
// pBuffer: terminated text, optionally starting with a byte-order mark.
//          FF FE / FE FF mark 16-bit text (terminated by a 16-bit zero);
//          anything else is treated as UTF-8/ANSI bytes (terminated by a zero
//          byte), with a leading EF BB BF skipped in every build.
//          Big-endian input is byte-swapped IN PLACE before conversion.
// Returns a buffer allocated with new[] that the caller must delete[], or
// NULL when pBuffer is NULL. A failed conversion yields an empty string.
PTSTR Normalise(PBYTE	pBuffer
        	)
{
    PTSTR			ptText;		// pointer to the text char* or wchar_t* depending on UNICODE setting
    PWSTR			pwStr;		// pointer to a wchar_t buffer
    int				nLength;	// element or byte count, depending on the step

    if (pBuffer == NULL)
        return NULL;

    // Test the BOM byte by byte: short-circuit evaluation stops at the
    // terminator, so a very short buffer is never read past its end.
    const bool bLittleEndian = (pBuffer[0] == 0xFF && pBuffer[1] == 0xFE);
    const bool bBigEndian    = (pBuffer[0] == 0xFE && pBuffer[1] == 0xFF);

    if (bLittleEndian || bBigEndian)
    {
        // 16-bit text starts right after the two BOM bytes
        pwStr = reinterpret_cast<PWSTR>(pBuffer + 2);
        if (bBigEndian)
        {
            // swap bytes in each word of the buffer
            for (PWSTR pw = pwStr; *pw; ++pw)
            {
                *pw = static_cast<WCHAR>((*pw >> 8) | (*pw << 8));
            }
        }
#if !defined(UNICODE)
        // This is a non-Unicode project so we need
        // to convert wide characters to multi-byte

        // get calculated buffer size (includes the terminator)
        nLength = WideCharToMultiByte(CP_UTF8, 0, pwStr, -1, NULL, 0, NULL, NULL);
        if (nLength <= 0)
        {
            // conversion failed: hand back a valid empty string
            ptText = new TCHAR[1];
            ptText[0] = 0;
            return ptText;
        }
        // obtain a new buffer for the converted characters
        ptText = new TCHAR[nLength];
        // convert to multi-byte characters
        nLength = WideCharToMultiByte(CP_UTF8, 0, pwStr, -1, ptText, nLength, NULL, NULL);
#else
        nLength = static_cast<int>(wcslen(pwStr)) + 1;    // if Unicode, then copy the input text
        ptText = new WCHAR[nLength];    // to a new output buffer
        nLength *= sizeof(WCHAR);       // adjust to size in bytes
        memcpy_s(ptText, nLength, pwStr, nLength);
#endif
    }
    else
    {
        // The text data is UTF-8 or Ansi.
        // Skip the UTF-8 BOM, if present, in both build flavours so it never
        // ends up as the first character of the result.
        if (pBuffer[0] == 0xEF && pBuffer[1] == 0xBB && pBuffer[2] == 0xBF)
        {
            pBuffer += 3;
        }
#if defined(UNICODE)
        // This is a Unicode project so we need to convert
        // multi-byte or Ansi characters to Unicode.

        // get calculated buffer size (includes the terminator)
        nLength = MultiByteToWideChar(CP_UTF8, 0, reinterpret_cast<PCSTR>(pBuffer), -1, NULL, 0);
        if (nLength <= 0)
        {
            // conversion failed: hand back a valid empty string
            ptText = new TCHAR[1];
            ptText[0] = 0;
            return ptText;
        }
        // obtain a new buffer for the converted characters
        ptText = new TCHAR[nLength];
        // convert to Unicode characters
        nLength = MultiByteToWideChar(CP_UTF8, 0, reinterpret_cast<PCSTR>(pBuffer), -1, ptText, nLength);
#else
        // This is a non-Unicode project so the bytes are copied unchanged
        nLength = static_cast<int>(strlen(reinterpret_cast<PSTR>(pBuffer))) + 1;  // if UTF-8/ANSI, then copy the input text
        ptText = new char[nLength];                             // to a new output buffer
        memcpy_s(ptText, nLength, pBuffer, nLength);
#endif
    }

    // return pointer to the (possibly converted) text buffer.
    return ptText;
}