下面的代码是 MS 记事本(Notepad)中使用的字符编码识别代码,
可以有效地识别 UTF-8 或 Unicode(UTF-16)文本。
第一步是判断文件开头的文件头(BOM,字节顺序标记);如果没有文件头,则根据字节的取值范围来判断。
/* IsTextUTF8
 *
 * Decide whether a byte buffer is well-formed UTF-8 that contains at least
 * one non-ASCII character.
 *
 * UTF-8 as defined by RFC 3629 (which obsoletes RFC 2279) uses 1 to 4 octets:
 *   0000 0000-0000 007F - 0xxxxxxx                            (1 octet, ASCII)
 *   0000 0080-0000 07FF - 110xxxxx 10xxxxxx                   (2 octets)
 *   0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx          (3 octets)
 *   0001 0000-0010 FFFF - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (4 octets)
 *
 * Only the well-formed byte sequences are accepted:
 *   - lead bytes must be in C2..DF, E0..EF or F0..F4; C0/C1 (overlong
 *     2-octet forms), F5..FF and stray continuation bytes are rejected;
 *   - the first continuation byte is restricted after E0 (A0..BF, no
 *     overlong forms), ED (80..9F, no surrogates), F0 (90..BF, no overlong
 *     forms) and F4 (80..8F, nothing above U+10FFFF);
 *   - every other continuation byte must be 10xxxxxx (80..BF).
 *
 * Parameters:
 *   lpstrInputStream - bytes to examine (need not be NUL-terminated).
 *   iLen             - number of bytes to examine; a value <= 0 examines
 *                      nothing.
 *
 * Return value: TRUE,  if the text is in UTF-8 format.
 *               FALSE, if the text is not in UTF-8 format, if it ends in the
 *                      middle of a multi-octet character, or if it is empty.
 * We also return FALSE if it is only 7-bit ASCII, so the right code page
 * will be used.
 *
 * For 7-bit ASCII it doesn't matter which code page we use, but
 * notepad will remember that it is UTF-8 and "save" or "save as" will store
 * the file with a UTF-8 BOM. Not cool.
 */
INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen )
{
    INT   i;
    DWORD cOctets   = 0;      // continuation octets still owed by the current character
    UCHAR chLow     = 0x80;   // inclusive bounds allowed for the next continuation octet
    UCHAR chHigh    = 0xBF;
    UCHAR chr;
    BOOL  bAllAscii = TRUE;

    for( i = 0; i < iLen; i++ ) {
        chr = (UCHAR) lpstrInputStream[i];

        if( cOctets > 0 ) {
            //
            // Inside a multi-octet character: the octet must fall in the
            // range the lead byte allowed.
            //
            if( chr < chLow || chr > chHigh ) {
                return FALSE;
            }
            // Only the first continuation octet is ever specially restricted.
            chLow  = 0x80;
            chHigh = 0xBF;
            cOctets--;
            continue;
        }

        if( chr < 0x80 ) {
            // 7 bit ascii after 7 bit ascii is just fine.
            continue;
        }

        bAllAscii = FALSE;

        if( chr >= 0xC2 && chr <= 0xDF ) {
            cOctets = 1;
        }
        else if( chr >= 0xE0 && chr <= 0xEF ) {
            cOctets = 2;
            if( chr == 0xE0 ) {
                chLow = 0xA0;       // below A0 would be an overlong encoding
            }
            else if( chr == 0xED ) {
                chHigh = 0x9F;      // A0..BF would encode a UTF-16 surrogate
            }
        }
        else if( chr >= 0xF0 && chr <= 0xF4 ) {
            cOctets = 3;
            if( chr == 0xF0 ) {
                chLow = 0x90;       // below 90 would be an overlong encoding
            }
            else if( chr == 0xF4 ) {
                chHigh = 0x8F;      // above 8F would exceed U+10FFFF
            }
        }
        else {
            // 80..BF (stray continuation), C0, C1 or F5..FF: never a valid lead byte.
            return FALSE;
        }
    }

    //
    // End of text. A character cut short at the end is an error.
    //
    if( cOctets > 0 ) {
        return FALSE;
    }

    // Not utf-8 if all ascii. Forces caller to use code pages for conversion.
    return bAllAscii ? FALSE : TRUE;
}
/* IsInputTextUnicode
 * Verify if the input stream is in Unicode format.
 *
 * Wraps IsTextUnicode() and adds one extra check: when the verdict rests on
 * the statistical test alone and the ANSI code page is multi-byte, any DBCS
 * lead byte in the buffer overrides the verdict, because such text could
 * just as well be DBCS.
 *
 * Parameters:
 *   lpstrInputStream - bytes to examine.
 *   iLen             - number of bytes to examine.
 *
 * Return value: TRUE, if the text is in Unicode format.
 *               FALSE otherwise, including the DBCS override above.
 *
 * If GetCPInfo() fails, the DBCS check is skipped and the IsTextUnicode()
 * verdict is returned unchanged (the CPINFO contents are never read in
 * that case).
 *
 * 29 June 1998
 */
INT IsInputTextUnicode (LPSTR lpstrInputStream, INT iLen)
{
    INT    iResult = ~0;   // all bits set: turn on IS_TEXT_UNICODE_DBCS_LEADBYTE as well
    BOOL   bUnicode;
    CPINFO cpiInfo;
    INT    cb;

    bUnicode = IsTextUnicode( lpstrInputStream, iLen, &iResult );

    //
    // Only a positive answer that depends on statistics and nothing else
    // needs second-guessing.
    //
    if( !bUnicode ||
        (iResult & IS_TEXT_UNICODE_STATISTICS) == 0 ||
        (iResult & (~IS_TEXT_UNICODE_STATISTICS)) != 0 )
    {
        return bUnicode;
    }

    //
    // Only do the DBCS check if the ansi code page is DBCS. A failed
    // GetCPInfo leaves cpiInfo unset, so treat that as "not DBCS".
    //
    if( !GetCPInfo( CP_ACP, &cpiInfo ) || cpiInfo.MaxCharSize <= 1 )
    {
        return bUnicode;
    }

    for( cb = 0; cb < iLen; cb++ )
    {
        if( IsDBCSLeadByte( lpstrInputStream[cb] ) )
        {
            return FALSE;
        }
    }

    return bUnicode;
}
// Special 16-bit values; the two byte-order marks are compared against the
// first element of the mapped file buffer to pick the file type.
// UNICODE_FFFF is not referenced by the code visible in this excerpt.
#define UNICODE_FFFF 0xFFFF
// BYTE_ORDER_MARK with its two bytes swapped; a match opens the file as FT_UNICODEBE.
#define REVERSE_BYTE_ORDER_MARK 0xFFFE
// A match opens the file as FT_UNICODE.
#define BYTE_ORDER_MARK 0xFEFF
// Excerpt from the file-loading code: map the file into memory and, when the
// caller has not fixed a file type (typeFlag == FT_UNKNOWN), work out the
// encoding from a leading BOM or, failing that, from the content itself.
// All variables used here are declared outside this excerpt.
//
// NOTE(review): the pointer returned by MapViewOfFile is dereferenced below
// without a visible NULL check, and *lpBuf is read before len is examined --
// confirm the surrounding function rules out a failed mapping and a file
// shorter than one element.
lpBuf= MapViewOfFile( hMap, FILE_MAP_READ, 0,0,len);
// Default: content starts at the beginning of the buffer (no BOM skipped).
lpBufAfterBOM= (LPSTR) lpBuf;
if( typeFlag == FT_UNKNOWN )
{
// Dispatch on the first element of the buffer (presumably a 16-bit value,
// since it is compared with the 16-bit BOM constants -- lpBuf's type is not
// shown here).
switch(*lpBuf)
{
case BYTE_ORDER_MARK:
bUnicode= TRUE;
ftOpenedAs= FT_UNICODE;
// don't count the BOM.
nChars= len / sizeof(TCHAR) -1;
break;
case REVERSE_BYTE_ORDER_MARK:
bUnicode= TRUE;
ftOpenedAs= FT_UNICODEBE;
// don't count the BOM.
nChars= len / sizeof(TCHAR) -1;
break;
// The UTF-8 BOM has 3 bytes: this case matches on the first element, then the
// byte at offset 2 is checked separately. BOM_UTF8_HALF and BOM_UTF8_2HALF
// are defined outside this excerpt.
// If the third byte does not match, execution falls through to default.
case BOM_UTF8_HALF:
if (len > 2 && ((BYTE) *(((LPSTR)lpBuf)+2) == BOM_UTF8_2HALF) )
{
bUTF8= TRUE;
cpTemp= CP_UTF8;
ftOpenedAs= FT_UTF8;
// Ignore the first three bytes: advance the content pointer past the BOM
// and shrink the length to match.
lpBufAfterBOM= (LPSTR)lpBuf + 3;
len -= 3;
break;
}
// fall through -- intentional: not a complete UTF-8 BOM, so detect from content.
default:
// Is the file unicode without BOM ?
if ((bUnicode= IsInputTextUnicode((LPSTR) lpBuf, len)))
{
ftOpenedAs= FT_UNICODE;
nChars= len / sizeof(TCHAR);
}
else
{
// Is the file UTF-8 even though it doesn't have UTF-8 BOM.
if ((bUTF8= IsTextUTF8((LPSTR) lpBuf, len)))
{
ftOpenedAs= FT_UTF8;
cpTemp= CP_UTF8;
}
// Neither Unicode nor UTF-8: treat it as an ANSI file.
else
{
ftOpenedAs= FT_ANSI;
cpTemp= CP_ACP;
}
}
break;
}
}
记事本中用到的字符识别(UTF8/UNICODE)
最新推荐文章于 2023-07-11 17:01:35 发布