序
一般工程中, 文本不是UTF-8就是GBK.
那就先判断文本是否为UTF-8, 如果不是, 就按GBK处理.
同事在网上找了一段代码, 我整理了一下, 也备份一份.
准备了一段UTF-8编码的字符串缓冲区做实验, 判断得还挺准的.
实验
// prj.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <stdlib.h>
#include <stdio.h>
#include "class_is_utf8.h"
// Console entry point: runs class_is_utf8::is_utf8 on a hard-coded byte
// buffer and prints the verdict as "true" or "false".
int _tmain(int argc, _TCHAR* argv[])
{
    // Sample input: the text "abc" followed by U+4E2D U+6587 U+6D4B U+8BD5,
    // encoded as UTF-8. No UTF-8 header (0xef,0xbb,0xbf) is included because
    // the input is an in-memory string, not file content.
    unsigned char sample_bytes[] = {
        0x61, 0x62, 0x63,
        0xe4, 0xb8, 0xad,
        0xe6, 0x96, 0x87,
        0xe6, 0xb5, 0x8b,
        0xe8, 0xaf, 0x95,
        0x00, 0x00
    };

    // Alternative inputs kept for manual experiments:
    //   the same text saved as ANSI, detected as not UTF-8:
    //     { 0x61,0x62,0x63,0xd6,0xd0,0xce,0xc4,0xb2,0xe2,0xca,0xd4,0x00,0x00 }
    //   plain ASCII (original note: "not utf-8"):
    //     { 'a', 'b', 'c', '\0', '\0' }

    const char* text = reinterpret_cast<const char*>(sample_bytes);
    const bool detected = class_is_utf8::is_utf8(text);

    const char* verdict = "false";
    if (detected)
    {
        verdict = "true";
    }
    printf("class_is_utf8::is_utf8 = %s\n", verdict);

    // Recorded run result for the UTF-8 sample above:
    //   class_is_utf8::is_utf8 = true

    // Keep the console window open until a key is pressed.
    system("pause");
    return 0;
}
// @file class_is_utf8.h
#ifndef __CLASS_IS_UTF8_H__
#define __CLASS_IS_UTF8_H__
// Holder class for the UTF-8 detection helper. The check itself is a static
// member, so callers do not need to create an instance to use it.
class class_is_utf8
{
public:
    // Constructor and virtual destructor; both are defined out of line.
    class_is_utf8();
    virtual ~class_is_utf8();

    // Reports whether the given string is UTF-8.
    static bool is_utf8(const char* str);
};
#endif // #ifndef __CLASS_IS_UTF8_H__
// @file class_is_utf8.cpp
#include "StdAfx.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "class_is_utf8.h"
// Constructor: the body is intentionally empty.
class_is_utf8::class_is_utf8()
{
    // Nothing to initialise.
}
// Destructor: the body is intentionally empty.
class_is_utf8::~class_is_utf8()
{
    // Nothing to release.
}
// http://www.fileformat.info/info/unicode/utf8.htm
/**
 * @brief Checks whether a NUL-terminated byte string is well-formed UTF-8.
 *
 * Validation follows the byte-sequence table of RFC 3629:
 *   - 1 byte : 00..7F
 *   - 2 bytes: C2..DF 80..BF
 *   - 3 bytes: E0 A0..BF 80..BF | E1..EC,EE..EF 80..BF 80..BF | ED 80..9F 80..BF
 *   - 4 bytes: F0 90..BF 80..BF 80..BF | F1..F3 80..BF x3 | F4 80..8F 80..BF 80..BF
 *
 * Compared with a plain "lead byte + continuation bytes" check this also
 * accepts 4-byte sequences and rejects overlong encodings (C0/C1 leads,
 * E0 80..9F, F0 80..8F), UTF-16 surrogates (ED A0..BF) and code points above
 * U+10FFFF. Those ill-formed pairs can occur in GBK text, so rejecting them
 * makes a "UTF-8, otherwise GBK" decision more reliable.
 *
 * @param str NUL-terminated bytes to examine. May be null.
 * @return true if every byte before the terminator belongs to a well-formed
 *         UTF-8 sequence (an empty string and pure ASCII both yield true);
 *         false if an ill-formed or truncated sequence is found, or if str
 *         is null.
 */
bool class_is_utf8::is_utf8(const char* str)
{
    if (!str)
    {
        return false;
    }

    // Work on unsigned bytes so range comparisons do not depend on whether
    // plain char is signed on this platform.
    const unsigned char* p = reinterpret_cast<const unsigned char*>(str);

    while (*p != 0)
    {
        const unsigned char lead = *p;

        if (lead < 0x80)
        {
            // Single-byte (ASCII) character.
            ++p;
            continue;
        }

        int trail_count = 0;        // number of continuation bytes expected
        unsigned char first_lo = 0x80; // allowed range of the FIRST continuation byte
        unsigned char first_hi = 0xbf;

        if (lead >= 0xc2 && lead <= 0xdf)
        {
            trail_count = 1;
        }
        else if (lead >= 0xe0 && lead <= 0xef)
        {
            trail_count = 2;
            if (lead == 0xe0)
            {
                first_lo = 0xa0;    // below A0 would be an overlong encoding
            }
            else if (lead == 0xed)
            {
                first_hi = 0x9f;    // above 9F would encode a UTF-16 surrogate
            }
        }
        else if (lead >= 0xf0 && lead <= 0xf4)
        {
            trail_count = 3;
            if (lead == 0xf0)
            {
                first_lo = 0x90;    // below 90 would be an overlong encoding
            }
            else if (lead == 0xf4)
            {
                first_hi = 0x8f;    // above 8F would exceed U+10FFFF
            }
        }
        else
        {
            // 80..BF (stray continuation), C0..C1 (overlong), F5..FF (invalid).
            return false;
        }

        // p[1] is always readable: *p is not the terminator, so at worst p[1]
        // is the terminator itself, which fails this range check.
        if (p[1] < first_lo || p[1] > first_hi)
        {
            return false;
        }

        // Each further byte is read only after the previous one was verified
        // to be a (non-zero) continuation byte, so the scan never runs past
        // the terminator.
        for (int k = 2; k <= trail_count; ++k)
        {
            if ((p[k] & 0xc0) != 0x80)
            {
                return false;
            }
        }

        p += trail_count + 1;
    }

    return true;
}