关于字符移植一些注意事项-char w_char等+正则表达式

最新推荐文章于 2024-06-18 14:55:26 发布

原创最新推荐文章于 2024-06-18 14:55:26 发布

· 626 阅读

0 ·

版权

学习笔记专栏收录该内容

25 篇文章

订阅专栏

https://github.com/coderforlife/mingw-unicode-main/blob/master/mingw-unicode.c

https://www.cnblogs.com/zyl910/archive/2012/07/30/wcharfmt.html

https://www.cnblogs.com/zyl910/archive/2013/01/17/tcharall.html

https://zh.cppreference.com/w/cpp/keyword/wchar_t

http://www.cplusplus.com/reference/string/basic_string/

https://blog.youkuaiyun.com/drowedfish/article/details/89702556

https://blog.youkuaiyun.com/wallaceli1981/article/details/6116738

https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2013/6aw8xdf2%28v%3dvs.120%29

https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2013/69ze775t%28v%3dvs.120%29

这个问题最初是在一个使用正则表达式的项目中遇到的。例如：输入“明天提醒我”，用正则 提醒我?([\W\S]{1,30}) 通过 regex_search 函数进行匹配，结果分组 1 中只切出了最后一个“我”字的一半（一个字节）。原因是 std::regex 作用于 std::string 时按字节（char）进行匹配：量词 ? 只作用于“我”的最后一个字节，而分组 1 至少要匹配一个字符，回溯之后“我”的最后一个字节就被划进了分组 1。

用wstring的工程中，只能使用unicode方式，真是这样吗？自己做一个测试,修改测试程序的控制台为utf-8编码的 --- 结果是：在VS工程中。不存在这种说法。不管是unicode还是多字节，还是未设置，结果都一样。测试前提是把代码的cpp+h都变换成utf-8格式编码的。

正则表达式中获取匹配组的代码示例：

///Compliments to: https://stackoverflow.com/questions/16749069/c-split-string-by-regex for the below code
/// Splits `s` around every match of the regular expression `rgx_str`.
///
/// @param s        Text to split.
/// @param rgx_str  Delimiter pattern (default ECMAScript grammar of std::regex).
/// @return The pieces of `s` that lie between consecutive matches, in order.
///         If the pattern never matches, the result is the whole of `s`.
/// @throws std::regex_error if `rgx_str` is not a valid pattern.
vector<string> splitString(const string & s, const string & rgx_str)
{
	const regex rgx(rgx_str);

	// Sub-match selector -1 makes the token iterator yield the text *between*
	// matches instead of the matches themselves.
	const sregex_token_iterator first(s.begin(), s.end(), rgx, -1);
	const sregex_token_iterator last;

	// Each token converts to std::string, so the range constructor replaces the
	// hand-written copy loop.
	return vector<string>(first, last);
}

/// Runs a fixed three-group pattern over a fixed sentence and returns what
/// std::sregex_token_iterator yields for the sub-match selector `ipos`.
///
/// @param ipos  Sub-match selector handed to the token iterator:
///              -1 yields the text outside the match, 0 the whole match,
///              1..3 the corresponding capture group.
/// @return The tokens produced for that selector, in order.
vector<string> testRe(int ipos)
{
	const std::string text = "明天我要去太平洋影院去看天下无双定时提醒我";

	// Three capture groups: the text between "我" and "去", between "去" and
	// "去看", and between "去看" and "定时".
	const std::regex pattern("我(.*?)去(.*?)去看(.*)定时");

	const std::sregex_token_iterator first(text.begin(), text.end(), pattern, ipos);
	const std::sregex_token_iterator last;

	// The iterators refer to `text` and `pattern`, so the copy into the result
	// vector has to finish before those locals go out of scope.
	return vector<string>(first, last);
}

// Driver for the two regex demos: it runs testRe() for a series of sub-match
// selectors and splitString() for a series of inputs, keeps every result in a
// local container for inspection in a debugger, then waits for a key press.
int main()
{
	// Selectors passed to testRe(), in call order. Results noted by the
	// original author:
	//   -2 : "" (empty)
	//   -1 : "明天" and "提醒我" (the text before and after the match)
	//    0 : "我要去太平洋影院去看天下无双定时" (group 0, the whole match)
	//    1 : "要"          (group 1)
	//    2 : "太平洋影院"  (group 2)
	//    3 : "天下无双"    (group 3)
	//    4 : "" (empty)
	//    5 : "" (empty)
	const int selectors[] = { -2, -1, 0, 1, 2, 3, 4, 5 };
	vector<vector<string>> groupResults;
	for (const int selector : selectors)
	{
		groupResults.push_back(testRe(selector));
	}

	// Inputs for splitString(), in call order. The last entry replaced an
	// earlier attempt that used the pattern "提醒我?([\\W\\S]{ 1,30 })".
	struct SplitCase
	{
		const char* text;
		const char* pattern;
	};
	const SplitCase splitCases[] = {
		{ "ss12a67rt12ab67tt12abc67uu12abcd67iiopq", "12" },
		{ "ss12a67rt12ab67tt12abc67uu12abcd67iiipq", "12.+67" },
		{ "ss12a67rt1 2ab67tt12abc 67uu1   2abc  d67iiiipq", " " },
		{ "ss12a67rt12ab67tt12abc67uu12abcd67iiiipq", "ab" },
		{ "记得明天提醒我我们最先引入这个问题的原因是在正则的一个项目中", "提醒我?(.{1,11})" },
	};
	vector<vector<string>> splitResults;
	for (const SplitCase& splitCase : splitCases)
	{
		splitResults.push_back(splitString(splitCase.text, splitCase.pattern));
	}

	getchar();
	return 0;
}

正则问题代码：

wcharTest.h：

#pragma once

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <wchar.h>
#include <regex>
#include <string>
using namespace std;

#include <stdio.h> 
#include <wchar.h>

void test_sample();
void test_sample_w();

wcharTest.cpp：

#include "wcharTest.h"


#define BUFFER_SIZE 50

// Reproduces the "half a character" problem with a byte-oriented std::regex:
// searches the narrow string "明天提醒我" with the pattern
// 提醒我?([\W\S]{1,30}) and copies capture group 1 into a local so that it can
// be inspected in a debugger. Nothing is returned or printed.
void test_sample()
{
	//mbtowc()
	//wctomb

	std::string log_tianfu(string("明天提醒我"));
	try {
		// std::regex over std::string works on individual char units, so the
		// quantifier `?` binds only to the last byte of "我".
		std::regex r_tianfu(string("提醒我?([\\W\\S]{1,30})"), regex_constants::ECMAScript);
		// std::regex r_tianfu(curReCfg.strReCont, regex_constants::ECMAScript);

		std::smatch sm_tianfu;//typedef match_results<string::const_iterator> smatch;
		// The commented-out while loop below was meant to keep searching for
		// the best match within one sentence.
		int icntMax = 0;

		//while (regex_search(log_tianfu, sm_tianfu, r_tianfu))
		if (regex_search(log_tianfu, sm_tianfu, r_tianfu) == true)
		{
			// NOTE(review): bfdError, itmpCnt, itmpLen and icntMax are never
			// read in this function; presumably leftovers from project code.
			bool bfdError = false;
			int itmpCnt = 0;
			int itmpLen = 0;
			//for (int ix = 0; ix < vi.size(); ++ix)
			{
				int itmp = 1;// vi[ix];
				// Author's observation: "我" is the two bytes CE D2 here, and
				// str0 receives only the trailing byte D2 (-46 as signed char).
				string str0 = sm_tianfu[itmp].str();
				int ab = 99;// presumably a breakpoint anchor
			}

			log_tianfu = "";// cleared so that no second match is attempted
		}
	}
	catch (...)
	{
		// Every exception is swallowed; `ab` is presumably a breakpoint anchor.
		int ab = 99;
	}
}

// Wide-character counterpart of test_sample(): searches L"明天提醒我" with the
// same pattern through std::wregex and copies capture group 1 into a local so
// that it can be inspected in a debugger. Nothing is returned or printed.
void test_sample_w()
{
	//mbtowc()
	//wctomb

	std::wstring log_tianfu(wstring(L"明天提醒我"));
	try {
		// With wchar_t units the quantifier `?` applies to the wchar_t that
		// holds "我", not to a single byte of it.
		std::wregex r_tianfu(wstring(L"提醒我?([\\W\\S]{1,30})"), regex_constants::ECMAScript);
		// std::regex r_tianfu(curReCfg.strReCont, regex_constants::ECMAScript);

		std::wsmatch sm_tianfu;//typedef match_results<wstring::const_iterator> wsmatch;
		// The commented-out while loop below was meant to keep searching for
		// the best match within one sentence.
		int icntMax = 0;

		//while (regex_search(log_tianfu, sm_tianfu, r_tianfu))
		if (regex_search(log_tianfu, sm_tianfu, r_tianfu) == true)
		{
			// NOTE(review): bfdError, itmpCnt, itmpLen and icntMax are never
			// read in this function; presumably leftovers from project code.
			bool bfdError = false;
			int itmpCnt = 0;
			int itmpLen = 0;
			//for (int ix = 0; ix < vi.size(); ++ix)
			{
				int itmp = 1;// vi[ix];
				// Author's observation: this wide version does not produce the
				// truncated result seen in test_sample().
				wstring str0 = sm_tianfu[itmp].str();
				int ab = 99;// presumably a breakpoint anchor
			}

			log_tianfu = L"";// cleared so that no second match is attempted
		}
	}
	catch (...)
	{
		// Every exception is swallowed; `ab` is presumably a breakpoint anchor.
		int ab = 99;
	}
}

其中，出错的部分是 string str0 = sm_tianfu[itmp].str(); 这一句：得到的 str0 只有一个字节 0xD2（“我”的编码为 CE D2，0xD2 按有符号 char 显示就是 -46）。改用 wstring 之后，wstring str0 = sm_tianfu[itmp].str(); 这一句不会出错。在 Linux 和 Windows 上分别测试，发现是同样的问题：string 的形式在 Linux 上也会出现乱码，只是 Windows 与 Linux 上出现的乱码不一样（两个平台上源码的编码不同，被切出来的字节也就不同）。

然后我自己写了代码进行测试。

测试代码：

// Declares the same text in every character/string type so that the in-memory
// bytes can be compared in a debugger. Nothing is returned or printed.
// The byte dumps below are the original author's observations (little-endian),
// taken with the source file saved as ANSI (GBK) on Windows and as UTF-8 on
// Linux.
void readFunc3()
{
	// 1 byte. Windows: 61. Unix: 61.
	char c0 = 'a';
	// 3 bytes on both platforms: 61 62 63; s0[0]='a' s0[1]='b' s0[2]='c'.
	string s0 = "abc";
	// Windows, ANSI source, 8 bytes (2 per character):
	//   C4 E3 BA C3 CA C0 BD E7 -> st0[0]..st0[7]
	// Unix, UTF-8 source, 12 bytes (3 per character):
	//   E4 BD A0 E5 A5 BD E4 B8 96 E7 95 8C -> st0[0]..st0[11]
	string st0 = "你好世界";


	// Windows: 61 00 (2 bytes). Unix: 61 00 00 00 (4 bytes).
	wchar_t c1 = 'a';
	// Windows: 61 00 62 00 63 00. Unix: 61 00 00 00 62 00 00 00 63 00 00 00.
	// On both: s1[0]='a' s1[1]='b' s1[2]='c'.
	wstring s1 = L"abc";
	// Each element holds the Unicode code point of one character, e.g. "你" is
	// 20320 (0x4F60).
	// Windows: 60 4F 7D 59 16 4E 4C 75.
	// Unix:    60 4F 00 00 7D 59 00 00 16 4E 00 00 4C 75 00 00.
	// On both: st1[0]="你" st1[1]="好" st1[2]="世" st1[3]="界".
	wstring st1 = L"你好世界";

	// Windows: 61 00. Unix: 61 00.
	char16_t  c2 = 'a';
	// Both platforms: 61 00 62 00 63 00; s2[0]='a' s2[1]='b' s2[2]='c'.
	u16string s2 = u"abc";
	// Both platforms: 60 4F 7D 59 16 4E 4C 75;
	// st2[0]="你" st2[1]="好" st2[2]="世" st2[3]="界".
	// 60 4F is the little-endian form of 0x4F60 (decimal 20320), the code of "你".
	u16string st2 = u"你好世界";
	char16_t  c20 = st2[0];// first UTF-16 unit of st2, i.e. "你"

	// Both platforms: 61 00 00 00.
	char32_t  c3 = 'a';
	// Both platforms: 61 00 00 00 62 00 00 00 63 00 00 00;
	// s3[0]='a' s3[1]='b' s3[2]='c'.
	u32string s3 = U"abc";
	// Both platforms: 60 4F 00 00 7D 59 00 00 16 4E 00 00 4C 75 00 00
	// (4 bytes per element); st3[0]="你" st3[1]="好" st3[2]="世" st3[3]="界".
	u32string st3 = U"你好世界";
	char32_t  c30 = st3[0];// first element of st3, i.e. "你" (0x4F60 = 20320)

						   //wstring str = L"酒吧点零";
						   //double result = parseChineseToNum(str);
						   //printf("result = %f", result);
	return;
}

下面想办法来解决这个问题。用wstring来进行尝试处理。

可以查看微软的官方文档：https://www.baidu.com/link?url=KJRM1FaTAiBj8ESSzu21NZZuFTGIrTrKTyEfcAckPqwgUw6k_985rbddumhmya-RBSlv-eBEin2ne_8Uv9tQ__&wd=&eqid=b8393a35002d99c4000000065dca0cd2

其中有一个例子：

// crt__wfopen.c
// compile with: /W3
// This program creates a file (or overwrites one if
// it exists), in text mode using Unicode encoding.
// It then writes two strings into the file
// and then closes the file.

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <wchar.h>

#define BUFFER_SIZE 50

// Creates (or overwrites) _wfopen_test.xml in Unicode text mode, writes an
// opening and a closing XML tag into it, and closes it again. Failures are
// reported with wprintf; the process exit code is 0 in every case.
int main(int argc, char** argv)
{
	// The two fragments written to the file, in order.
	const wchar_t* const fragments[] = { L"<xmlTag>\n", L"</xmlTag>" };
	const size_t fragmentCount = sizeof(fragments) / sizeof(fragments[0]);

	wchar_t buffer[BUFFER_SIZE];
	size_t  index;
	FILE*   out;

	// Open the xml file in text and Unicode encoding mode.
	// Note: _wfopen is deprecated (warning C4996); consider _wfopen_s instead.
	out = _wfopen(L"_wfopen_test.xml", L"wt+,ccs=UNICODE");
	if (out == NULL)
	{
		wprintf(L"_wfopen failed!\n");
		return(0);
	}

	for (index = 0; index < fragmentCount; ++index)
	{
		size_t length;

		// Copy the fragment into the fixed-size buffer, then write exactly
		// the copied characters (no terminator).
		wcscpy_s(buffer, sizeof(buffer) / sizeof(wchar_t), fragments[index]);
		length = wcslen(buffer);
		if (fwrite(buffer, sizeof(wchar_t), length, out) != length)
		{
			wprintf(L"fwrite failed!\n");
		}
	}

	// fclose returns non-zero on failure.
	if (fclose(out))
	{
		wprintf(L"fclose failed!\n");
	}
	return 0;
}

上面自己写的测试程序，是为了搞清楚各种编码在内存中的实际存储方式。事实上，使用 wstring 确实能解决很多问题，但并不是所有的函数都有处理 wstring 的版本，所以参考 https://github.com/zyl910/tcharall 来解决。

tcharall解决思路：

tcharall主要是通过宏来控制底层的char与wchar_t与string和wstring的一种转换，如果相应平台有对应的头文件来转换，那么就用对应平台的函数功能，否则就用它自己定义的一些宏。
然后在测试的过程中，发现
wcout.imbue(std::locale("chs"));
wcin.imbue(std::locale("chs"));
wcout << L"Parsed:" << endl;
getline(wcin, response, wchar_t('\n'));
getline(wcin, response, L'\n');
这段测试代码，在windows下面是正确的，可以正确的显示出中文的输入与输出，但是在unix下面，不能正确的进行中文的输入与输出。
在unix下面，
wcout.imbue(std::locale("chs"));
wcin.imbue(std::locale("chs")); 这两行代码引起的错误信息如下：
terminate called after throwing an instance of 'std::runtime_error' what(): locale::facet::_S_create_c_locale name not valid。网上搜索，发现是本地机器没有安装相应的语言包支持。原因：接口库文件在接口调用时有返回GBK字段，但是由于本地机器没有安装相应的语言包支持。所以程序出错崩溃。解决方法：安装中文语言包支持。
sudo dpkg-reconfigure locales
然后 vim /home/nan/.bashrc
#添加一行
export LC_ALL="C"
或者直接在bash终端 export LC_ALL="C"
结果都没有解决问题。于是参照 https://blog.youkuaiyun.com/wallaceli1981/article/details/6116738 来解决此问题。代码如下：

// Locale experiment: installs "zh_CN.UTF-8" as the global locale, then writes
// wide and narrow text through wcout, cout, printf and wprintf, switches the
// global locale to the user's default (""), and finally echoes one word read
// from wcin and one read from cin. The trailing remarks are the original
// author's observations on Windows and Unix.
int main()
{
	locale lc("zh_CN.UTF-8");// author: works on both Unix and Windows, so use this line everywhere
	//locale lc("chs");// author: works on Windows
	locale::global(lc);// author: on Unix wide output of Chinese works after this but cout stops working; on Windows cin, cout, wcin and wcout all work
	//wcout.imbue(lc);// author: fails on Unix; works on Windows, but the line above is preferred for consistency
	//wcin.imbue(lc);// author: fails on Unix; works on Windows, but the line above is preferred for consistency

	const wchar_t* strzh = L"中文abc";
	wcout << L"Zhong text is: " << strzh << endl;
    printf("wchar_t string: %ls \n", strzh );

	std::printf("printf --Integers\n");
	std::wprintf(L"wprintf-- Integers\n");

	//wcout << L"wcout--Current global locale is: " << L'\n';
	//wcout << L"wcout--Current global locale is: " << lc.name() << L'\n';// author: lc.name()=C on Windows and on Unix
	cout << "cout--Current global locale is: " << lc.name() << endl;// author: lc.name()=C on Windows and on Unix

	// Replace the global locale with the user's default; s0 receives the
	// previous global locale.
	auto s0 = locale::global(locale(""));
	std::wcout << L"wcout 再次输出同一数字 locale setting is 再次输出同一数字" << std::locale("").name().c_str() << L'\n';
	std::cout << "cout 再次输出同一数字 locale setting is 再次输出同一数字" << std::locale("").name().c_str() << '\n';

	// Original note: "at startup the global locale is the "C" locale".
	// NOTE(review): by this point the global locale has already been replaced
	// twice above, so that note no longer describes the state here.
	std::wcout << 1000.01 << L'\n';
	std::cout << 1000.01 << '\n';

	// Original note: "future wide-character output uses the new global
	// locale"; the imbue call that would do this is commented out.
	//std::wcout.imbue(std::locale());
	std::wcout << L"wcout 再次输出同一数字" << L'\n';
	std::cout << "cout 再次输出同一数字" << '\n';

	// Echo one whitespace-delimited word through the wide streams.
	wstring sin0;
	wcin >> sin0;
	wcout << sin0;

	// Echo one whitespace-delimited word through the narrow streams.
	string sin1;
	cin >> sin1;
	cout << sin1;
	
	getchar();
}

宽字符字符串转换成单字符串。如wstring转化成string，在linux+windows上面测试：

/*
Converts a narrow multibyte C string to a std::wstring.

The bytes are decoded by mbstowcs according to the C locale, which this
function sets to "zh_CN.UTF-8" on every call (a process-wide side effect).
If that locale is not installed, setlocale fails and the previous locale stays
in effect.

pc      Null-terminated multibyte string; may be NULL.
return  The decoded wide string, or an empty string when pc is NULL, empty,
        or contains a byte sequence that is invalid in the active locale.
*/
std::wstring c2w(const char *pc)
{
#ifdef _WIN32
	//std::setlocale(LC_ALL, "chs");// use this instead when pc is ANSI (GBK): 2 bytes per Chinese character
	std::setlocale(LC_ALL, "zh_CN.UTF-8");// pc is UTF-8: 3 bytes per Chinese character
#else
	std::setlocale(LC_ALL, "zh_CN.UTF-8");
#endif

	std::wstring val;

	if (NULL == pc)
	{
		return val;
	}

	// First pass: a null destination only counts the wide characters needed
	// (terminator excluded); (size_t)-1 signals an invalid sequence.
	const size_t destlen = mbstowcs(NULL, pc, 0);
	if (destlen == static_cast<size_t>(-1) || destlen == 0)
	{
		return val;
	}

	// Second pass: decode straight into the string's own storage, so there is
	// no separately allocated buffer to release. mbstowcs writes at most
	// destlen wide characters, so it stays inside the string.
	val.resize(destlen);
	const size_t written = mbstowcs(&val[0], pc, destlen);
	if (written == static_cast<size_t>(-1))
	{
		val.clear();
		return val;
	}
	val.resize(written);
	return val;
}

/*
Converts a wide C string to a narrow multibyte std::string.

The characters are encoded by wcstombs according to the C locale, which this
function sets to "zh_CN.UTF-8" on every call (a process-wide side effect).
If that locale is not installed, setlocale fails and the previous locale stays
in effect.

pw      Null-terminated wide string; may be NULL.
return  The encoded string, or an empty string when pw is NULL, empty, or
        contains a character that cannot be represented in the active locale.
*/
std::string w2c(const wchar_t * pw)
{
#ifdef _WIN32
	//std::setlocale(LC_ALL, "chs");// use this instead when the result should be ANSI (GBK): 2 bytes per Chinese character
	std::setlocale(LC_ALL, "zh_CN.UTF-8");// result is UTF-8: 3 bytes per Chinese character
#else// no ANSI variant to consider on Linux
	std::setlocale(LC_ALL, "zh_CN.UTF-8");
#endif

	if (!pw)
	{
		return "";
	}

	// First pass: a null destination only counts the bytes needed (terminator
	// excluded). wcstombs reports failure as (size_t)-1, not 0; 0 simply
	// means the input was empty.
	const size_t needed = wcstombs(NULL, pw, 0);
	if (needed == static_cast<size_t>(-1) || needed == 0)
	{
		return "";
	}

	// Second pass: encode straight into the string's own storage, so nothing
	// has to be freed and no path can leak a buffer.
	std::string val(needed, '\0');
	const size_t written = wcstombs(&val[0], pw, needed);
	if (written == static_cast<size_t>(-1))
	{
		return "";
	}
	val.resize(written);
	return val;
}

总结上述代码：程序内部应当统一使用一种编码方式，并通过 setlocale 设置与该编码一致的 locale，之后就可以正确地进行字符的转换和处理了。

#include <codecvt>

错误 LNK2001 无法解析的外部符号 "__declspec(dllimport) public: static class std::locale::id std::codecvt<char16_t,char,struct _Mbstatet>::id" (__imp_?id@?$codecvt@_SDU_Mbstatet@@@std@@2V0locale@2@A) ConsoleTest E:\ljtStudy\ConsoleTest\ConsoleTest\ConsoleTest.obj 1
Windows 下面的解决方案：网上的说法是链接 msvcprt.lib 即可；实际上不需要，把 char16_t 字符串替换成 wchar_t 字符串就可以了。

https://social.msdn.microsoft.com/Forums/en-US/8f40dcd8-c67f-4eba-9134-a19b9178e481/vs-2015-rc-linker-stdcodecvt-error?forum=vcgeneral

下面是使用 codecvt 进行文件读取的一些代码：

#include <cstdio>
#include <cstdlib>
#include <codecvt>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <locale>
#include <map>
#include <sstream>
#include <string>

// Reads the whole file `filename` as UTF-8 text and returns it as a wide
// string (Windows variant: std::locale::empty() is an MSVC extension).
std::wstring readFile(const char* filename)
{
	// Locale whose only facet decodes UTF-8 bytes into wchar_t; the locale
	// object takes ownership of the facet allocated here.
	const std::locale utf8Locale(std::locale::empty(), new std::codecvt_utf8<wchar_t>);

	std::wifstream input(filename);
	input.imbue(utf8Locale);

	// Drain the stream buffer in one go.
	std::wstringstream contents;
	contents << input.rdbuf();
	return contents.str();
}

int main()
{
	wstring str0=readFile("F:/utf8.txt");//utf8.txt是一个utf-8的文本,str0中读取的数据就是读取出的utf8文本
	getchar();
｝

上面是windows上的代码，下面是linux下面的代码：

// Reads the whole file `filename` as UTF-8 text and returns it as a wide
// string (Linux variant). Side effect: replaces the global C++ locale with
// "zh_CN.UTF-8"; constructing that locale throws std::runtime_error when it is
// not installed.
std::wstring readFile(const char* filename)
{
	// Alternatives that were tried and dropped:
	//wif.imbue(std::locale(std::locale::empty(), new std::codecvt_utf8<wchar_t>));
	//wif.imbue(std::locale("chs"));

	// The global locale is switched first so that the stream created below
	// picks it up on construction.
	std::locale::global(std::locale("zh_CN.UTF-8"));

	std::wifstream input(filename);
	std::wstringstream contents;
	contents << input.rdbuf();
	return contents.str();
}

int main()
{
	wstring str0=readFile("/home/lak/eclipse-workspace/testCandel/utf8.txt");
	wcout<<str0<<endl;
	wcout<<str0[52]<<endl;
	wcout<<str0[51]<<endl;
	wcout<<str0[50]<<endl;
	wcout<<str0[49]<<endl;
	getchar();
}

下面是一些有用的网站：https://www.orcode.com/question/1192472_kb0b35.html
http://www.cplusplus.com/reference/codecvt/codecvt_utf8/
https://bbs.youkuaiyun.com/topics/391854505
https://blog.youkuaiyun.com/zrufo747/article/details/80233764
https://blog.youkuaiyun.com/MaxWoods/article/details/40506939
http://it.cppreference.com/w/cpp/locale/wstring_convert

// Reads the UTF-8 text file `filename` line by line and returns its contents
// as a wide string, each line terminated by L'\n'.
//
// Windows: the stream is imbued with a UTF-8 codecvt facet
// (std::locale::empty() is an MSVC extension).
// Other platforms: the global C++ locale is replaced with "zh_CN.UTF-8", which
// must be installed; otherwise constructing it throws std::runtime_error.
//
// Returns an empty string when the file cannot be opened or is empty.
std::wstring readFile(const char* filename)
{
#ifdef _WIN32
	std::wifstream wif(filename);
	wif.imbue(std::locale(std::locale::empty(), new std::codecvt_utf8<wchar_t>));
#else
	// The stream has to be constructed after the global locale is switched,
	// otherwise the new locale has no effect on it. It is therefore declared
	// only once, here.
	std::locale::global(std::locale("zh_CN.UTF-8"));
	std::wifstream wif(filename);
#endif

	std::wstring contents;
	std::wstring line;
	// std::getline grows `line` as needed, so long lines neither get truncated
	// nor stop the loop early.
	while (std::getline(wif, line))
	{
		contents += line;
		contents += L'\n';
	}
	return contents;
}

写入数据到UTF8文件中。：

// Writes every entry of `inputMp` to the file `pspecialRulesPath` as UTF-8
// text, one "<key> <value>" pair per line. The file is created if missing and
// truncated if it already exists.
//
// pspecialRulesPath  Wide-character path of the output file; NULL is ignored.
// inputMp            Entries to write, emitted in the map's key order.
//
// Other than on Windows, the global C++ locale is replaced with "zh_CN.UTF-8",
// which must be installed; otherwise constructing it throws
// std::runtime_error. If the path cannot be converted or the file cannot be
// opened, the function returns without writing anything.
void writeFilePYInfo(const wchar_t *pspecialRulesPath, const map<wstring, int>&inputMp)
{
	if (pspecialRulesPath == NULL)
	{
		return;
	}

#ifdef _WIN32
	// An output stream is used so that a missing file is created. The UTF-8
	// facet is imbued before the file is opened (std::locale::empty() and
	// wide-character paths are MSVC extensions).
	std::wofstream wof;
	wof.imbue(std::locale(std::locale::empty(), new std::codecvt_utf8<wchar_t>));
	wof.open(pspecialRulesPath);
#else
	// The stream has to be constructed after the global locale is switched,
	// otherwise the new locale has no effect on it.
	std::locale::global(std::locale("zh_CN.UTF-8"));

	// Standard file streams only accept narrow paths here, so the path is
	// converted with the locale that was just installed.
	const size_t pathBytes = std::wcstombs(NULL, pspecialRulesPath, 0);
	if (pathBytes == static_cast<size_t>(-1))
	{
		return;
	}
	std::string narrowPath(pathBytes, '\0');
	if (pathBytes != 0)
	{
		std::wcstombs(&narrowPath[0], pspecialRulesPath, pathBytes);
	}
	std::wofstream wof(narrowPath.c_str());
#endif

	if (!wof)
	{
		return;
	}

	for (const auto& entry : inputMp)
	{
		// L'\n' instead of endl: no flush per line; the stream is flushed
		// once when it is closed at the end of the function.
		wof << entry.first << L" " << entry.second << L'\n';
	}
}