从宽字符转换到UTF8的代码

本文提供了一个实用的C++代码片段,用于在UTF-8与UTF-16编码之间进行转换。该代码定义了两个主要的转换函数:`utf8_wchar`用于将UTF-8字符串转换为UTF-16宽字符字符串;`wchar_utf8`用于将UTF-16宽字符字符串转换为UTF-8字符串。此外,还提供了便捷的接口函数简化转换过程。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

[代码] cpp代码  http://www.oschina.net/code/snippet_170948_18231

001 /*
002     Copyright (C) 2004-2005 Cory Nelson
003  
004     This software is provided 'as-is', without any express or implied
005     warranty.  In no event will the authors be held liable for any damages
006     arising from the use of this software.
007  
008     Permission is granted to anyone to use this software for any purpose,
009     including commercial applications, and to alter it and redistribute it
010     freely, subject to the following restrictions:
011  
012     1. The origin of this software must not be misrepresented; you must not
013         claim that you wrote the original software. If you use this software
014         in a product, an acknowledgment in the product documentation would be
015         appreciated but is not required.
016     2. Altered source versions must be plainly marked as such, and must not be
017         misrepresented as being the original software.
018     3. This notice may not be removed or altered from any source distribution.
019 */
020  
021 // namespaces added by Arvid Norberg
022  
023 #ifndef __UTF8_H__
024 #define __UTF8_H__
025  
026 #include <string>
027 #include <iterator>
028 #include <stdexcept>
029 #include <cwchar>
030  
031 namespace detail {
032  
// Consumes one UTF-8 continuation byte (bit pattern 10xxxxxx) from [iter, last).
//
// On success `iter` is advanced past the byte and its six payload bits are
// returned. On failure `iter` is left pointing at the offending position.
//
// Throws std::runtime_error("incomplete UTF-8 sequence") if the range is
// exhausted, or std::runtime_error("invalid UTF-8 sequence") if the byte is
// not a continuation byte.
template<typename InputIterator>
wchar_t decode_utf8_mb(InputIterator &iter, InputIterator last)
{
    if (iter == last) throw std::runtime_error("incomplete UTF-8 sequence");

    // Go through unsigned char so a signed `char` never sign-extends.
    const unsigned char byte = static_cast<unsigned char>(*iter);
    if ((byte & 0xc0) != 0x80) throw std::runtime_error("invalid UTF-8 sequence");

    ++iter;
    return static_cast<wchar_t>(byte & 0x3f);
}

// Decodes a single code point from the UTF-8 range [iter, last) and advances
// `iter` past the bytes that were consumed.
//
// One-, two- and three-byte sequences are always supported. Four-byte
// sequences (code points above U+FFFF) are decoded only when wchar_t is wide
// enough to hold them in a single unit; on 16-bit wchar_t platforms they are
// still rejected because one return value cannot carry a surrogate pair.
//
// Throws std::runtime_error when:
//   - the range is empty or ends in the middle of a sequence,
//   - a continuation byte is malformed,
//   - the sequence is an overlong encoding or exceeds U+10FFFF,
//   - the lead byte is not a valid lead byte / not representable in wchar_t.
//
// Note: three-byte sequences that encode surrogate code units (U+D800..U+DFFF)
// are deliberately still accepted so that data written on 16-bit wchar_t
// platforms keeps round-tripping.
template<typename InputIterator>
wchar_t decode_utf8(InputIterator &iter, InputIterator last)
{
    // Never dereference the end iterator.
    if (iter == last) throw std::runtime_error("incomplete UTF-8 sequence");

    const unsigned char lead = static_cast<unsigned char>(*iter);

    unsigned long cp;        // code point being assembled
    unsigned long smallest;  // smallest value this sequence length may encode
    int trailing;            // number of continuation bytes expected

    if ((lead & 0x80) == 0) // one byte
    {
        ++iter;
        return static_cast<wchar_t>(lead);
    }
    else if ((lead & 0xe0) == 0xc0) // two bytes
    {
        cp = lead & 0x1f;
        trailing = 1;
        smallest = 0x80UL;
    }
    else if ((lead & 0xf0) == 0xe0) // three bytes
    {
        cp = lead & 0x0f;
        trailing = 2;
        smallest = 0x800UL;
    }
    else if ((lead & 0xf8) == 0xf0 && sizeof(wchar_t) >= 4) // four bytes
    {
        cp = lead & 0x07;
        trailing = 3;
        smallest = 0x10000UL;
    }
    // TODO: support surrogate pairs
    else throw std::runtime_error("UTF-8 not convertable to UTF-16");

    ++iter;
    for (; trailing > 0; --trailing)
        cp = (cp << 6) | static_cast<unsigned long>(decode_utf8_mb(iter, last));

    // Overlong forms (e.g. C0 80 for NUL) and values past U+10FFFF are not
    // valid UTF-8 and are a classic way to smuggle characters past filters.
    if (cp < smallest || cp > 0x10FFFFUL)
        throw std::runtime_error("invalid UTF-8 sequence");

    return static_cast<wchar_t>(cp);
}
069  
// Decodes the UTF-8 range [first, last) one code point at a time, writing
// each decoded wide character through `dest`.
//
// Returns the output iterator positioned after the last element written.
// Any exception raised by decode_utf8 propagates to the caller.
template<typename InputIterator, typename OutputIterator>
OutputIterator utf8_wchar(InputIterator first, InputIterator last, OutputIterator dest)
{
    while (first != last)
    {
        // decode_utf8 advances `first` past the bytes it consumed.
        *dest = decode_utf8(first, last);
        ++dest;
    }
    return dest;
}
077  
// Encodes the single wide character at `*iter` as UTF-8 and writes the
// resulting bytes (as `char`) through `dest`, advancing `dest` once per byte.
//
//   U+0000  .. U+007F    -> 1 byte
//   U+0080  .. U+07FF    -> 2 bytes
//   U+0800  .. U+FFFF    -> 3 bytes
//   U+10000 .. U+10FFFF  -> 4 bytes (only reachable when wchar_t is wider
//                           than 16 bits)
//
// Throws std::runtime_error if the value is not a Unicode code point, i.e. it
// is above U+10FFFF or is negative on a platform with a signed wchar_t.
// Previously such values were silently dropped or truncated to one byte.
//
// Limitation: only one unit is examined, so on 16-bit wchar_t platforms each
// half of a surrogate pair is still emitted as its own three-byte sequence.
template<typename InputIterator, typename OutputIterator>
void encode_wchar(InputIterator iter, OutputIterator &dest)
{
    // Widen to an unsigned type first: a negative wchar_t then becomes a huge
    // value and is rejected below instead of matching the ASCII branch.
    const unsigned long cp = static_cast<unsigned long>(*iter);

    if (cp <= 0x007FUL)
    {
        *dest = static_cast<char>(cp);
        ++dest;
    }
    else if (cp <= 0x07FFUL)
    {
        *dest = static_cast<char>(0xC0 | (cp >> 6));
        ++dest;

        *dest = static_cast<char>(0x80 | (cp & 0x3F));
        ++dest;
    }
    else if (cp <= 0xFFFFUL)
    {
        *dest = static_cast<char>(0xE0 | (cp >> 12));
        ++dest;

        *dest = static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        ++dest;

        *dest = static_cast<char>(0x80 | (cp & 0x3F));
        ++dest;
    }
    else if (cp <= 0x10FFFFUL)
    {
        *dest = static_cast<char>(0xF0 | (cp >> 18));
        ++dest;

        *dest = static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
        ++dest;

        *dest = static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        ++dest;

        *dest = static_cast<char>(0x80 | (cp & 0x3F));
        ++dest;
    }
    else throw std::runtime_error("wide character not convertable to UTF-8");
}
121  
// Encodes every wide character in [first, last) as UTF-8, writing the bytes
// through `dest`.
//
// Returns the output iterator positioned after the last byte written.
template<typename InputIterator, typename OutputIterator>
OutputIterator wchar_utf8(InputIterator first, InputIterator last, OutputIterator dest)
{
    while (first != last)
    {
        // encode_wchar takes `dest` by reference and advances it itself.
        encode_wchar(first, dest);
        ++first;
    }
    return dest;
}
129  
130 }
131  
132 inline void utf8_wchar(const std::string &utf8, std::wstring &wide)
133 {
134     wide.clear();
135     detail::utf8_wchar(utf8.begin(), utf8.end(), std::back_inserter(wide));
136 }
137  
138 inline std::wstring utf8_wchar(const std::string &str)
139 {
140     std::wstring ret;
141     utf8_wchar(str, ret);
142     return ret;
143 }
144  
145 inline void wchar_utf8(const std::wstring &wide, std::string &utf8)
146 {
147     utf8.clear();
148     detail::wchar_utf8(wide.begin(), wide.end(), std::back_inserter(utf8));
149 }
150  
151 inline std::string wchar_utf8(const std::wstring &str)
152 {
153     std::string ret;
154     wchar_utf8(str, ret);
155     return ret;
156 }
157  
158 #endif // __UTF8_H__

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值