// [Code] C++ source, originally published at http://www.oschina.net/code/snippet_170948_18231
/*
Copyright (C) 2004-2005 Cory Nelson

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
020 |
021 | // namespaces added by Arvid Norberg |
022 |
023 | #ifndef __UTF8_H__ |
024 | #define __UTF8_H__ |
025 |
026 | #include <string> |
027 | #include <iterator> |
028 | #include <stdexcept> |
029 | #include <cwchar> |
030 |
namespace detail {

// Reports whether a single wchar_t can hold every Unicode code point
// (up to U+10FFFF). When it cannot (16-bit wchar_t), code points above
// U+FFFF have to be represented as UTF-16 surrogate pairs.
inline bool wchar_holds_any_code_point()
{
    return WCHAR_MAX >= 0x10FFFF;
}

// Consumes one UTF-8 continuation byte (10xxxxxx) from `iter` and returns
// its six payload bits.
//
// Throws std::runtime_error("incomplete UTF-8 sequence") when the input
// ends first, and std::runtime_error("invalid UTF-8 sequence") when the
// next byte is not a continuation byte (in which case `iter` is not
// advanced).
template <typename InputIterator>
wchar_t decode_utf8_mb(InputIterator &iter, InputIterator last)
{
    if (iter == last) throw std::runtime_error("incomplete UTF-8 sequence");
    if (((*iter) & 0xc0) != 0x80) throw std::runtime_error("invalid UTF-8 sequence");

    return static_cast<wchar_t>((*iter++) & 0x3f);
}

// Decodes one complete UTF-8 sequence (1 to 4 bytes) starting at `iter`,
// advances `iter` past it and returns the Unicode code point.
//
// Throws std::runtime_error when the range is empty or ends in the middle
// of a sequence, when a continuation byte is malformed, when the lead byte
// cannot start a 1-4 byte sequence, when the sequence is an overlong
// encoding, or when the value exceeds U+10FFFF.
//
// Code points in the surrogate range (U+D800..U+DFFF) are deliberately
// still accepted, so that text written by earlier versions of the encoder
// (which stored each UTF-16 surrogate as its own 3-byte sequence) keeps
// decoding.
template <typename InputIterator>
unsigned long decode_utf8_code_point(InputIterator &iter, InputIterator last)
{
    if (iter == last) throw std::runtime_error("incomplete UTF-8 sequence");

    // Mask to one byte so that a negative (signed) char becomes 0x80..0xFF.
    const unsigned long lead = static_cast<unsigned long>(*iter) & 0xff;

    if (lead < 0x80) // one byte
    {
        ++iter;
        return lead;
    }

    unsigned long cp;      // payload bits collected so far
    unsigned long min_cp;  // smallest value that genuinely needs this length
    int trailing;          // number of continuation bytes still expected

    if ((lead & 0xe0) == 0xc0) // two bytes
    {
        cp = lead & 0x1f;
        trailing = 1;
        min_cp = 0x80;
    }
    else if ((lead & 0xf0) == 0xe0) // three bytes
    {
        cp = lead & 0x0f;
        trailing = 2;
        min_cp = 0x800;
    }
    else if ((lead & 0xf8) == 0xf0) // four bytes
    {
        cp = lead & 0x07;
        trailing = 3;
        min_cp = 0x10000;
    }
    else throw std::runtime_error("UTF-8 not convertable to UTF-16");

    ++iter;
    for (; trailing > 0; --trailing)
        cp = (cp << 6) | static_cast<unsigned long>(decode_utf8_mb(iter, last));

    // Reject overlong forms (e.g. C0 80 for U+0000) and values past the end
    // of the Unicode code space.
    if (cp < min_cp || cp > 0x10FFFF)
        throw std::runtime_error("invalid UTF-8 sequence");

    return cp;
}

// Decodes one UTF-8 sequence starting at `iter` into a single wchar_t and
// advances `iter` past it.
//
// Throws std::runtime_error for malformed or truncated input (see
// decode_utf8_code_point), and std::runtime_error("UTF-8 not convertable to
// UTF-16") when the code point lies above U+FFFF but wchar_t is too narrow
// to hold it in one unit. Use utf8_wchar() to get surrogate pairs instead.
template <typename InputIterator>
wchar_t decode_utf8(InputIterator &iter, InputIterator last)
{
    const unsigned long cp = decode_utf8_code_point(iter, last);

    if (cp > 0xFFFF && !wchar_holds_any_code_point())
        throw std::runtime_error("UTF-8 not convertable to UTF-16");

    return static_cast<wchar_t>(cp);
}

// Decodes the UTF-8 bytes in [first, last) and writes the resulting wide
// characters to `dest`. Returns the output iterator past the last element
// written.
//
// Code points above U+FFFF are written as one wchar_t when wchar_t is wide
// enough, and as a UTF-16 surrogate pair (two wchar_t) otherwise.
//
// Throws std::runtime_error on malformed or truncated input; elements
// decoded before the error have already been written to `dest`.
template <typename InputIterator, typename OutputIterator>
OutputIterator utf8_wchar(InputIterator first, InputIterator last, OutputIterator dest)
{
    while (first != last)
    {
        const unsigned long cp = decode_utf8_code_point(first, last);

        if (cp > 0xFFFF && !wchar_holds_any_code_point())
        {
            // Split into high/low surrogate: 10 payload bits each.
            const unsigned long v = cp - 0x10000;
            *dest = static_cast<wchar_t>(0xD800 | (v >> 10));
            ++dest;
            *dest = static_cast<wchar_t>(0xDC00 | (v & 0x3FF));
            ++dest;
        }
        else
        {
            *dest = static_cast<wchar_t>(cp);
            ++dest;
        }
    }
    return dest;
}

// Writes the UTF-8 encoding (1 to 4 bytes) of code point `cp` to `dest`,
// advancing `dest` once per byte written.
//
// Values above U+10FFFF are not code points; they are written as U+FFFD
// (REPLACEMENT CHARACTER) rather than being dropped.
template <typename OutputIterator>
void encode_code_point(unsigned long cp, OutputIterator &dest)
{
    if (cp > 0x10FFFF) cp = 0xFFFD;

    if (cp <= 0x007F)
    {
        *dest = static_cast<char>(cp);
        ++dest;
    }
    else if (cp <= 0x07FF)
    {
        *dest = static_cast<char>(0xC0 | (cp >> 6));
        ++dest;
        *dest = static_cast<char>(0x80 | (cp & 0x3F));
        ++dest;
    }
    else if (cp <= 0xFFFF)
    {
        *dest = static_cast<char>(0xE0 | (cp >> 12));
        ++dest;
        *dest = static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        ++dest;
        *dest = static_cast<char>(0x80 | (cp & 0x3F));
        ++dest;
    }
    else
    {
        *dest = static_cast<char>(0xF0 | (cp >> 18));
        ++dest;
        *dest = static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
        ++dest;
        *dest = static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        ++dest;
        *dest = static_cast<char>(0x80 | (cp & 0x3F));
        ++dest;
    }
}

// Encodes the single wide character `*iter` as UTF-8 into `dest`.
//
// The element is treated as a stand-alone code point: no surrogate pairing
// is attempted here (wchar_utf8() does that, because it can look at the
// following element). Negative or out-of-range values are written as
// U+FFFD.
template <typename InputIterator, typename OutputIterator>
void encode_wchar(InputIterator iter, OutputIterator &dest)
{
    encode_code_point(static_cast<unsigned long>(*iter), dest);
}

// Encodes the wide characters in [first, last) as UTF-8 and writes the
// bytes to `dest`. Returns the output iterator past the last byte written.
//
// A high surrogate (U+D800..U+DBFF) immediately followed by a low surrogate
// (U+DC00..U+DFFF) is combined into one code point and written as a single
// 4-byte sequence. An unpaired surrogate is written as its own 3-byte
// sequence.
template <typename InputIterator, typename OutputIterator>
OutputIterator wchar_utf8(InputIterator first, InputIterator last, OutputIterator dest)
{
    while (first != last)
    {
        unsigned long cp = static_cast<unsigned long>(*first);
        ++first;

        if (cp >= 0xD800 && cp <= 0xDBFF && first != last)
        {
            // Peek at the next unit; only consume it if it completes the pair.
            const unsigned long low = static_cast<unsigned long>(*first);
            if (low >= 0xDC00 && low <= 0xDFFF)
            {
                cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
                ++first;
            }
        }

        encode_code_point(cp, dest);
    }
    return dest;
}

}
131 |
132 | inline void utf8_wchar( const std::string &utf8, std::wstring &wide) |
133 | { |
134 | wide.clear(); |
135 | detail::utf8_wchar(utf8.begin(), utf8.end(), std::back_inserter(wide)); |
136 | } |
137 |
138 | inline std::wstring utf8_wchar( const std::string &str) |
139 | { |
140 | std::wstring ret; |
141 | utf8_wchar(str, ret); |
142 | return ret; |
143 | } |
144 |
145 | inline void wchar_utf8( const std::wstring &wide, std::string &utf8) |
146 | { |
147 | utf8.clear(); |
148 | detail::wchar_utf8(wide.begin(), wide.end(), std::back_inserter(utf8)); |
149 | } |
150 |
151 | inline std::string wchar_utf8( const std::wstring &str) |
152 | { |
153 | std::string ret; |
154 | wchar_utf8(str, ret); |
155 | return ret; |
156 | } |
157 |
158 | #endif // __UTF8_H__ |