Open Chinese Convert 1.1.2
A project for conversion between Traditional and Simplified Chinese
UTF8Util.hpp
1/*
2 * Open Chinese Convert
3 *
4 * Copyright 2013 Carbo Kuo <byvoid@byvoid.com>
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#pragma once
20
21#ifdef _MSC_VER
22#define NOMINMAX
23#include <Windows.h>
24#undef NOMINMAX
25#endif // _MSC_VER
26
27#include <cstring>
28
29#include "Common.hpp"
30#include "Exception.hpp"
31
32namespace opencc {
37class OPENCC_EXPORT UTF8Util {
38public:
42 static void SkipUtf8Bom(FILE* fp);
43
/**
 * Classifies the byte at `str` as a UTF-8 lead byte and reports how many
 * bytes the sequence it introduces occupies.
 *
 * Only the first byte is inspected; the following bytes are not validated.
 * The legacy 5- and 6-byte forms are recognised as well.
 *
 * @param str Pointer to the first byte of a character. Must be readable.
 * @return 1 to 6 for a recognised lead byte (a NUL byte counts as 1), or 0
 *         when the byte cannot start a sequence (continuation bytes
 *         0x80-0xBF, and 0xFE / 0xFF).
 */
static size_t NextCharLengthNoException(const char* str) {
  const unsigned char lead = static_cast<unsigned char>(*str);
  // The three-byte pattern is tested first, ahead of ASCII; presumably
  // because most CJK characters are three bytes long -- confirm before
  // reordering.
  if ((lead >> 4) == 0x0E) {  // 1110xxxx
    return 3;
  }
  if (lead < 0x80) {  // 0xxxxxxx
    return 1;
  }
  if ((lead >> 5) == 0x06) {  // 110xxxxx
    return 2;
  }
  if ((lead >> 3) == 0x1E) {  // 11110xxx
    return 4;
  }
  if ((lead >> 2) == 0x3E) {  // 111110xx
    return 5;
  }
  if ((lead >> 1) == 0x7E) {  // 1111110x
    return 6;
  }
  return 0;
}
65
69 static size_t NextCharLength(const char* str) {
70 size_t length = NextCharLengthNoException(str);
71 if (length == 0) {
72 throw InvalidUTF8(str);
73 }
74 return length;
75 }
76
80 static size_t PrevCharLength(const char* str) {
81 {
82 const size_t length = NextCharLengthNoException(str - 3);
83 if (length == 3) {
84 return length;
85 }
86 }
87 {
88 const size_t length = NextCharLengthNoException(str - 1);
89 if (length == 1) {
90 return length;
91 }
92 }
93 {
94 const size_t length = NextCharLengthNoException(str - 2);
95 if (length == 2) {
96 return length;
97 }
98 }
99 for (size_t i = 4; i <= 6; i++) {
100 const size_t length = NextCharLengthNoException(str - i);
101 if (length == i) {
102 return length;
103 }
104 }
105 throw InvalidUTF8(str);
106 }
107
111 static const char* NextChar(const char* str) {
112 return str + NextCharLength(str);
113 }
114
118 static const char* PrevChar(const char* str) {
119 return str - PrevCharLength(str);
120 }
121
125 static size_t Length(const char* str) {
126 size_t length = 0;
127 while (*str != '\0') {
128 str = NextChar(str);
129 length++;
130 }
131 return length;
132 }
133
140 static const char* FindNextInline(const char* str, const char ch) {
141 while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
142 str = NextChar(str);
143 }
144 return str;
145 }
146
/**
 * Tells whether a byte terminates a line or the whole string.
 *
 * @param ch Byte to test.
 * @return true for '\0', '\n' and '\r'; false for every other value.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
    case '\0':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
153
/**
 * Copies the first `length` bytes of `str` into a new std::string.
 *
 * The copy has strncpy semantics: if a NUL byte occurs within the first
 * `length` bytes of `str`, copying stops there and the remainder of the
 * result is filled with NUL bytes. The returned string always has size
 * `length`.
 *
 * @param str    Source bytes.
 * @param length Number of bytes in the result.
 * @return A string of exactly `length` bytes.
 */
static std::string FromSubstr(const char* str, size_t length) {
  std::string newStr(length, '\0');
  if (length > 0) {
    // Write through the mutable buffer; the pointer returned by c_str()
    // must not be used to modify the string.
    std::strncpy(&newStr[0], str, length);
  }
  return newStr;
}
163
/**
 * Checks that a NUL-terminated string holds at least `byteLength` bytes
 * before its terminator, reading no further than necessary.
 *
 * @param str        NUL-terminated string.
 * @param byteLength Minimum number of bytes required.
 * @return false if a NUL byte is found among the first `byteLength` bytes,
 *         true otherwise (always true when `byteLength` is 0).
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t offset = 0; offset < byteLength; ++offset) {
    if (str[offset] == '\0') {
      return false;
    }
  }
  return true;
}
178
183 static std::string TruncateUTF8(const char* str, size_t maxByteLength) {
184 std::string wordTrunc;
185 if (NotShorterThan(str, maxByteLength)) {
186 size_t len = 0;
187 const char* pStr = str;
188 for (;;) {
189 const size_t charLength = NextCharLength(pStr);
190 if (len + charLength > maxByteLength) {
191 break;
192 }
193 pStr += charLength;
194 len += charLength;
195 }
196 wordTrunc = FromSubstr(str, len);
197 } else {
198 wordTrunc = str;
199 }
200 return wordTrunc;
201 }
202
/**
 * Replaces every non-overlapping occurrence of `from` in `str` with `to`,
 * in place, scanning left to right. Text inserted by a replacement is never
 * rescanned.
 *
 * @param str  String to modify.
 * @param from NUL-terminated pattern to search for. An empty pattern leaves
 *             `str` unchanged.
 * @param to   NUL-terminated replacement text.
 */
static void ReplaceAll(std::string& str, const char* from, const char* to) {
  const std::string::size_type fromLen = std::strlen(from);
  if (fromLen == 0) {
    // An empty pattern matches at every position; without this guard the
    // loop below would never terminate.
    return;
  }
  const std::string::size_type toLen = std::strlen(to);
  std::string::size_type pos = 0;
  while ((pos = str.find(from, pos, fromLen)) != std::string::npos) {
    str.replace(pos, fromLen, to, toLen);
    // Resume after the inserted text so it is not matched again.
    pos += toLen;
  }
}
215
/**
 * Concatenates the elements of `strings`, placing `separator` between each
 * pair of neighbours.
 *
 * @param strings   Pieces to join, in order.
 * @param separator Text inserted between consecutive pieces.
 * @return The joined string; empty when `strings` is empty.
 */
static std::string Join(const std::vector<std::string>& strings,
                        const std::string& separator) {
  std::string joined;
  std::vector<std::string>::const_iterator piece = strings.begin();
  if (piece != strings.end()) {
    // The first piece takes no separator in front of it.
    joined += *piece;
    for (++piece; piece != strings.end(); ++piece) {
      joined += separator;
      joined += *piece;
    }
  }
  return joined;
}
232
/**
 * Concatenates the elements of `strings` back to back, with nothing between
 * them.
 *
 * @param strings Pieces to join, in order.
 * @return The concatenation; empty when `strings` is empty.
 */
static std::string Join(const std::vector<std::string>& strings) {
  std::string joined;
  for (std::vector<std::string>::size_type i = 0; i < strings.size(); ++i) {
    joined.append(strings[i]);
  }
  return joined;
}
243
244 static void GetByteMap(const char* str, const size_t utf8Length,
245 std::vector<size_t>* byteMap) {
246 if (byteMap->size() < utf8Length) {
247 byteMap->resize(utf8Length);
248 }
249 const char* pstr = str;
250 for (size_t i = 0; i < utf8Length; i++) {
251 (*byteMap)[i] = pstr - str;
252 pstr = NextChar(pstr);
253 }
254 }
255
#if defined(_MSC_VER)
/**
 * Converts a UTF-8 string to the string type used on this platform.
 * Under MSVC this is a std::wstring produced by U8ToU16.
 */
static std::wstring GetPlatformString(const std::string& str) {
  std::wstring wide = U8ToU16(str);
  return wide;
}
#else
/**
 * Converts a UTF-8 string to the string type used on this platform.
 * Outside MSVC the input is returned as an unchanged copy.
 */
static std::string GetPlatformString(const std::string& str) {
  std::string same(str);
  return same;
}
#endif  // _MSC_VER
263
#if defined(_MSC_VER)
/**
 * Converts a wide string to UTF-8 with WideCharToMultiByte (CP_UTF8).
 *
 * The API is called twice: first with no output buffer to learn the number
 * of bytes needed, then again to fill a buffer of that size.
 *
 * @param wstr Wide string to convert.
 * @return The UTF-8 bytes, or an empty string when the first call reports a
 *         count that is not positive.
 */
static std::string U16ToU8(const std::wstring& wstr) {
  const int wideCount = static_cast<int>(wstr.length());
  const int byteCount = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(),
                                            wideCount, nullptr, 0, nullptr,
                                            nullptr);
  if (byteCount <= 0) {
    return std::string();
  }
  std::string utf8;
  utf8.resize(byteCount);
  WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wideCount, &utf8[0],
                      byteCount, nullptr, nullptr);
  return utf8;
}

/**
 * Converts a UTF-8 string to a wide string with MultiByteToWideChar
 * (CP_UTF8).
 *
 * The API is called twice: first with no output buffer to learn the number
 * of wide characters needed, then again to fill a buffer of that size.
 *
 * @param str UTF-8 string to convert.
 * @return The wide string, or an empty string when the first call reports a
 *         count that is not positive.
 */
static std::wstring U8ToU16(const std::string& str) {
  const int byteCount = static_cast<int>(str.length());
  const int wideCount =
      MultiByteToWideChar(CP_UTF8, 0, str.c_str(), byteCount, nullptr, 0);
  if (wideCount <= 0) {
    return std::wstring();
  }
  std::wstring wide;
  wide.resize(wideCount);
  MultiByteToWideChar(CP_UTF8, 0, str.c_str(), byteCount, &wide[0],
                      wideCount);
  return wide;
}
#endif  // _MSC_VER
289};
290} // namespace opencc
Definition: Exception.hpp:77
UTF8 std::string utilities.
Definition: UTF8Util.hpp:37
static bool IsLineEndingOrFileEnding(const char ch)
Returns true if the character is a line ending or end of file.
Definition: UTF8Util.hpp:150
static size_t PrevCharLength(const char *str)
Returns the length in byte for the previous UTF8 character.
Definition: UTF8Util.hpp:80
static std::string FromSubstr(const char *str, size_t length)
Copies a substring with given length to a new std::string.
Definition: UTF8Util.hpp:157
static void ReplaceAll(std::string &str, const char *from, const char *to)
Replaces all patterns in a std::string in place.
Definition: UTF8Util.hpp:206
static size_t NextCharLengthNoException(const char *str)
Returns the length in byte for the next UTF8 character.
Definition: UTF8Util.hpp:48
static bool NotShorterThan(const char *str, size_t byteLength)
Returns true if the given std::string is longer or as long as the given length.
Definition: UTF8Util.hpp:168
static std::string Join(const std::vector< std::string > &strings)
Joins a std::string vector into a std::string.
Definition: UTF8Util.hpp:236
static std::string TruncateUTF8(const char *str, size_t maxByteLength)
Truncates a std::string with a maximal length in byte.
Definition: UTF8Util.hpp:183
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 std::string.
Definition: UTF8Util.hpp:125
static const char * FindNextInline(const char *str, const char ch)
Finds a character in the same line.
Definition: UTF8Util.hpp:140
static size_t NextCharLength(const char *str)
Returns the length in byte for the next UTF8 character.
Definition: UTF8Util.hpp:69
static std::string Join(const std::vector< std::string > &strings, const std::string &separator)
Joins a std::string vector into a std::string with a separator.
Definition: UTF8Util.hpp:219
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition: UTF8Util.hpp:118
static const char * NextChar(const char *str)
Returns the char* pointer over the next UTF8 character.
Definition: UTF8Util.hpp:111