ipaddress 1.1.0
Loading...
Searching...
No Matches
unicode.hpp
Go to the documentation of this file.
1/**
2 * @file unicode.hpp
3 * @brief Unicode character processing
4 * @author Vladimir Shaleev
5 * @copyright MIT License
6 *
7 * This file contains definitions and templates for working with various
8 * Unicode encodings such as UTF-8, UTF-16, UTF-32, and Wide chars.
9 * It provides functionality to convert Unicode characters to char and
10 * to handle errors associated with invalid Unicode characters.
11 */
12
13#ifndef IPADDRESS_UNICODE_HPP
14#define IPADDRESS_UNICODE_HPP
15
16#include "errors.hpp"
17
18namespace IPADDRESS_NAMESPACE {
19
20namespace internal {
21
22template <typename T>
23struct char_reader;
24
25template <typename T>
26struct char_or_throw {
27 IPADDRESS_NODISCARD_WHEN_NO_EXCEPTIONS static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next(const T*& it, const T* begin, const T* end) IPADDRESS_NOEXCEPT_WHEN_NO_EXCEPTIONS {
28 uint32_t error_symbol = 0;
29 auto code = error_code::no_error;
30 const auto result = char_reader<T>::next_or_error(it, end, code, error_symbol);
31 if (code != error_code::no_error) {
32 raise_error(code, error_symbol, begin, end - begin);
33 }
34 return result;
35 }
36
37 static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE void has_throw() IPADDRESS_NOEXCEPT_WHEN_NO_EXCEPTIONS
38#ifdef IPADDRESS_MODULE
39 { }
40#else
41 ;
42#endif
43};
44
/**
 * Result of classifying the first code unit of an encoded character:
 * the code point bits gathered so far and the length of the sequence.
 */
struct symbol {
    uint32_t value;  /**< code point bits decoded so far */
    uint32_t length; /**< number of code units in the sequence */
};
49
50template <typename T>
51struct utf8_reader {
52 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char utf8_next_or_error(const T*& it, const T* end, error_code& error, uint32_t& error_symbol) {
54 error_symbol = 0;
55 auto correct = true;
56 auto symbol = utf8_code_point(uint8_t(*it), correct);
57 if (correct) {
58 switch (symbol.length) {
59 case 1:
60 break;
61
62 case 2:
63 symbol.value = (symbol.value << 6) | trailing_utf8_code_point(it, end, correct);
64 break;
65
66 case 3:
67 symbol.value = (symbol.value << 6) | trailing_utf8_code_point(it, end, correct);
68 symbol.value = (symbol.value << 6) | trailing_utf8_code_point(it, end, correct);
69 break;
70
71 case 4:
72 symbol.value = (symbol.value << 6) | trailing_utf8_code_point(it, end, correct);
73 symbol.value = (symbol.value << 6) | trailing_utf8_code_point(it, end, correct);
74 symbol.value = (symbol.value << 6) | trailing_utf8_code_point(it, end, correct);
75 break;
76
77 default:
78 correct = false;
79 break;
80 }
81 }
82 ++it;
83 if (!correct) {
85 return '\0';
86 } else if (symbol.value > 127) {
88 error_symbol = symbol.value;
89 return '\0';
90 }
91 return char(symbol.value);
92 }
93
94 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE symbol utf8_code_point(uint8_t byte, bool& correct) IPADDRESS_NOEXCEPT {
95 if ((byte & 0b10000000) == 0b00000000) {
96 return {static_cast<uint32_t>(byte), 1};
97 }
98 if ((byte & 0b11100000) == 0b11000000) {
99 return {static_cast<uint32_t>(byte & 0b00011111), 2};
100 }
101 if ((byte & 0b11110000) == 0b11100000) {
102 return {static_cast<uint32_t>(byte & 0b00001111), 3};
103 }
104 if ((byte & 0b11111000) == 0b11110000) {
105 return {static_cast<uint32_t>(byte & 0b00000111), 4};
106 }
107 correct = false;
108 return {0, 0};
109 }
110
111 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE uint32_t trailing_utf8_code_point(const T*& it, const T* end, bool& correct) IPADDRESS_NOEXCEPT {
112 if (++it >= end) {
113 correct = false;
114 return 0;
115 }
116 if ((uint8_t(*it) & 0b11000000) == 0b10000000) {
117 correct = true;
118 return uint8_t(*it) & 0b00111111;
119 }
120 correct = false;
121 return 0;
122 }
123};
124
125template <typename T>
126struct utf16_reader {
127 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char utf16_next_or_error(const T*& it, const T* end, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
128 error = error_code::no_error;
129 error_symbol = 0;
130 bool correct = true;
131 auto symbol = utf16_code_point(uint16_t(*it));
132 switch (symbol.length) {
133 case 1:
134 symbol.value = uint32_t(*it);
135 break;
136
137 case 2:
138 if (++it >= end) {
139 correct = false;
140 break;
141 }
142 if ((*it & 0b1111110000000000) == 0b1101110000000000) {
143 symbol.value = ((symbol.value << 10) | (uint16_t(*it) & 0b0000001111111111)) + 0x10000;
144 } else {
145 correct = false;
146 }
147 break;
148
149 default:
150 correct = false;
151 break;
152 }
153 ++it;
154 if (!correct) {
156 return '\0';
157 } else if (symbol.value > 127) {
159 error_symbol = symbol.value;
160 return '\0';
161 }
162 return char(symbol.value);
163 }
164
165 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE symbol utf16_code_point(uint16_t value) IPADDRESS_NOEXCEPT {
166 if ((value & 0b1111110000000000) == 0b1101100000000000) {
167 return {static_cast<uint32_t>(value & 0b0000001111111111), 2};
168 } else {
169 return {value, 1};
170 }
171 }
172};
173
174template <typename T>
175struct utf32_reader {
176 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char utf32_next_or_error(const T*& it, const T* /*end*/, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
177 error = error_code::no_error;
178 error_symbol = 0;
179 if (uint32_t(*it) > 127) {
181 error_symbol = uint32_t(*it++); // NOLINT
182 return '\0';
183 }
184 return char(*it++);
185 }
186};
187
188template <>
189struct char_reader<char>
190#ifdef IPADDRESS_CHAR_IS_UTF8
191: utf8_reader<char>, char_or_throw<char>
192#endif
193{
194#ifdef IPADDRESS_CHAR_IS_UTF8
195
196 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next_or_error(const char*& it, const char* end, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
197 return utf8_next_or_error(it, end, error, error_symbol);
198 }
199
200#else // !IPADDRESS_CHAR_IS_UTF8
201
202 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next(const char*& it, const char* /*begin*/, const char* /*end*/) IPADDRESS_NOEXCEPT {
203 return *it++;
204 }
205
206 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next_or_error(const char*& it, const char* /*end*/, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
207 error = error_code::no_error;
208 error_symbol = 0;
209 return *it++;
210 }
211
212 static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE void has_throw() IPADDRESS_NOEXCEPT
213#ifdef IPADDRESS_MODULE
214 { }
215#else
216 ;
217#endif
218
219#endif // !IPADDRESS_CHAR_IS_UTF8
220};
221
222#if __cpp_char8_t >= 201811L
223
224template <>
225struct char_reader<char8_t> : utf8_reader<char8_t>, char_or_throw<char8_t> {
226 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next_or_error(const char8_t*& it, const char8_t* end, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
227 return utf8_next_or_error(it, end, error, error_symbol);
228 }
229};
230
231#endif // __cpp_char8_t
232
233template <>
234struct char_reader<char16_t> : utf16_reader<char16_t>, char_or_throw<char16_t> {
235 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next_or_error(const char16_t*& it, const char16_t* end, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
236 return utf16_next_or_error(it, end, error, error_symbol);
237 }
238};
239
240template <>
241struct char_reader<char32_t> : utf32_reader<char32_t>, char_or_throw<char32_t> {
242 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next_or_error(const char32_t*& it, const char32_t* end, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
243 return utf32_next_or_error(it, end, error, error_symbol);
244 }
245};
246
247template <>
248struct char_reader<wchar_t> : utf16_reader<wchar_t>, utf32_reader<wchar_t>, char_or_throw<wchar_t> {
249 IPADDRESS_NODISCARD static IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next_or_error(const wchar_t*& it, const wchar_t* end, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
250 error = error_code::no_error;
251 error_symbol = 0;
252 #ifdef _WIN32
253 if (sizeof(wchar_t) == sizeof(char16_t)) {
254 return utf16_next_or_error(it, end, error, error_symbol);
255 }
256 #else
257 if (sizeof(wchar_t) == sizeof(char32_t)) {
258 return utf32_next_or_error(it, end, error, error_symbol);
259 }
260 #endif
261 if (*it > 127) {
263 error_symbol = uint32_t(*it++); // NOLINT
264 return '\0';
265 }
266 return char(*it++);
267 }
268};
269
270template <typename T>
271IPADDRESS_NODISCARD_WHEN_NO_EXCEPTIONS IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next_char(const T*& it, const T* begin, const T* end) IPADDRESS_NOEXCEPT(noexcept(internal::char_reader<T>::next(it, begin, end))) {
272 return internal::char_reader<T>::next(it, begin, end);
273}
274
275template <typename T>
276IPADDRESS_NODISCARD IPADDRESS_CONSTEXPR IPADDRESS_FORCE_INLINE char next_char_or_error(const T*& it, const T* end, error_code& error, uint32_t& error_symbol) IPADDRESS_NOEXCEPT {
277 return internal::char_reader<T>::next_or_error(it, end, error, error_symbol);
278}
279
280template <typename T>
281struct string_converter {
282 IPADDRESS_NODISCARD static IPADDRESS_FORCE_INLINE std::basic_string<T, std::char_traits<T>, std::allocator<T>> convert(const std::string& str) {
283 return std::basic_string<T, std::char_traits<T>, std::allocator<T>>(str.cbegin(), str.cend());
284 }
285};
286
287template <>
288struct string_converter<char> {
289 IPADDRESS_NODISCARD static IPADDRESS_FORCE_INLINE const std::string& convert(const std::string& str) IPADDRESS_NOEXCEPT {
290 return str;
291 }
292};
293
294IPADDRESS_FORCE_INLINE void print_symbol_code(std::ostringstream& out, uint32_t symbol) {
295 out << "{U+" << std::setw(4) << std::setfill('0') << std::hex << symbol << '}';
296}
297
298} // IPADDRESS_NAMESPACE::internal
299
300IPADDRESS_FORCE_INLINE std::ostringstream& error::print(std::ostringstream& out, const symbol& arg) {
301 internal::print_symbol_code(out, arg.value);
302 return out;
303}
304
305template <typename T, size_t N>
306IPADDRESS_FORCE_INLINE std::ostringstream& error::print(std::ostringstream& out, const T (&str)[N]) {
307 auto code = error_code::no_error;
308 uint32_t error_symbol = 0;
309 const T* it = str;
310 const T* end = str + N;
311 while (it < end) {
312 const auto result = internal::next_char_or_error(it, end, code, error_symbol);
313 if (code == error_code::no_error) {
314 if (result == '\0') {
315 break;
316 }
317 out << result;
318 } else {
319 if (error_symbol == 0) {
320 break;
321 }
322 internal::print_symbol_code(out, error_symbol);
323 }
324 }
325 return out;
326}
327
328} // namespace IPADDRESS_NAMESPACE
329
330#endif // IPADDRESS_UNICODE_HPP
The primary exception class used by the IP address library.
Definition errors.hpp:114
#define IPADDRESS_NOEXCEPT_WHEN_NO_EXCEPTIONS
Definition config.hpp:93
#define IPADDRESS_NODISCARD
Definition config.hpp:98
#define IPADDRESS_FORCE_INLINE
Definition config.hpp:112
#define IPADDRESS_NAMESPACE
Definition config.hpp:38
#define IPADDRESS_NOEXCEPT
Definition config.hpp:89
#define IPADDRESS_NODISCARD_WHEN_NO_EXCEPTIONS
Definition config.hpp:102
error_code
Enumeration of error codes for IP address parsing and validation.
Definition errors.hpp:52
@ unexpected_symbol
The input string contains an unexpected character.
@ wrong_encoding_sequence
Incorrect byte sequence in Unicode encoding.
@ no_error
Indicates the absence of any errors.