Better Unicode support in GeneralUtils (#658)

* ASCIIToUTF16: output replacement character instead of failing assert

* Add GeneralUtils::_NextUTF8Char

* Implement GeneralUtils::UTF8ToUTF16

* use string_view everywhere

* use string_view::front instead of begin

* Add PushUTF16CodePoint
This commit is contained in:
Daniel Seiler 2022-07-26 06:11:30 +02:00 committed by GitHub
parent e97ae92624
commit 9813c3ed2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 179 additions and 8 deletions

View File

@ -6,7 +6,7 @@
#include <algorithm>
template <typename T>
inline size_t MinSize(size_t size, const std::basic_string<T>& string) {
inline size_t MinSize(size_t size, const std::basic_string_view<T>& string) {
if (size == size_t(-1) || size > string.size()) {
return string.size();
} else {
@ -24,7 +24,7 @@ inline bool IsTrailSurrogate(char16_t c) {
inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
if (cp <= 0x007F) {
ret.push_back(cp);
ret.push_back(static_cast<uint8_t>(cp));
} else if (cp <= 0x07FF) {
ret.push_back(0xC0 | (cp >> 6));
ret.push_back(0x80 | (cp & 0x3F));
@ -42,16 +42,123 @@ inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
}
}
constexpr const char16_t REPLACEMENT_CHARACTER = 0xFFFD;
bool _IsSuffixChar(uint8_t c) {
return (c & 0xC0) == 0x80;
}
bool GeneralUtils::_NextUTF8Char(std::string_view& slice, uint32_t& out) {
size_t rem = slice.length();
const uint8_t* bytes = (const uint8_t*) &slice.front();
if (rem > 0) {
uint8_t first = bytes[0];
if (first < 0x80) { // 1 byte character
out = static_cast<uint32_t>(first & 0x7F);
slice.remove_prefix(1);
return true;
} else if (first < 0xC0) {
// middle byte, not valid at start, fall through
} else if (first < 0xE0) { // two byte character
if (rem > 1) {
uint8_t second = bytes[1];
if (_IsSuffixChar(second)) {
out = (static_cast<uint32_t>(first & 0x1F) << 6)
+ static_cast<uint32_t>(second & 0x3F);
slice.remove_prefix(2);
return true;
}
}
} else if (first < 0xF0) { // three byte character
if (rem > 2) {
uint8_t second = bytes[1];
uint8_t third = bytes[2];
if (_IsSuffixChar(second) && _IsSuffixChar(third)) {
out = (static_cast<uint32_t>(first & 0x0F) << 12)
+ (static_cast<uint32_t>(second & 0x3F) << 6)
+ static_cast<uint32_t>(third & 0x3F);
slice.remove_prefix(3);
return true;
}
}
} else if (first < 0xF8) { // four byte character
if (rem > 3) {
uint8_t second = bytes[1];
uint8_t third = bytes[2];
uint8_t fourth = bytes[3];
if (_IsSuffixChar(second) && _IsSuffixChar(third) && _IsSuffixChar(fourth)) {
out = (static_cast<uint32_t>(first & 0x07) << 18)
+ (static_cast<uint32_t>(second & 0x3F) << 12)
+ (static_cast<uint32_t>(third & 0x3F) << 6)
+ static_cast<uint32_t>(fourth & 0x3F);
slice.remove_prefix(4);
return true;
}
}
}
out = static_cast<uint32_t>(REPLACEMENT_CHARACTER);
slice.remove_prefix(1);
return true;
}
return false;
}
/// See <https://www.ietf.org/rfc/rfc2781.html#section-2.1>
bool PushUTF16CodePoint(std::u16string& output, uint32_t U, size_t size) {
if (output.length() >= size) return false;
if (U < 0x10000) {
// If U < 0x10000, encode U as a 16-bit unsigned integer and terminate.
output.push_back(static_cast<uint16_t>(U));
return true;
} else if (U > 0x10FFFF) {
output.push_back(REPLACEMENT_CHARACTER);
return true;
} else if (output.length() + 1 < size) {
// Let U' = U - 0x10000. Because U is less than or equal to 0x10FFFF,
// U' must be less than or equal to 0xFFFFF. That is, U' can be
// represented in 20 bits.
uint32_t Ut = U - 0x10000;
// Initialize two 16-bit unsigned integers, W1 and W2, to 0xD800 and
// 0xDC00, respectively. These integers each have 10 bits free to
// encode the character value, for a total of 20 bits.
uint16_t W1 = 0xD800;
uint16_t W2 = 0xDC00;
// Assign the 10 high-order bits of the 20-bit U' to the 10 low-order
// bits of W1 and the 10 low-order bits of U' to the 10 low-order
// bits of W2.
W1 += static_cast<uint16_t>((Ut & 0x3FC00) >> 10);
W2 += static_cast<uint16_t>((Ut & 0x3FF) >> 0);
// Terminate.
output.push_back(W1); // high surrogate
output.push_back(W2); // low surrogate
return true;
} else return false;
}
std::u16string GeneralUtils::UTF8ToUTF16(const std::string_view& string, size_t size) {
size_t newSize = MinSize(size, string);
std::u16string output;
output.reserve(newSize);
std::string_view iterator = string;
uint32_t c;
while (_NextUTF8Char(iterator, c) && PushUTF16CodePoint(output, c, size)) {}
return output;
}
//! Converts an std::string (ASCII) to UCS-2 / UTF-16
std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size) {
std::u16string GeneralUtils::ASCIIToUTF16(const std::string_view& string, size_t size) {
size_t newSize = MinSize(size, string);
std::u16string ret;
ret.reserve(newSize);
for (size_t i = 0; i < newSize; i++) {
char c = string[i];
assert(c > 0 && c <= 127);
ret.push_back(static_cast<char16_t>(c));
// Note: both 7-bit ascii characters and REPLACEMENT_CHARACTER fit in one char16_t
ret.push_back((c > 0 && c <= 127) ? static_cast<char16_t>(c) : REPLACEMENT_CHARACTER);
}
return ret;
@ -59,7 +166,7 @@ std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size
//! Converts a (potentially-ill-formed) UTF-16 string to UTF-8
//! See: <http://simonsapin.github.io/wtf-8/#decoding-ill-formed-utf-16>
std::string GeneralUtils::UTF16ToWTF8(const std::u16string& string, size_t size) {
std::string GeneralUtils::UTF16ToWTF8(const std::u16string_view& string, size_t size) {
size_t newSize = MinSize(size, string);
std::string ret;
ret.reserve(newSize);

View File

@ -26,7 +26,18 @@ namespace GeneralUtils {
\param size A size to trim the string to. Default is -1 (No trimming)
\return An UTF-16 representation of the string
*/
std::u16string ASCIIToUTF16(const std::string& string, size_t size = -1);
std::u16string ASCIIToUTF16(const std::string_view& string, size_t size = -1);
//! Converts a UTF-8 String to a UTF-16 string
/*!
\param string The string to convert
\param size A size to trim the string to. Default is -1 (No trimming)
\return An UTF-16 representation of the string
*/
std::u16string UTF8ToUTF16(const std::string_view& string, size_t size = -1);
//! Internal, do not use
bool _NextUTF8Char(std::string_view& slice, uint32_t& out);
//! Converts a UTF-16 string to a UTF-8 string
/*!
@ -34,7 +45,7 @@ namespace GeneralUtils {
\param size A size to trim the string to. Default is -1 (No trimming)
\return An UTF-8 representation of the string
*/
std::string UTF16ToWTF8(const std::u16string& string, size_t size = -1);
std::string UTF16ToWTF8(const std::u16string_view& string, size_t size = -1);
/**
* Compares two basic strings but does so ignoring case sensitivity

View File

@ -4,6 +4,7 @@ create_test_sourcelist (Tests
AMFDeserializeTests.cpp
TestNiPoint3.cpp
TestLDFFormat.cpp
TestEncoding.cpp
)
# add the executable

52
tests/TestEncoding.cpp Normal file
View File

@ -0,0 +1,52 @@
#include <stdexcept>
#include <string>
#include "GeneralUtils.h"
#include "CommonCxxTests.h"
int TestEncoding(int argc, char* *const argv) {
std::string x = "Hello World!";
std::string_view v(x);
uint32_t out;
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'H');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'e');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'l');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'l');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'o');
ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), true);
x = u8"Frühling";
v = x;
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'F');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'r');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'ü');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'h');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'l');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'i');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'n');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'g');
ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
x = "中文字";
v = x;
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'');
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'');
ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
x = "👨‍⚖️";
v = x;
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x1F468);
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x200D);
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x2696);
GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0xFE0F);
ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Hello World!"), u"Hello World!");
ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Frühling"), u"Frühling");
ASSERT_EQ(GeneralUtils::UTF8ToUTF16("中文字"), u"中文字");
ASSERT_EQ(GeneralUtils::UTF8ToUTF16("👨‍⚖️"), u"👨‍⚖️");
return 0;
}