mirror of
https://github.com/DarkflameUniverse/DarkflameServer.git
synced 2024-11-25 15:07:28 +00:00
Better Unicode support in GeneralUtils (#658)
* ASCIIToUTF16: output replacement character instead of failing assert * Add GeneralUtils::_NextUTF8Char * Implement GeneralUtils::UTF8ToUTF16 * use string_view everywhere * use string_view::front instead of begin * Add PushUTF16CodePoint
This commit is contained in:
parent
e97ae92624
commit
9813c3ed2c
@ -6,7 +6,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline size_t MinSize(size_t size, const std::basic_string<T>& string) {
|
inline size_t MinSize(size_t size, const std::basic_string_view<T>& string) {
|
||||||
if (size == size_t(-1) || size > string.size()) {
|
if (size == size_t(-1) || size > string.size()) {
|
||||||
return string.size();
|
return string.size();
|
||||||
} else {
|
} else {
|
||||||
@ -24,7 +24,7 @@ inline bool IsTrailSurrogate(char16_t c) {
|
|||||||
|
|
||||||
inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
|
inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
|
||||||
if (cp <= 0x007F) {
|
if (cp <= 0x007F) {
|
||||||
ret.push_back(cp);
|
ret.push_back(static_cast<uint8_t>(cp));
|
||||||
} else if (cp <= 0x07FF) {
|
} else if (cp <= 0x07FF) {
|
||||||
ret.push_back(0xC0 | (cp >> 6));
|
ret.push_back(0xC0 | (cp >> 6));
|
||||||
ret.push_back(0x80 | (cp & 0x3F));
|
ret.push_back(0x80 | (cp & 0x3F));
|
||||||
@ -42,16 +42,123 @@ inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! U+FFFD, substituted wherever a character cannot be decoded or represented
constexpr char16_t REPLACEMENT_CHARACTER = 0xFFFD;
|
||||||
|
|
||||||
|
//! Tells whether a byte is a UTF-8 continuation byte (bit pattern 10xxxxxx)
/*!
  \param c The byte to inspect
  \return true if the two most significant bits of c are 1 and 0
 */
bool _IsSuffixChar(uint8_t c) {
	const unsigned topTwoBits = static_cast<unsigned>(c) >> 6;
	return topTwoBits == 0x2u;
}
|
||||||
|
|
||||||
|
bool GeneralUtils::_NextUTF8Char(std::string_view& slice, uint32_t& out) {
|
||||||
|
size_t rem = slice.length();
|
||||||
|
const uint8_t* bytes = (const uint8_t*) &slice.front();
|
||||||
|
if (rem > 0) {
|
||||||
|
uint8_t first = bytes[0];
|
||||||
|
if (first < 0x80) { // 1 byte character
|
||||||
|
out = static_cast<uint32_t>(first & 0x7F);
|
||||||
|
slice.remove_prefix(1);
|
||||||
|
return true;
|
||||||
|
} else if (first < 0xC0) {
|
||||||
|
// middle byte, not valid at start, fall through
|
||||||
|
} else if (first < 0xE0) { // two byte character
|
||||||
|
if (rem > 1) {
|
||||||
|
uint8_t second = bytes[1];
|
||||||
|
if (_IsSuffixChar(second)) {
|
||||||
|
out = (static_cast<uint32_t>(first & 0x1F) << 6)
|
||||||
|
+ static_cast<uint32_t>(second & 0x3F);
|
||||||
|
slice.remove_prefix(2);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (first < 0xF0) { // three byte character
|
||||||
|
if (rem > 2) {
|
||||||
|
uint8_t second = bytes[1];
|
||||||
|
uint8_t third = bytes[2];
|
||||||
|
if (_IsSuffixChar(second) && _IsSuffixChar(third)) {
|
||||||
|
out = (static_cast<uint32_t>(first & 0x0F) << 12)
|
||||||
|
+ (static_cast<uint32_t>(second & 0x3F) << 6)
|
||||||
|
+ static_cast<uint32_t>(third & 0x3F);
|
||||||
|
slice.remove_prefix(3);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (first < 0xF8) { // four byte character
|
||||||
|
if (rem > 3) {
|
||||||
|
uint8_t second = bytes[1];
|
||||||
|
uint8_t third = bytes[2];
|
||||||
|
uint8_t fourth = bytes[3];
|
||||||
|
if (_IsSuffixChar(second) && _IsSuffixChar(third) && _IsSuffixChar(fourth)) {
|
||||||
|
out = (static_cast<uint32_t>(first & 0x07) << 18)
|
||||||
|
+ (static_cast<uint32_t>(second & 0x3F) << 12)
|
||||||
|
+ (static_cast<uint32_t>(third & 0x3F) << 6)
|
||||||
|
+ static_cast<uint32_t>(fourth & 0x3F);
|
||||||
|
slice.remove_prefix(4);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = static_cast<uint32_t>(REPLACEMENT_CHARACTER);
|
||||||
|
slice.remove_prefix(1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// See <https://www.ietf.org/rfc/rfc2781.html#section-2.1>
|
||||||
|
bool PushUTF16CodePoint(std::u16string& output, uint32_t U, size_t size) {
|
||||||
|
if (output.length() >= size) return false;
|
||||||
|
if (U < 0x10000) {
|
||||||
|
// If U < 0x10000, encode U as a 16-bit unsigned integer and terminate.
|
||||||
|
output.push_back(static_cast<uint16_t>(U));
|
||||||
|
return true;
|
||||||
|
} else if (U > 0x10FFFF) {
|
||||||
|
output.push_back(REPLACEMENT_CHARACTER);
|
||||||
|
return true;
|
||||||
|
} else if (output.length() + 1 < size) {
|
||||||
|
// Let U' = U - 0x10000. Because U is less than or equal to 0x10FFFF,
|
||||||
|
// U' must be less than or equal to 0xFFFFF. That is, U' can be
|
||||||
|
// represented in 20 bits.
|
||||||
|
uint32_t Ut = U - 0x10000;
|
||||||
|
|
||||||
|
// Initialize two 16-bit unsigned integers, W1 and W2, to 0xD800 and
|
||||||
|
// 0xDC00, respectively. These integers each have 10 bits free to
|
||||||
|
// encode the character value, for a total of 20 bits.
|
||||||
|
uint16_t W1 = 0xD800;
|
||||||
|
uint16_t W2 = 0xDC00;
|
||||||
|
|
||||||
|
// Assign the 10 high-order bits of the 20-bit U' to the 10 low-order
|
||||||
|
// bits of W1 and the 10 low-order bits of U' to the 10 low-order
|
||||||
|
// bits of W2.
|
||||||
|
W1 += static_cast<uint16_t>((Ut & 0x3FC00) >> 10);
|
||||||
|
W2 += static_cast<uint16_t>((Ut & 0x3FF) >> 0);
|
||||||
|
|
||||||
|
// Terminate.
|
||||||
|
output.push_back(W1); // high surrogate
|
||||||
|
output.push_back(W2); // low surrogate
|
||||||
|
return true;
|
||||||
|
} else return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::u16string GeneralUtils::UTF8ToUTF16(const std::string_view& string, size_t size) {
|
||||||
|
size_t newSize = MinSize(size, string);
|
||||||
|
std::u16string output;
|
||||||
|
output.reserve(newSize);
|
||||||
|
std::string_view iterator = string;
|
||||||
|
|
||||||
|
uint32_t c;
|
||||||
|
while (_NextUTF8Char(iterator, c) && PushUTF16CodePoint(output, c, size)) {}
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
//! Converts an std::string (ASCII) to UCS-2 / UTF-16
|
//! Converts an std::string (ASCII) to UCS-2 / UTF-16
|
||||||
std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size) {
|
std::u16string GeneralUtils::ASCIIToUTF16(const std::string_view& string, size_t size) {
|
||||||
size_t newSize = MinSize(size, string);
|
size_t newSize = MinSize(size, string);
|
||||||
std::u16string ret;
|
std::u16string ret;
|
||||||
ret.reserve(newSize);
|
ret.reserve(newSize);
|
||||||
|
|
||||||
for (size_t i = 0; i < newSize; i++) {
|
for (size_t i = 0; i < newSize; i++) {
|
||||||
char c = string[i];
|
char c = string[i];
|
||||||
assert(c > 0 && c <= 127);
|
// Note: both 7-bit ascii characters and REPLACEMENT_CHARACTER fit in one char16_t
|
||||||
ret.push_back(static_cast<char16_t>(c));
|
ret.push_back((c > 0 && c <= 127) ? static_cast<char16_t>(c) : REPLACEMENT_CHARACTER);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
@ -59,7 +166,7 @@ std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size
|
|||||||
|
|
||||||
//! Converts a (potentially-ill-formed) UTF-16 string to UTF-8
|
//! Converts a (potentially-ill-formed) UTF-16 string to UTF-8
|
||||||
//! See: <http://simonsapin.github.io/wtf-8/#decoding-ill-formed-utf-16>
|
//! See: <http://simonsapin.github.io/wtf-8/#decoding-ill-formed-utf-16>
|
||||||
std::string GeneralUtils::UTF16ToWTF8(const std::u16string& string, size_t size) {
|
std::string GeneralUtils::UTF16ToWTF8(const std::u16string_view& string, size_t size) {
|
||||||
size_t newSize = MinSize(size, string);
|
size_t newSize = MinSize(size, string);
|
||||||
std::string ret;
|
std::string ret;
|
||||||
ret.reserve(newSize);
|
ret.reserve(newSize);
|
||||||
|
@ -26,7 +26,18 @@ namespace GeneralUtils {
|
|||||||
\param size A size to trim the string to. Default is -1 (No trimming)
|
\param size A size to trim the string to. Default is -1 (No trimming)
|
||||||
\return An UTF-16 representation of the string
|
\return An UTF-16 representation of the string
|
||||||
*/
|
*/
|
||||||
std::u16string ASCIIToUTF16(const std::string& string, size_t size = -1);
|
std::u16string ASCIIToUTF16(const std::string_view& string, size_t size = -1);
|
||||||
|
|
||||||
|
//! Converts a UTF-8 String to a UTF-16 string
|
||||||
|
/*!
|
||||||
|
\param string The string to convert
|
||||||
|
\param size A size to trim the string to. Default is -1 (No trimming)
|
||||||
|
\return An UTF-16 representation of the string
|
||||||
|
*/
|
||||||
|
std::u16string UTF8ToUTF16(const std::string_view& string, size_t size = -1);
|
||||||
|
|
||||||
|
//! Internal, do not use
|
||||||
|
bool _NextUTF8Char(std::string_view& slice, uint32_t& out);
|
||||||
|
|
||||||
//! Converts a UTF-16 string to a UTF-8 string
|
//! Converts a UTF-16 string to a UTF-8 string
|
||||||
/*!
|
/*!
|
||||||
@ -34,7 +45,7 @@ namespace GeneralUtils {
|
|||||||
\param size A size to trim the string to. Default is -1 (No trimming)
|
\param size A size to trim the string to. Default is -1 (No trimming)
|
||||||
\return An UTF-8 representation of the string
|
\return An UTF-8 representation of the string
|
||||||
*/
|
*/
|
||||||
std::string UTF16ToWTF8(const std::u16string& string, size_t size = -1);
|
std::string UTF16ToWTF8(const std::u16string_view& string, size_t size = -1);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compares two basic strings but does so ignoring case sensitivity
|
* Compares two basic strings but does so ignoring case sensitivity
|
||||||
|
@ -4,6 +4,7 @@ create_test_sourcelist (Tests
|
|||||||
AMFDeserializeTests.cpp
|
AMFDeserializeTests.cpp
|
||||||
TestNiPoint3.cpp
|
TestNiPoint3.cpp
|
||||||
TestLDFFormat.cpp
|
TestLDFFormat.cpp
|
||||||
|
TestEncoding.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
# add the executable
|
# add the executable
|
||||||
|
52
tests/TestEncoding.cpp
Normal file
52
tests/TestEncoding.cpp
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "GeneralUtils.h"
|
||||||
|
#include "CommonCxxTests.h"
|
||||||
|
|
||||||
|
// Exercises GeneralUtils::_NextUTF8Char and GeneralUtils::UTF8ToUTF16 with
// 1, 2, 3 and 4 byte UTF-8 sequences.
//
// Every non-ASCII character is written as an explicit escape, so the test
// does not depend on the encoding the compiler assumes for this source file
// and invisible characters (such as the zero width joiner) cannot get lost.
int TestEncoding(int argc, char* *const argv) {
	// "Frühling": U+00FC is the two byte sequence C3 BC
	const std::string german = "Fr\xC3\xBC" "hling";
	// Three CJK characters U+4E2D U+6587 U+5B57, three bytes each
	const std::string chinese = "\xE4\xB8\xAD" "\xE6\x96\x87" "\xE5\xAD\x97";
	// Emoji sequence U+1F468 U+200D U+2696 U+FE0F (one four byte and three three byte sequences)
	const std::string emoji = "\xF0\x9F\x91\xA8" "\xE2\x80\x8D" "\xE2\x9A\x96" "\xEF\xB8\x8F";

	std::string x = "Hello World!";
	std::string_view v(x);

	uint32_t out;
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'H');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'e');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'l');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'l');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'o');
	// More input remains (" World!"), so the decoder must still report success
	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), true);

	x = german;
	v = x;
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'F');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'r');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x00FC);
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'h');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'l');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'i');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'n');
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'g');
	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);

	x = chinese;
	v = x;
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x4E2D);
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x6587);
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x5B57);
	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);

	x = emoji;
	v = x;
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x1F468);
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x200D);
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x2696);
	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0xFE0F);
	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);

	ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Hello World!"), u"Hello World!");
	ASSERT_EQ(GeneralUtils::UTF8ToUTF16(german), u"Fr\u00FChling");
	ASSERT_EQ(GeneralUtils::UTF8ToUTF16(chinese), u"\u4E2D\u6587\u5B57");
	ASSERT_EQ(GeneralUtils::UTF8ToUTF16(emoji), u"\U0001F468\u200D\u2696\uFE0F");

	return 0;
}
|
Loading…
Reference in New Issue
Block a user