Better Unicode support in GeneralUtils (#658)

* ASCIIToUTF16: output replacement character instead of failing assert * Add GeneralUtils::_NextUTF8Char * Implement GeneralUtils::UTF8ToUTF16 * use string_view everywhere * use string_view::front instead of begin * Add PushUTF16CodePoint
2026-02-08 11:59:52 +00:00 · 2022-07-26 06:11:30 +02:00
parent e97ae92624
commit 9813c3ed2c
4 changed files with 179 additions and 8 deletions
--- a/dCommon/GeneralUtils.cpp
+++ b/dCommon/GeneralUtils.cpp
@@ -6,7 +6,7 @@
 #include <algorithm>

 template <typename T>
-inline size_t MinSize(size_t size, const std::basic_string<T>& string) {
+inline size_t MinSize(size_t size, const std::basic_string_view<T>& string) {
    if (size == size_t(-1) || size > string.size()) {
        return string.size();
    } else {
@@ -24,7 +24,7 @@ inline bool IsTrailSurrogate(char16_t c) {

 inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
    if (cp <= 0x007F) {
-        ret.push_back(cp);
+        ret.push_back(static_cast<uint8_t>(cp));
    } else if (cp <= 0x07FF) {
        ret.push_back(0xC0 | (cp >> 6));
        ret.push_back(0x80 | (cp & 0x3F));
@@ -42,16 +42,123 @@ inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
    }
 }

+/// U+FFFD REPLACEMENT CHARACTER: written to the output in place of input
+/// that cannot be represented. It fits in a single UTF-16 code unit.
+constexpr char16_t REPLACEMENT_CHARACTER = u'\uFFFD';
+
+/// Tells whether `c` is a UTF-8 continuation byte, i.e. a byte whose two
+/// most significant bits are `10` (0x80..0xBF).
+bool _IsSuffixChar(uint8_t c) {
+    const unsigned topTwoBits = c >> 6;
+    return topTwoBits == 0b10;
+}
+
+/// Decodes one code point from the front of a UTF-8 byte sequence.
+///
+/// When the leading bytes form a structurally valid 1-4 byte sequence, those
+/// bytes are removed from `slice` and the decoded value is stored in `out`.
+/// Otherwise (stray continuation byte, truncated sequence, wrong
+/// continuation byte, or a lead byte >= 0xF8) exactly one byte is removed
+/// and `out` is set to U+FFFD, so decoding can resume at the next byte.
+///
+/// Only the byte structure is checked: overlong encodings, surrogate values
+/// and values above 0x10FFFF are handed back as decoded.
+///
+/// \param slice Remaining input; advanced past the consumed bytes
+/// \param out Receives the decoded code point, or U+FFFD for invalid input
+/// \return false if `slice` was empty (nothing is consumed and `out` is left
+///         untouched), true otherwise
+bool GeneralUtils::_NextUTF8Char(std::string_view& slice, uint32_t& out) {
+    const size_t rem = slice.length();
+    // Bail out before looking at any bytes: an empty view has no first
+    // element, so it must not be dereferenced.
+    if (rem == 0) return false;
+
+    const auto* bytes = reinterpret_cast<const uint8_t*>(slice.data());
+    const uint8_t first = bytes[0];
+
+    if (first < 0x80) { // one byte character: 0xxxxxxx
+        out = static_cast<uint32_t>(first);
+        slice.remove_prefix(1);
+        return true;
+    } else if (first < 0xC0) {
+        // continuation byte, not valid at the start of a character;
+        // handled by the replacement path below
+    } else if (first < 0xE0) { // two byte character: 110xxxxx 10xxxxxx
+        if (rem > 1) {
+            const uint8_t second = bytes[1];
+            if (_IsSuffixChar(second)) {
+                out = (static_cast<uint32_t>(first & 0x1F) << 6)
+                    + static_cast<uint32_t>(second & 0x3F);
+                slice.remove_prefix(2);
+                return true;
+            }
+        }
+    } else if (first < 0xF0) { // three byte character: 1110xxxx + 2 continuation bytes
+        if (rem > 2) {
+            const uint8_t second = bytes[1];
+            const uint8_t third = bytes[2];
+            if (_IsSuffixChar(second) && _IsSuffixChar(third)) {
+                out = (static_cast<uint32_t>(first & 0x0F) << 12)
+                    + (static_cast<uint32_t>(second & 0x3F) << 6)
+                    + static_cast<uint32_t>(third & 0x3F);
+                slice.remove_prefix(3);
+                return true;
+            }
+        }
+    } else if (first < 0xF8) { // four byte character: 11110xxx + 3 continuation bytes
+        if (rem > 3) {
+            const uint8_t second = bytes[1];
+            const uint8_t third = bytes[2];
+            const uint8_t fourth = bytes[3];
+            if (_IsSuffixChar(second) && _IsSuffixChar(third) && _IsSuffixChar(fourth)) {
+                out = (static_cast<uint32_t>(first & 0x07) << 18)
+                    + (static_cast<uint32_t>(second & 0x3F) << 12)
+                    + (static_cast<uint32_t>(third & 0x3F) << 6)
+                    + static_cast<uint32_t>(fourth & 0x3F);
+                slice.remove_prefix(4);
+                return true;
+            }
+        }
+    }
+
+    // Invalid or truncated sequence: skip a single byte and report U+FFFD.
+    out = static_cast<uint32_t>(REPLACEMENT_CHARACTER);
+    slice.remove_prefix(1);
+    return true;
+}
+
+/// Appends the UTF-16 encoding of `U` to `output` without letting `output`
+/// grow beyond `size` code units.
+///
+/// See <https://www.ietf.org/rfc/rfc2781.html#section-2.1>
+///
+/// \param output The string to append to
+/// \param U The value to encode. Values below 0x10000 are written as a single
+///          code unit as-is; values above 0x10FFFF are written as U+FFFD
+/// \param size The maximum length of `output`, in UTF-16 code units
+/// \return true if something was appended; false if `output` is already full
+///         or a surrogate pair would not fit (nothing is appended then)
+bool PushUTF16CodePoint(std::u16string& output, uint32_t U, size_t size) {
+    if (output.length() >= size) return false;
+
+    if (U < 0x10000) {
+        // If U < 0x10000, encode U as a 16-bit unsigned integer and terminate.
+        output.push_back(static_cast<char16_t>(U));
+        return true;
+    }
+
+    if (U > 0x10FFFF) {
+        // Not encodable in UTF-16 at all.
+        output.push_back(REPLACEMENT_CHARACTER);
+        return true;
+    }
+
+    // A surrogate pair takes two code units; never emit half of one.
+    if (output.length() + 1 >= size) return false;
+
+    // Let U' = U - 0x10000. Because U is less than or equal to 0x10FFFF,
+    // U' must be less than or equal to 0xFFFFF. That is, U' can be
+    // represented in 20 bits.
+    const uint32_t Ut = U - 0x10000;
+
+    // W1 = 0xD800 plus the 10 high-order bits of U'. U' has at most 20 bits,
+    // so shifting right by 10 leaves exactly those bits (0..0x3FF).
+    const auto W1 = static_cast<char16_t>(0xD800 + (Ut >> 10));
+    // W2 = 0xDC00 plus the 10 low-order bits of U'.
+    const auto W2 = static_cast<char16_t>(0xDC00 + (Ut & 0x3FF));
+
+    output.push_back(W1); // high surrogate
+    output.push_back(W2); // low surrogate
+    return true;
+}
+
+//! Converts a UTF-8 string to UTF-16, stopping once `size` code units were written
+std::u16string GeneralUtils::UTF8ToUTF16(const std::string_view& string, size_t size) {
+    std::u16string result;
+    result.reserve(MinSize(size, string));
+
+    // Working copy of the input that shrinks as characters are decoded.
+    std::string_view remaining = string;
+    uint32_t codePoint;
+    for (;;) {
+        // Stop when the input is exhausted ...
+        if (!_NextUTF8Char(remaining, codePoint)) break;
+        // ... or when the next code point no longer fits within `size`.
+        if (!PushUTF16CodePoint(result, codePoint, size)) break;
+    }
+
+    return result;
+}
+
 //! Converts an std::string (ASCII) to UCS-2 / UTF-16
-std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size) {
+std::u16string GeneralUtils::ASCIIToUTF16(const std::string_view& string, size_t size) {
    size_t newSize = MinSize(size, string);
    std::u16string ret;
    ret.reserve(newSize);

    for (size_t i = 0; i < newSize; i++) {
        char c = string[i];
-        assert(c > 0 && c <= 127);
-        ret.push_back(static_cast<char16_t>(c));
+        // Note: both 7-bit ascii characters and REPLACEMENT_CHARACTER fit in one char16_t
+        ret.push_back((c > 0 && c <= 127) ? static_cast<char16_t>(c) : REPLACEMENT_CHARACTER);
    }

    return ret;
@@ -59,7 +166,7 @@ std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size

 //! Converts a (potentially-ill-formed) UTF-16 string to UTF-8
 //! See: <http://simonsapin.github.io/wtf-8/#decoding-ill-formed-utf-16>
-std::string GeneralUtils::UTF16ToWTF8(const std::u16string& string, size_t size) {
+std::string GeneralUtils::UTF16ToWTF8(const std::u16string_view& string, size_t size) {
    size_t newSize = MinSize(size, string);
    std::string ret;
    ret.reserve(newSize);
--- a/dCommon/GeneralUtils.h
+++ b/dCommon/GeneralUtils.h
@@ -26,7 +26,18 @@ namespace GeneralUtils {
      \param size A size to trim the string to. Default is -1 (No trimming)
      \return An UTF-16 representation of the string
     */
-    std::u16string ASCIIToUTF16(const std::string& string, size_t size = -1);
+    std::u16string ASCIIToUTF16(const std::string_view& string, size_t size = -1);
+
+    //! Converts a UTF-8 string to a UTF-16 string
+    /*!
+      Input bytes that do not form a valid UTF-8 sequence are replaced with
+      U+FFFD. A character that needs a surrogate pair is dropped, and the
+      conversion stops, if only one code unit of space is left.
+      \param string The UTF-8 encoded string to convert
+      \param size The maximum length of the result in UTF-16 code units. Default is -1 (No trimming)
+      \return An UTF-16 representation of the string
+     */
+    std::u16string UTF8ToUTF16(const std::string_view& string, size_t size = -1);
+
+    //! Internal, do not use
+    /*!
+      Decodes one code point from the front of a UTF-8 slice and advances the
+      slice past the bytes it consumed. Invalid input consumes a single byte
+      and yields U+FFFD.
+      \param slice The remaining UTF-8 input; shortened on every successful call
+      \param out Receives the decoded code point
+      \return false if the slice was empty, true otherwise
+     */
+    bool _NextUTF8Char(std::string_view& slice, uint32_t& out);

    //! Converts a UTF-16 string to a UTF-8 string
    /*!
@@ -34,7 +45,7 @@ namespace GeneralUtils {
      \param size A size to trim the string to. Default is -1 (No trimming)
      \return An UTF-8 representation of the string
     */
-    std::string UTF16ToWTF8(const std::u16string& string, size_t size = -1);
+    std::string UTF16ToWTF8(const std::u16string_view& string, size_t size = -1);

    /**
     * Compares two basic strings but does so ignoring case sensitivity
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ create_test_sourcelist (Tests
 	AMFDeserializeTests.cpp
 	TestNiPoint3.cpp
 	TestLDFFormat.cpp
+	TestEncoding.cpp
 )

 # add the executable
--- a/tests/TestEncoding.cpp
+++ b/tests/TestEncoding.cpp
@@ -0,0 +1,52 @@
+#include <stdexcept>
+#include <string>
+
+#include "GeneralUtils.h"
+#include "CommonCxxTests.h"
+
+// Exercises GeneralUtils::_NextUTF8Char and GeneralUtils::UTF8ToUTF16 with
+// characters of every UTF-8 encoded length (1 to 4 bytes).
+// argc/argv are not used. Returns 0 when the end of the test is reached.
+int TestEncoding(int argc, char* *const argv) {
+    // Plain ASCII: every character is a single byte.
+    std::string x = "Hello World!";
+    std::string_view v(x);
+
+    uint32_t out;
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'H');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'e');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'l');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'l');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'o');
+    // Only "Hello" has been read; " World!" is still pending, so another
+    // character is available.
+    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), true);
+
+    // 'ü' (U+00FC) is encoded as two bytes.
+    x = u8"Frühling";
+    v = x;
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'F');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'r');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'ü');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'h');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'l');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'i');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'n');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'g');
+    // The whole string has been consumed.
+    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
+
+    // Three-byte characters.
+    x = "中文字";
+    v = x;
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'中');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'文');
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'字');
+    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
+
+    // An emoji sequence made of four code points: a four-byte character
+    // (U+1F468) followed by three three-byte characters.
+    x = "👨‍⚖️";
+    v = x;
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x1F468);
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x200D);
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x2696);
+    GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0xFE0F);
+    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
+
+    // Whole-string conversion; the last case needs a surrogate pair in UTF-16.
+    ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Hello World!"), u"Hello World!");
+    ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Frühling"), u"Frühling");
+    ASSERT_EQ(GeneralUtils::UTF8ToUTF16("中文字"), u"中文字");
+    ASSERT_EQ(GeneralUtils::UTF8ToUTF16("👨‍⚖️"), u"👨‍⚖️");
+
+    return 0;
+}