SAP · stefanuhrig · Aug 5, 2022 · Jul 27, 2022 · Jul 29, 2022 · Aug 4, 2022
diff --git a/src/odbc/CMakeLists.txt b/src/odbc/CMakeLists.txt
@@ -17,6 +17,7 @@ SET(public_headers
     ResultSetMetaDataUnicode.h
     Statement.h
     StatementBase.h
+    StringConverter.h
     Types.h
     Util.h
 )
@@ -38,6 +39,7 @@ SET(odbccpp_sources
     ResultSetMetaDataUnicode.cpp
     Statement.cpp
     StatementBase.cpp
+    StringConverter.cpp
     Types.cpp
     Util.cpp
     internal/Batch.cpp

diff --git a/src/odbc/Exception.h b/src/odbc/Exception.h
@@ -31,6 +31,7 @@ class ODBC_EXPORT Exception : public std::exception
     friend class ResultSetMetaDataUnicode;
     friend class Statement;
     friend class StatementBase;
+    friend class StringConverter;
     friend class time;
     friend class timestamp;
     friend class ValueBuffer;

diff --git a/src/odbc/Forwards.h b/src/odbc/Forwards.h
@@ -20,6 +20,7 @@ class ResultSetMetaDataBase;
 class ResultSetMetaDataUnicode;
 class StatementBase;
 class Statement;
+class StringConverter;
 class ValueBuffer;
 //------------------------------------------------------------------------------
 typedef Reference<Connection> ConnectionRef;

diff --git a/src/odbc/StringConverter.cpp b/src/odbc/StringConverter.cpp
@@ -0,0 +1,120 @@
+#include <odbc/Exception.h>
+#include <odbc/StringConverter.h>
+#include <odbc/internal/Macros.h>
+#include <odbc/internal/charset/Utf16.h>
+#include <odbc/internal/charset/Utf8.h>
+#include <cassert>
+#include <cstring>
+#include <sstream>
+//------------------------------------------------------------------------------
+using namespace std;
+//------------------------------------------------------------------------------
+NS_ODBC_START
+//------------------------------------------------------------------------------
+// StringConverter class
+//------------------------------------------------------------------------------
+// Converts a null-terminated UTF-8 string. The work is done by the range-based
+// overload, which interprets a null end pointer as "determine the end via
+// strlen".
+u16string StringConverter::utf8ToUtf16(const char* src)
+{
+    ODBC_CHECK(src != nullptr, "Input string must not be nullptr.");
+    const char* const unknownEnd = nullptr;
+    return utf8ToUtf16(src, unknownEnd);
+}
+//------------------------------------------------------------------------------
+// Converts the srcLength bytes starting at src by handing the byte range
+// [src, src + srcLength) to the range-based overload.
+u16string StringConverter::utf8ToUtf16(const char* src, size_t srcLength)
+{
+    ODBC_CHECK(src != nullptr, "Input string must not be nullptr.");
+    const char* const srcEnd = src + srcLength;
+    return utf8ToUtf16(src, srcEnd);
+}
+//------------------------------------------------------------------------------
+// Converts the UTF-8 byte range [begin, end) to UTF-16. A null end pointer
+// means that the input is null-terminated; its end is then found via strlen.
+u16string StringConverter::utf8ToUtf16(const char* begin, const char* end)
+{
+    assert(begin != nullptr);
+
+    if (end == nullptr)
+        end = begin + strlen(begin);
+
+    // First pass: checks the input and yields the exact number of UTF-16 code
+    // units, so the result buffer is reserved up front.
+    const size_t unitCount = utf8ToUtf16Length(begin, end);
+    u16string result;
+    result.reserve(unitCount);
+
+    // Second pass: decode every sequence and append its UTF-16 encoding.
+    for (const char* pos = begin; pos < end;)
+    {
+        const pair<int, char32_t> decoded = utf8ToCodePoint(begin, pos, end);
+        pos += decoded.first;
+
+        const char32_t codePoint = decoded.second;
+        assert(utf16::isRepresentable(codePoint));
+
+        if (!utf16::needsSurrogatePair(codePoint))
+        {
+            result.push_back(codePoint);
+            continue;
+        }
+
+        const pair<char16_t, char16_t> surrogates =
+            utf16::encodeSurrogatePair(codePoint);
+        result.push_back(surrogates.first);
+        result.push_back(surrogates.second);
+    }
+
+    return result;
+}
+//------------------------------------------------------------------------------
+/**
+ * Decodes the UTF-8 sequence that starts at curr.
+ *
+ * The call fails through ODBC_FAIL if the first byte cannot start a sequence,
+ * if the sequence would extend beyond end, or if the sequence is rejected by
+ * utf8::isValidSequence.
+ *
+ * @param begin  Start of the whole string. Only used to compute the byte
+ *               position reported in error messages.
+ * @param curr   Start of the sequence to decode. Must be in [begin, end).
+ * @param end    Pointer one past the last byte of the string.
+ * @return       A pair containing the sequence length in bytes as first and
+ *               the decoded code point as second.
+ */
+pair<int, char32_t> StringConverter::utf8ToCodePoint(
+    const char* begin, const char* curr, const char* end)
+{
+    assert(begin != nullptr && end != nullptr);
+    assert(begin <= curr && curr < end);
+
+    int len = utf8::getSequenceLength(*curr);
+    if (len == 1)
+    {
+        // Short-cut for the easy and common case.
+        return pair<int, char32_t>(len, *curr);
+    }
+    if (len == -1)
+    {
+        ODBC_FAIL("The string contains an invalid UTF-8 byte sequence at "
+                  "position " << (curr - begin) << ".");
+    }
+
+    // We have to make sure that we don't exceed the end of the string. The
+    // check compares against the number of remaining bytes instead of forming
+    // curr + len, because a pointer beyond one-past-the-end of the buffer must
+    // not even be computed.
+    if (len > (end - curr))
+    {
+        ODBC_FAIL("The string contains an incomplete UTF-8 byte sequence at "
+                  "position " << (curr - begin) << ".");
+    }
+
+    if (!utf8::isValidSequence(len, curr))
+    {
+        ODBC_FAIL("The string contains an invalid UTF-8 byte sequence at "
+                  "position " << (curr - begin) << ".");
+    }
+
+    return pair<int, char32_t>(len, utf8::decode(len, curr));
+}
+//------------------------------------------------------------------------------
+// Computes the number of UTF-16 code units needed to hold the UTF-8 byte range
+// [begin, end). Every sequence is decoded along the way, so malformed input
+// and code points that UTF-16 cannot represent are reported from here.
+size_t StringConverter::utf8ToUtf16Length(const char* begin, const char* end)
+{
+    assert(begin != nullptr && end != nullptr);
+
+    size_t units = 0;
+
+    for (const char* pos = begin; pos < end;)
+    {
+        const pair<int, char32_t> decoded = utf8ToCodePoint(begin, pos, end);
+        pos += decoded.first;
+
+        ODBC_CHECK(utf16::isRepresentable(decoded.second),
+                   "The UTF-8 string contains codepoint U+" <<
+                   std::hex << (uint32_t)decoded.second <<
+                   ", which cannot be represented in UTF-16.");
+
+        // Supplementary-plane code points take two code units, all others one.
+        if (utf16::needsSurrogatePair(decoded.second))
+            units += 2;
+        else
+            units += 1;
+    }
+
+    return units;
+}
+//------------------------------------------------------------------------------
+NS_ODBC_END
diff --git a/src/odbc/StringConverter.h b/src/odbc/StringConverter.h
@@ -0,0 +1,46 @@
+#ifndef ODBC_STRING_CONVERTER_H_INCLUDED
+#define ODBC_STRING_CONVERTER_H_INCLUDED
+//------------------------------------------------------------------------------
+#include <odbc/Config.h>
+#include <cstddef>
+#include <utility>
+#include <string>
+//------------------------------------------------------------------------------
+NS_ODBC_START
+//------------------------------------------------------------------------------
+/**
+ * Static helper functions for converting UTF-8 strings to UTF-16.
+ *
+ * The class only consists of static functions and cannot be instantiated.
+ */
+class ODBC_EXPORT StringConverter
+{
+public:
+    StringConverter() = delete;
+
+    /**
+     * Converts a null-terminated UTF-8 string to a UTF-16 string.
+     *
+     * The conversion fails if src is a null pointer, if the input contains an
+     * invalid or incomplete UTF-8 byte sequence, or if it contains a code
+     * point that cannot be represented in UTF-16.
+     * NOTE(review): failures are raised through the ODBC_CHECK/ODBC_FAIL
+     * macros, presumably as an odbc::Exception - confirm in
+     * odbc/internal/Macros.h.
+     *
+     * @param src  The null-terminated UTF-8 string to be converted.
+     * @return     The resulting UTF-16 string.
+     */
+    static std::u16string utf8ToUtf16(const char* src);
+
+    /**
+     * Converts a UTF-8 string to a UTF-16 string.
+     *
+     * The input does not have to be null-terminated; exactly srcLength bytes
+     * are converted. The conversion fails under the same conditions as the
+     * overload for null-terminated strings.
+     *
+     * @param src        The UTF-8 string to be converted.
+     * @param srcLength  The length of the input string in bytes.
+     * @return           The resulting UTF-16 string.
+     */
+    static std::u16string utf8ToUtf16(const char* src, std::size_t srcLength);
+
+private:
+    // Converts the byte range [begin, end). A null end pointer means that the
+    // input is null-terminated.
+    static std::u16string utf8ToUtf16(const char* begin, const char* end);
+
+    // Decodes the sequence starting at curr and returns its length in bytes
+    // (first) together with the decoded code point (second). begin is used to
+    // compute the position reported in error messages.
+    static std::pair<int, char32_t> utf8ToCodePoint(
+        const char* begin,
+        const char* curr,
+        const char* end);
+
+    // Returns the number of UTF-16 code units required for the byte range
+    // [begin, end).
+    static std::size_t utf8ToUtf16Length(const char* begin, const char* end);
+};
+//------------------------------------------------------------------------------
+NS_ODBC_END
+//------------------------------------------------------------------------------
+#endif
diff --git a/src/odbc/internal/charset/Utf16.h b/src/odbc/internal/charset/Utf16.h
@@ -0,0 +1,54 @@
+#ifndef ODBC_INTERNAL_CHARSET_UTF16_H_INCLUDED
+#define ODBC_INTERNAL_CHARSET_UTF16_H_INCLUDED
+//------------------------------------------------------------------------------
+#include <cassert>
+#include <utility>
+#include <odbc/Config.h>
+//------------------------------------------------------------------------------
+NS_ODBC_START
+//------------------------------------------------------------------------------
+namespace utf16 {
+//------------------------------------------------------------------------------
+/**
+ * Tells whether a code point can be encoded in UTF-16. This holds for every
+ * value up to U+10FFFF that is not part of the surrogate range U+D800 to
+ * U+DFFF.
+ *
+ * @param c  The code point to check.
+ * @return   True if the code point is representable in UTF-16, false otherwise.
+ */
+inline bool isRepresentable(char32_t c)
+{
+    if (c > 0x10FFFF)
+        return false;
+    const bool isSurrogate = (c >= 0xD800) && (c <= 0xDFFF);
+    return !isSurrogate;
+}
+//------------------------------------------------------------------------------
+/**
+ * Tells whether a code point has to be encoded as a surrogate pair in UTF-16.
+ *
+ * @param c  The code point to check.
+ * @return   True if a surrogate pair is needed, false otherwise.
+ */
+inline bool needsSurrogatePair(char32_t c)
+{
+    // Only code points up to U+FFFF fit into a single 16-bit code unit.
+    return c > 0xFFFF;
+}
+//------------------------------------------------------------------------------
+/**
+ * Builds the UTF-16 surrogate pair for a code point from the supplementary
+ * planes.
+ *
+ * @param c  Character from the supplementary planes, i.e. from U+10000 to
+ *           U+10FFFF.
+ * @return   A pair containing the high surrogate as first and the low surrogate
+ *           as second.
+ */
+inline std::pair<char16_t, char16_t> encodeSurrogatePair(char32_t c)
+{
+    assert((c >= 0x10000) && (c <= 0x10FFFF));
+    // The offset into the supplementary planes has 20 bits: the upper 10 bits
+    // go into the high surrogate, the lower 10 bits into the low surrogate.
+    const char32_t offset = c - 0x10000;
+    const char16_t high = static_cast<char16_t>(0xD800 | (offset >> 10));
+    const char16_t low = static_cast<char16_t>(0xDC00 | (offset & 0x3FF));
+    return std::pair<char16_t, char16_t>(high, low);
+}
+//------------------------------------------------------------------------------
+} // namespace utf16
+NS_ODBC_END
+//------------------------------------------------------------------------------
+#endif
diff --git a/src/odbc/internal/charset/Utf8.h b/src/odbc/internal/charset/Utf8.h
@@ -0,0 +1,130 @@
+#ifndef ODBC_INTERNAL_CHARSET_UTF8_H_INCLUDED
+#define ODBC_INTERNAL_CHARSET_UTF8_H_INCLUDED
+//------------------------------------------------------------------------------
+#include <cassert>
+#include <odbc/Config.h>
+#include <odbc/internal/Macros.h>
+//------------------------------------------------------------------------------
+NS_ODBC_START
+//------------------------------------------------------------------------------
+namespace utf8 {
+//------------------------------------------------------------------------------
+/**
+ * Derives the length of a UTF-8 sequence from its first byte.
+ *
+ * @param c  First byte of a UTF-8 sequence.
+ * @return   The length of the sequence or -1 if the passed byte cannot be the
+ *           first byte of a sequence.
+ */
+inline int getSequenceLength(char c)
+{
+    // Look at the unsigned byte value so that the leading bits can be
+    // compared directly.
+    const unsigned char lead = static_cast<unsigned char>(c);
+    if (lead < 0x80)
+        return 1; // 0xxxxxxx
+    if ((lead >> 5) == 0x06)
+        return 2; // 110xxxxx
+    if ((lead >> 4) == 0x0E)
+        return 3; // 1110xxxx
+    if ((lead >> 3) == 0x1E)
+        return 4; // 11110xxx
+    return -1;
+}
+//------------------------------------------------------------------------------
+/**
+ * Checks if an UTF-8 sequence is valid.
+ *
+ * The first byte must match the given length and all following bytes must be
+ * continuation bytes (10xxxxxx). Overlong encodings, i.e. sequences that
+ * encode a code point with more bytes than necessary, are rejected as well.
+ * The function does not reject surrogates or values above U+10FFFF.
+ *
+ * @param len  Length of the sequence. Must be in [1,4].
+ * @param c    Pointer to the sequence start. The function reads up to len
+ *             bytes starting at this position.
+ * @return     Returns true if the sequence is a valid UTF-8 sequence of the
+ *             given length, false otherwise.
+ */
+inline bool isValidSequence(int len, const char* c)
+{
+    switch (len)
+    {
+    case 1:
+        return ((c[0] & 0x80) == 0x00);
+    case 2:
+        // The first bytes 0xC0 and 0xC1 could only encode values below U+0080,
+        // which must be encoded with a single byte.
+        return ((c[0] & 0xE0) == 0xC0) && ((c[0] & 0x1E) != 0x00)
+               && ((c[1] & 0xC0) == 0x80);
+    case 3:
+        // If the payload of the first byte is zero, bit 5 of the second byte
+        // must be set. Otherwise the value is below U+0800 and overlong.
+        return ((c[0] & 0xF0) == 0xE0) && ((c[1] & 0xC0) == 0x80)
+               && ((c[2] & 0xC0) == 0x80)
+               && (((c[0] & 0x0F) != 0x00) || ((c[1] & 0x20) != 0x00));
+    case 4:
+        // If the payload of the first byte is zero, bit 4 or 5 of the second
+        // byte must be set. Otherwise the value is below U+10000 and overlong.
+        return ((c[0] & 0xF8) == 0xF0) && ((c[1] & 0xC0) == 0x80)
+               && ((c[2] & 0xC0) == 0x80) && ((c[3] & 0xC0) == 0x80)
+               && (((c[0] & 0x07) != 0x00) || ((c[1] & 0x30) != 0x00));
+    }
+    assert(false);
+    // Reached only if the length precondition is violated and assertions are
+    // disabled; the function must still return a value in that case.
+    return false;
+}
+//------------------------------------------------------------------------------
+/**
+ * Decodes a two-byte UTF-8 sequence.
+ *
+ * @param c  Pointer to the sequence start.
+ * @return   Returns the decoded character.
+ */
+inline char32_t decode2(const char* c)
+{
+    assert(isValidSequence(2, c));
+    // 110xxxxx 10yyyyyy -> xxxxx yyyyyy
+    const char32_t lead = static_cast<unsigned char>(c[0]) & 0x1F;
+    const char32_t tail = static_cast<unsigned char>(c[1]) & 0x3F;
+    return (lead << 6) | tail;
+}
+//------------------------------------------------------------------------------
+/**
+ * Decodes a three-byte UTF-8 sequence.
+ *
+ * @param c  Pointer to the sequence start.
+ * @return   Returns the decoded character.
+ */
+inline char32_t decode3(const char* c)
+{
+    assert(isValidSequence(3, c));
+    // 1110xxxx 10yyyyyy 10zzzzzz -> xxxx yyyyyy zzzzzz
+    const char32_t lead = static_cast<unsigned char>(c[0]) & 0x0F;
+    const char32_t mid = static_cast<unsigned char>(c[1]) & 0x3F;
+    const char32_t tail = static_cast<unsigned char>(c[2]) & 0x3F;
+    return (lead << 12) | (mid << 6) | tail;
+}
+//------------------------------------------------------------------------------
+/**
+ * Decodes a four-byte UTF-8 sequence.
+ *
+ * @param c  Pointer to the sequence start.
+ * @return   Returns the decoded character.
+ */
+inline char32_t decode4(const char* c)
+{
+    assert(isValidSequence(4, c));
+    // 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> www xxxxxx yyyyyy zzzzzz
+    const char32_t lead = static_cast<unsigned char>(c[0]) & 0x07;
+    const char32_t upper = static_cast<unsigned char>(c[1]) & 0x3F;
+    const char32_t lower = static_cast<unsigned char>(c[2]) & 0x3F;
+    const char32_t tail = static_cast<unsigned char>(c[3]) & 0x3F;
+    return (lead << 18) | (upper << 12) | (lower << 6) | tail;
+}
+//------------------------------------------------------------------------------
+/**
+ * Decodes a UTF-8 sequence of the given length, which must be in [1, 4].
+ *
+ * A sequence of length 1 is returned as is; longer sequences are forwarded to
+ * decode2, decode3 or decode4.
+ *
+ * @param len  The length of the sequence. Must be in [1, 4].
+ * @param c    Pointer to the sequence start.
+ * @return     Returns the decoded character. If len is outside of [1, 4] and
+ *             assertions are disabled, U+FFFD is returned.
+ */
+inline char32_t decode(int len, const char* c)
+{
+    switch (len)
+    {
+    case 1:
+        return *c;
+    case 2:
+        return decode2(c);
+    case 3:
+        return decode3(c);
+    case 4:
+        return decode4(c);
+    }
+    assert(false);
+    // Reached only if the length precondition is violated and assertions are
+    // disabled. Flowing off the end of a non-void function is undefined
+    // behavior, so hand back the replacement character instead.
+    return 0xFFFD;
+}
+//------------------------------------------------------------------------------
+} // namespace utf8
+NS_ODBC_END
+//------------------------------------------------------------------------------
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -3,10 +3,13 @@ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/src)
 
 SET(OdbcCppTest_sources
     EnvironmentTest.cpp
+    StringConverterTest.cpp
     TestMain.cpp
     TypesTest.cpp
     internal/ParameterDataTest.cpp
     internal/UtilInternalTest.cpp
+    internal/charset/Utf16Test.cpp
+    internal/charset/Utf8Test.cpp
 )
 
 ADD_EXECUTABLE(