Conversation
src/odbc/StringConverter.cpp
Outdated
| enum class endian | ||
| { | ||
| #ifdef BYTE_ORDER | ||
| little = LITTLE_ENDIAN, | ||
| big = BIG_ENDIAN, | ||
| native = BYTE_ORDER | ||
| #elif defined(_M_IX86) || defined(_M_AMD64) | ||
| little = 0, | ||
| big = 1, | ||
| native = little | ||
| #else | ||
| #error "Cannot determine endianness" | ||
| #endif | ||
| }; |
There was a problem hiding this comment.
I don't think we need any endianness information.
| public: | ||
| StringConverter() = delete; | ||
|
|
||
| static std::u16string utf8ToUtf16(const char* src, std::size_t srcLength); |
There was a problem hiding this comment.
We should also add an overload just taking a const char* for null-terminated strings.
| static std::pair<int, char32_t> utf8ToCodePoint( | ||
| const char* curr, | ||
| const char* end); | ||
|
|
||
| static std::size_t utf8ToUtf16Length(const char* src, | ||
| std::size_t srcLength); | ||
| }; |
There was a problem hiding this comment.
These methods can be moved to the anonymous namespace in the .cpp.
There was a problem hiding this comment.
This is not possible as the Exception class can only be used by its friends.
src/odbc/StringConverter.cpp
Outdated
| u16string StringConverter::utf8ToUtf16(const char* src, size_t srcLength) | ||
| { | ||
| if (src == nullptr) | ||
| ODBC_FAIL("Input string cannot be nullptr."); |
There was a problem hiding this comment.
"must not" instead of "cannot".
src/odbc/internal/charset/Utf16.h
Outdated
| /** | ||
| * Checks if a 16-bit code unit is a high surrogate (starting a surrogate pair). | ||
| * | ||
| * @param c The 16-bit code unit to check. | ||
| * @return True if the code unit is a high surrogate, false otherwise. | ||
| */ | ||
| inline bool isHighSurrogate(char16_t c) | ||
| { | ||
| return (c >= 0xD800 && c <= 0xDBFF); | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| /** | ||
| * Checks if a 16-bit code unit is a low surrogate (ending a surrogate pair). | ||
| * | ||
| * @param c The 16-bit code unit to check. | ||
| * @return True if the code unit is a high surrogate, false otherwise. | ||
| */ | ||
| inline bool isLowSurrogate(char16_t c) | ||
| { | ||
| return (c >= 0xDC00 && c <= 0xDFFF); | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| /** | ||
| * Checks if the 16-bit code unit is either a high or low surrogate. | ||
| * | ||
| * @param c The 16-bit code unit to check | ||
| * @return True if the code unit is a low or high surrogate, false otherwise. | ||
| */ | ||
| inline bool isSurrogatePart(char16_t c) | ||
| { | ||
| return (c >= 0xD800 && c <= 0xDFFF); | ||
| } |
There was a problem hiding this comment.
Only required for decoding. Can be removed.
src/odbc/internal/charset/Utf16.h
Outdated
| /** | ||
| * Encodes a 16-bit code unit as little endian. | ||
| * | ||
| * @param c The 16-bit code unit to encode. | ||
| * @param target The target buffer. buffer[0] and buffer[1] must writable. | ||
| */ | ||
| inline void encodeSingleLE(char16_t c, char* target) | ||
| { | ||
| target[0] = (char)(c & 0xFF); | ||
| target[1] = (char)(c >> 8); | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| /** | ||
| * Encodes a 16-bit code unit as big endian. | ||
| * | ||
| * @param c The 16-bit code unit to encode. | ||
| * @param target The target buffer. buffer[0] and buffer[1] must writable. | ||
| */ | ||
| inline void encodeSingleBE(char16_t c, char* target) | ||
| { | ||
| target[0] = (char)(c >> 8); | ||
| target[1] = (char)(c & 0xFF); | ||
| } |
There was a problem hiding this comment.
Not required if we work on character level only.
src/odbc/internal/charset/Utf16.h
Outdated
| /** | ||
| * Encodes a code point as UTF-16 little endian. | ||
| * | ||
| * This method automatically encodes as single 16-bit code point or a surrogate | ||
| * pair depending on the code point. Therefore the first 4 bytes of the target | ||
| * buffer must be accessible. | ||
| * | ||
| * @param c The code point to encode. | ||
| * @param target The target buffer. The first four bytes must be accessible. | ||
| * @return The number of bytes written to the target buffer. | ||
| */ | ||
| inline int encodeLE(char32_t c, char* target) | ||
| { | ||
| ODBC_ASSERT( | ||
| isRepresentable(c), "Codepoint " << (uint32_t)c << " is invalid"); | ||
| if (!needsSurrogatePair(c)) | ||
| { | ||
| encodeSingleLE((char16_t)c, target); | ||
| return 2; | ||
| } | ||
| else | ||
| { | ||
| std::pair<char16_t, char16_t> sp = encodeSurrogatePair(c); | ||
| encodeSingleLE(sp.first, target); | ||
| encodeSingleLE(sp.second, target + 2); | ||
| return 4; | ||
| } | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| /** | ||
| * Encodes a code point as UTF-16 big endian. | ||
| * | ||
| * This method automatically encodes as single 16-bit code point or a surrogate | ||
| * pair depending on the code point. Therefore the first 4 bytes of the target | ||
| * buffer must be accessible. | ||
| * | ||
| * @param c The code point to encode. | ||
| * @param target The target buffer. The first four bytes must be accessible. | ||
| * @return The number of bytes written to the target buffer. | ||
| */ | ||
| inline int encodeBE(char32_t c, char* target) | ||
| { | ||
| ODBC_ASSERT( | ||
| isRepresentable(c), "Codepoint " << (uint32_t)c << " is invalid"); | ||
| if (!needsSurrogatePair(c)) | ||
| { | ||
| encodeSingleBE((char16_t)c, target); | ||
| return 2; | ||
| } | ||
| else | ||
| { | ||
| std::pair<char16_t, char16_t> sp = encodeSurrogatePair(c); | ||
| encodeSingleBE(sp.first, target); | ||
| encodeSingleBE(sp.second, target + 2); | ||
| return 4; | ||
| } | ||
| } |
There was a problem hiding this comment.
Not required if we work only character level only.
test/StringConverterTest.cpp
Outdated
| { | ||
| const char* src; | ||
| size_t srcLength; | ||
| const char* dst; |
There was a problem hiding this comment.
We should use const char16_t* here. dst is kind of a misleading name. It's the expected outcome.
test/StringConverterTest.cpp
Outdated
| { | ||
| "\x48", | ||
| 0, | ||
| "", |
There was a problem hiding this comment.
I'd use UTF-16 string literals here, e.g. u"Oststraße".
src/odbc/StringConverter.h
Outdated
| StringConverter() = delete; | ||
|
|
||
| /** | ||
| * Converts a null-terminated UTF-8 string to a UTF16 string. |
src/odbc/internal/Macros.h
Outdated
| #define ODBC_TERMINATE(msg) std::terminate() | ||
| //------------------------------------------------------------------------------ | ||
| #define ODBC_TERMINATE_CHECK(cond, expr) \ | ||
| do \ | ||
| { \ | ||
| if (!(cond)) \ | ||
| { \ | ||
| ODBC_TERMINATE(expr << "; Condition '" << #cond << "' failed."); \ | ||
| } \ | ||
| } while (false) | ||
| //------------------------------------------------------------------------------ | ||
| #define ODBC_TERMINATE_CHECK_0(cond) \ | ||
| do \ | ||
| { \ | ||
| if (!(cond)) \ | ||
| { \ | ||
| ODBC_TERMINATE("Condition '" << #cond << "' failed."); \ | ||
| } \ | ||
| } while (false) | ||
| //------------------------------------------------------------------------------ | ||
| // Asserts are executed in debug mode only | ||
| #ifdef ODBC_DBG | ||
| #define ODBC_ASSERT(cond, expr) ODBC_TERMINATE_CHECK(cond, expr) | ||
| #define ODBC_ASSERT_0(cond) ODBC_TERMINATE_CHECK_0(cond) | ||
| #else | ||
| #define ODBC_ASSERT(cond, expr) | ||
| #define ODBC_ASSERT_0(cond) | ||
| #endif | ||
| //------------------------------------------------------------------------------ | ||
| #if defined(__GNUC__) || defined(__clang__) | ||
| #define ODBC_BUILTIN_UNREACHABLE __builtin_unreachable() | ||
| #elif defined(_MSC_VER) | ||
| #define ODBC_BUILTIN_UNREACHABLE __assume(0) | ||
| #endif | ||
| //------------------------------------------------------------------------------ | ||
| // Use these macros at code locations that should be unreachable. | ||
| #define ODBC_TERMINATE_CHECK_UNREACHABLE \ | ||
| ODBC_TERMINATE("Reached unreachable code location") | ||
| //------------------------------------------------------------------------------ | ||
| // Use this macro at code locations that are unreachable. | ||
| #ifdef ODBC_DBG | ||
| #define ODBC_ASSERT_UNREACHABLE ODBC_TERMINATE_CHECK_UNREACHABLE | ||
| #else | ||
| #define ODBC_ASSERT_UNREACHABLE ODBC_BUILTIN_UNREACHABLE | ||
| #endif | ||
| //------------------------------------------------------------------------------ |
There was a problem hiding this comment.
I would not add all that stuff because certain functionality is platform dependent. At other locations, we just use assert , which is good enough from my point of view. The macros don't do anything with message anyway.
src/odbc/StringConverter.cpp
Outdated
| u16string str(dstLength, 0); | ||
|
|
||
| const char* curr = begin; | ||
| size_t i = 0; BE9D |
There was a problem hiding this comment.
To get rid of i, we could just call reserve on str (instead of creating it with a given size) and then use push_back. That way we also wouldn't do the unnecessary initialization with 0 anymore.
src/odbc/StringConverter.cpp
Outdated
| curr += cp.first; | ||
|
|
||
| ODBC_CHECK(utf16::isRepresentable(cp.second), | ||
| "Codepoint " << (uint32_t)cp.second << " is invalid"); |
There was a problem hiding this comment.
Well, it's valid, but cannot be represented. Maybe something like: The UTF-8 string contains codepoint U+xxxxxx, which cannot represented in UTF-16. The codepoint should also be encoded in hexadecimal, because that's the convention.
src/odbc/StringConverter.cpp
Outdated
| 4D23 | str[i] = static_cast<char16_t>(sp.first); | |
| str[i + 1] = static_cast<char16_t>(sp.second); |
There was a problem hiding this comment.
static_casts are not needed here.
src/odbc/StringConverter.cpp
Outdated
| // We have to make sure that the sequence does not contain a terminating | ||
| // zero and the following byte-sequence is valid. |
There was a problem hiding this comment.
The comment about the terminating zero does not apply here.
test/StringConverterTest.cpp
Outdated
| u16string actual = p.srcLength >= 0 ? | ||
| StringConverter::utf8ToUtf16(p.src, p.srcLength) : | ||
| StringConverter::utf8ToUtf16(p.src); | ||
| ASSERT_EQ(p.expected.length(), actual.length()); |
There was a problem hiding this comment.
I think ASSERT_EQ(actual, p.expected) should work, because there is an comparison operator for u16string and const char16_t*.
test/StringConverterTest.cpp
Outdated
| { | ||
| "\x73\x74\xE5\x5D\x0D\x0A", | ||
| 6, | ||
| "The string contains an incomplete byte-sequence at position 2." |
There was a problem hiding this comment.
We should either use "byte sequence" or "byte-sequence". Currently, we have both.
src/odbc/StringConverter.cpp
Outdated
| ODBC_CHECK(utf16::isRepresentable(cp.second), | ||
| "The UTF-8 string contains codepoint U+" << | ||
| std::hex << (uint32_t)cp.second << | ||
| ", which cannot be represented in UTF-16."); |
There was a problem hiding this comment.
An assert should suffice here because we checked this already in utf8ToUtf16Length().
src/odbc/StringConverter.cpp
Outdated
| ODBC_FAIL("The string contains an incomplete byte sequence at " | ||
| "position " << (curr - begin) << "."); |
There was a problem hiding this comment.
Probably we need to distinguish the "incomplete sequence" from the "invalid sequence" case.
No description provided.