-
Notifications
You must be signed in to change notification settings - Fork 28
Implement StringConverter class #38
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
2955dd5
Implement StringConverter class
mrylov cdfc27b
Implement StringConverter class
mrylov 779a9b6
10BC0
Implement StringConverter class (patchset 2)
mrylov a289480
Implement StringConverter class (patchset 3)
mrylov 1b283ba
Implement StringConverter class (patchset 4)
mrylov File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| #include <odbc/Exception.h> | ||
| #include <odbc/StringConverter.h> | ||
| #include <odbc/internal/Macros.h> | ||
| #include <odbc/internal/charset/Utf16.h> | ||
| #include <odbc/internal/charset/Utf8.h> | ||
| #include <cassert> | ||
| #include <cstring> | ||
| #include <sstream> | ||
| //------------------------------------------------------------------------------ | ||
| using namespace std; | ||
| //------------------------------------------------------------------------------ | ||
| NS_ODBC_START | ||
| //------------------------------------------------------------------------------ | ||
| // StringConverter class | ||
| //------------------------------------------------------------------------------ | ||
| u16string StringConverter::utf8ToUtf16(const char* src) | ||
| { | ||
| ODBC_CHECK(src != nullptr, "Input string must not be nullptr."); | ||
| return utf8ToUtf16(src, nullptr); | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| u16string StringConverter::utf8ToUtf16(const char* src, size_t srcLength) | ||
| { | ||
| ODBC_CHECK(src != nullptr, "Input string must not be nullptr."); | ||
| return utf8ToUtf16(src, src + srcLength); | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| u16string StringConverter::utf8ToUtf16(const char* begin, const char* end) | ||
| { | ||
| assert(begin != nullptr); | ||
|
|
||
| if (end == nullptr) | ||
| end = begin + strlen(begin); | ||
|
|
||
| size_t len = utf8ToUtf16Length(begin, end); | ||
| u16string str; | ||
| str.reserve(len); | ||
|
|
||
| const char* curr = begin; | ||
|
|
||
| while (curr < end) | ||
| { | ||
| pair<int, char32_t> cp = utf8ToCodePoint(begin, curr, end); | ||
| curr += cp.first; | ||
|
|
||
| assert(utf16::isRepresentable(cp.second)); | ||
|
|
||
| if (utf16::needsSurrogatePair(cp.second)) | ||
| { | ||
| pair<char16_t, char16_t> sp = utf16::encodeSurrogatePair(cp.second); | ||
| str.push_back(sp.first); | ||
| str.push_back(sp.second); | ||
| } | ||
| else | ||
| { | ||
| str.push_back(cp.second); | ||
| } | ||
| } | ||
|
|
||
| return str; | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| pair<int, char32_t> StringConverter::utf8ToCodePoint( | ||
| const char* begin, const char* curr, const char* end) | ||
| { | ||
| assert(begin != nullptr && end != nullptr); | ||
| assert(begin <= curr && curr < end); | ||
|
|
||
| int len = utf8::getSequenceLength(*curr); | ||
| if (len == 1) | ||
| { | ||
| // Short-cut for the easy and common case. | ||
| return pair<int, char32_t>(len, *curr); | ||
| } | ||
| if (len == -1) | ||
| { | ||
| ODBC_FAIL("The string contains an invalid UTF-8 byte sequence at " | ||
| "position " << (curr - begin) << "."); | ||
| } | ||
|
|
||
| // We have to make sure that we don't exceed the end of the string. | ||
| if ((curr + len) > end) | ||
| { | ||
| ODBC_FAIL("The string contains an incomplete UTF-8 byte sequence at " | ||
| "position " << (curr - begin) << "."); | ||
| } | ||
|
|
||
| if (!utf8::isValidSequence(len, curr)) | ||
| { | ||
| ODBC_FAIL("The string contains an invalid UTF-8 byte sequence at " | ||
| "position " << (curr - begin) << "."); | ||
| } | ||
|
|
||
| return pair<int, char32_t>(len, utf8::decode(len, curr)); | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| size_t StringConverter::utf8ToUtf16Length(const char* begin, const char* end) | ||
| { | ||
| assert(begin != nullptr && end != nullptr); | ||
|
|
||
| size_t len = 0; | ||
|
|
||
| const char* curr = begin; | ||
| while (curr < end) | ||
| { | ||
| pair<int, char32_t> cp = utf8ToCodePoint(begin, curr, end); | ||
| curr += cp.first; | ||
|
|
||
| ODBC_CHECK(utf16::isRepresentable(cp.second), | ||
| "The UTF-8 string contains codepoint U+" << | ||
| std::hex << (uint32_t)cp.second << | ||
| ", which cannot be represented in UTF-16."); | ||
|
|
||
| len += utf16::needsSurrogatePair(cp.second) ? 2 : 1; | ||
stefanuhrig marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| return len; | ||
| } | ||
| //------------------------------------------------------------------------------ | ||
| NS_ODBC_END | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| #ifndef ODBC_STRING_CONVERTER_H_INCLUDED | ||
| #define ODBC_STRING_CONVERTER_H_INCLUDED | ||
| //------------------------------------------------------------------------------ | ||
| #include <odbc/Config.h> | ||
| #include <cstddef> | ||
| #include <utility> | ||
| #include <string> | ||
| //------------------------------------------------------------------------------ | ||
| NS_ODBC_START | ||
| //------------------------------------------------------------------------------ | ||
/**
 * Static-only helper for converting UTF-8 input to UTF-16 strings.
 * The default constructor is deleted, so the class cannot be instantiated.
 */
| class ODBC_EXPORT StringConverter | ||
| { | ||
| public: | ||
| StringConverter() = delete; | ||
|
|
||
| /** | ||
| * Converts a null-terminated UTF-8 string to a UTF-16 string. | ||
| * | ||
| * @param src The null-terminated UTF-8 string to be converted. | ||
| * @return The resulting UTF-16 string. | ||
| */ | ||
| static std::u16string utf8ToUtf16(const char* src); | ||
|
|
||
| /** | ||
| * Converts a UTF-8 string to a UTF-16 string. | ||
| * | ||
| * @param src The UTF-8 string to be converted. | ||
| * @param srcLength The length of the input string. | ||
| * @return The resulting UTF-16 string. | ||
| */ | ||
| static std::u16string utf8ToUtf16(const char* src, std::size_t srcLength); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should also add an overload just taking a const char* for null-terminated strings.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done.
stefanuhrig marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| private: | ||
/**
 * Converts the UTF-8 byte range [begin, end) to UTF-16. The implementation
 * treats a null end pointer as "input is null-terminated" and determines
 * the end with strlen().
 */
| static std::u16string utf8ToUtf16(const char* begin, const char* end); | ||
|
|
||
/**
 * Decodes the UTF-8 sequence starting at curr and returns the pair
 * (sequence length in bytes, decoded code point). begin is used to compute
 * the byte position reported in error messages; end bounds the input.
 */
| static std::pair<int, char32_t> utf8ToCodePoint( | ||
| const char* begin, | ||
| const char* curr, | ||
| const char* end); | ||
|
|
||
/**
 * Returns the number of UTF-16 code units needed to represent the UTF-8
 * byte range [begin, end); two units are counted for every code point that
 * needs a surrogate pair.
 */
| static std::size_t utf8ToUtf16Length(const char* begin, const char* end); | ||
| }; | ||
| //------------------------------------------------------------------------------ | ||
| NS_ODBC_END | ||
| //------------------------------------------------------------------------------ | ||
| #endif | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| #ifndef ODBC_INTERNAL_CHARSET_UTF16_H_INCLUDED | ||
| #define ODBC_INTERNAL_CHARSET_UTF16_H_INCLUDED | ||
| //------------------------------------------------------------------------------ | ||
| #include <cassert> | ||
| #include <utility> | ||
| #include <odbc/Config.h> | ||
| //------------------------------------------------------------------------------ | ||
| NS_ODBC_START | ||
| //------------------------------------------------------------------------------ | ||
namespace utf16 {
//------------------------------------------------------------------------------
/**
 * Tells whether a code point can be encoded in UTF-16: it must not exceed
 * U+10FFFF and must not fall into the surrogate range U+D800..U+DFFF.
 *
 * @param c  The code point to check.
 * @return   True if the code point is representable in UTF-16, false
 *           otherwise.
 */
inline bool isRepresentable(char32_t c)
{
    if (c > 0x10FFFF)
        return false;
    const bool isSurrogate = (c >= 0xD800) && (c <= 0xDFFF);
    return !isSurrogate;
}
//------------------------------------------------------------------------------
/**
 * Tells whether a code point lies above the Basic Multilingual Plane and
 * therefore has to be written as a surrogate pair.
 *
 * @param c  The code point to check.
 * @return   True if a surrogate pair is needed, false otherwise.
 */
inline bool needsSurrogatePair(char32_t c)
{
    return !(c < 0x10000);
}
//------------------------------------------------------------------------------
/**
 * Splits a supplementary-plane code point into its UTF-16 surrogate pair.
 *
 * @param c  Character from the supplementary planes, i.e. from U+10000 to
 *           U+10FFFF.
 * @return   A pair holding the high surrogate as first and the low
 *           surrogate as second.
 */
inline std::pair<char16_t, char16_t> encodeSurrogatePair(char32_t c)
{
    assert((c >= 0x10000) && (c <= 0x10FFFF));

    // The 20-bit offset is split into two 10-bit halves.
    const char32_t offset = c - 0x10000;
    const char16_t high = static_cast<char16_t>(0xD800 | (offset >> 10));
    const char16_t low = static_cast<char16_t>(0xDC00 | (offset & 0x3FF));
    return std::pair<char16_t, char16_t>(high, low);
}
//------------------------------------------------------------------------------
} // namespace utf16
| NS_ODBC_END | ||
| //------------------------------------------------------------------------------ | ||
| #endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,130 @@ | ||
| #ifndef ODBC_INTERNAL_CHARSET_UTF8_H_INCLUDED | ||
| #define ODBC_INTERNAL_CHARSET_UTF8_H_INCLUDED | ||
| //------------------------------------------------------------------------------ | ||
| #include <cassert> | ||
| #include <odbc/Config.h> | ||
| #include <odbc/internal/Macros.h> | ||
| //------------------------------------------------------------------------------ | ||
| NS_ODBC_START | ||
| //------------------------------------------------------------------------------ | ||
namespace utf8 {
//------------------------------------------------------------------------------
/**
 * Determine the UTF-8 sequence length given the first byte of a UTF-8 sequence.
 *
 * Only the bit pattern of the lead byte is inspected here; whether the
 * complete sequence is acceptable is decided by isValidSequence().
 *
 * @param c  First byte of a UTF-8 sequence.
 * @return   The length of the sequence or -1 if the passed byte cannot be the
 *           first byte of a sequence.
 */
inline int getSequenceLength(char c)
{
    if ((c & 0x80) == 0x00)
        return 1;
    if ((c & 0xE0) == 0xC0)
        return 2;
    if ((c & 0xF0) == 0xE0)
        return 3;
    if ((c & 0xF8) == 0xF0)
        return 4;
    return -1;
}
//------------------------------------------------------------------------------
/**
 * Checks if an UTF-8 sequence is valid.
 *
 * Besides the lead/continuation bit patterns, overlong encodings (a code
 * point written with more bytes than necessary, e.g. C0 80 for U+0000) are
 * rejected. Surrogate code points and values above U+10FFFF are NOT
 * rejected here; in this code base they are caught by the UTF-16
 * representability check applied to the decoded code point.
 *
 * @param len  Length of the sequence. Must be in [1,4].
 * @param c    Pointer to the sequence start; len bytes must be readable.
 * @return     Returns true if the sequence is a valid UTF-8 sequence of the
 *             given length, false otherwise (also for a len outside [1,4]
 *             when assertions are disabled).
 */
inline bool isValidSequence(int len, const char* c)
{
    // Inspect the bytes as unsigned values so that the range comparisons
    // below do not depend on the signedness of plain char.
    const unsigned char* b = reinterpret_cast<const unsigned char*>(c);

    switch (len)
    {
    case 1:
        return ((b[0] & 0x80) == 0x00);
    case 2:
        // Lead bytes 0xC0/0xC1 can only encode values below U+0080.
        return ((b[0] & 0xE0) == 0xC0) && ((b[1] & 0xC0) == 0x80)
            && (b[0] >= 0xC2);
    case 3:
        // With lead byte 0xE0 the second byte must be >= 0xA0, otherwise
        // the value is below U+0800 and fits into two bytes.
        return ((b[0] & 0xF0) == 0xE0) && ((b[1] & 0xC0) == 0x80)
            && ((b[2] & 0xC0) == 0x80)
            && ((b[0] != 0xE0) || (b[1] >= 0xA0));
    case 4:
        // With lead byte 0xF0 the second byte must be >= 0x90, otherwise
        // the value is below U+10000 and fits into three bytes.
        return ((b[0] & 0xF8) == 0xF0) && ((b[1] & 0xC0) == 0x80)
            && ((b[2] & 0xC0) == 0x80) && ((b[3] & 0xC0) == 0x80)
            && ((b[0] != 0xF0) || (b[1] >= 0x90));
    }
    assert(false);
    // Reached only with assertions disabled: never flow off the end of a
    // non-void function.
    return false;
}
//------------------------------------------------------------------------------
/**
 * Decodes a UTF-8 sequence of length 2.
 *
 * @param c  Pointer to the sequence start.
 * @return   Returns the decoded character.
 */
inline char32_t decode2(const char* c)
{
    assert(isValidSequence(2, c));
    const char32_t b1 = static_cast<unsigned char>(c[0]);
    const char32_t b2 = static_cast<unsigned char>(c[1]);
    return ((b1 & 0x1F) << 6) | (b2 & 0x3F);
}
//------------------------------------------------------------------------------
/**
 * Decodes a UTF-8 sequence of length 3.
 *
 * @param c  Pointer to the sequence start.
 * @return   Returns the decoded character.
 */
inline char32_t decode3(const char* c)
{
    assert(isValidSequence(3, c));
    const char32_t b1 = static_cast<unsigned char>(c[0]);
    const char32_t b2 = static_cast<unsigned char>(c[1]);
    const char32_t b3 = static_cast<unsigned char>(c[2]);
    return ((b1 & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
}
//------------------------------------------------------------------------------
/**
 * Decodes a UTF-8 sequence of length 4.
 *
 * @param c  Pointer to the sequence start.
 * @return   Returns the decoded character.
 */
inline char32_t decode4(const char* c)
{
    assert(isValidSequence(4, c));
    const char32_t b1 = static_cast<unsigned char>(c[0]);
    const char32_t b2 = static_cast<unsigned char>(c[1]);
    const char32_t b3 = static_cast<unsigned char>(c[2]);
    const char32_t b4 = static_cast<unsigned char>(c[3]);
    return ((b1 & 0x07) << 18) | ((b2 & 0x3F) << 12) | ((b3 & 0x3F) << 6)
        | (b4 & 0x3F);
}
//------------------------------------------------------------------------------
/**
 * Decodes a UTF-8 sequence of the given length, which must be in [1, 4].
 *
 * @param len  The length of the sequence. Must be in [1, 4].
 * @param c    Pointer to the sequence start.
 * @return     Returns the decoded character. If len is outside [1, 4] and
 *             assertions are disabled, U+FFFD (REPLACEMENT CHARACTER) is
 *             returned.
 */
inline char32_t decode(int len, const char* c)
{
    switch (len)
    {
    case 1:
        // Go through unsigned char so a byte is never sign-extended.
        return static_cast<char32_t>(static_cast<unsigned char>(*c));
    case 2:
        return decode2(c);
    case 3:
        return decode3(c);
    case 4:
        return decode4(c);
    }
    assert(false);
    // Reached only with assertions disabled: never flow off the end of a
    // non-void function.
    return 0xFFFD;
}
//------------------------------------------------------------------------------
} // namespace utf8
| NS_ODBC_END | ||
| //------------------------------------------------------------------------------ | ||
| #endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.