Implement StringConverter class by mrylov · Pull Request #38 · SAP/odbc-cpp-wrapper · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/odbc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ SET(public_headers
ResultSetMetaDataUnicode.h
Statement.h
StatementBase.h
StringConverter.h
Types.h
Util.h
)
Expand All @@ -38,6 +39,7 @@ SET(odbccpp_sources
ResultSetMetaDataUnicode.cpp
Statement.cpp
StatementBase.cpp
StringConverter.cpp
Types.cpp
Util.cpp
internal/Batch.cpp
Expand Down
1 change: 1 addition & 0 deletions src/odbc/Exception.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class ODBC_EXPORT Exception : public std::exception
friend class ResultSetMetaDataUnicode;
friend class Statement;
friend class StatementBase;
friend class StringConverter;
friend class time;
friend class timestamp;
friend class ValueBuffer;
Expand Down
1 change: 1 addition & 0 deletions src/odbc/Forwards.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class ResultSetMetaDataBase;
class ResultSetMetaDataUnicode;
class StatementBase;
class Statement;
class StringConverter;
class ValueBuffer;
//------------------------------------------------------------------------------
typedef Reference<Connection> ConnectionRef;
Expand Down
120 changes: 120 additions & 0 deletions src/odbc/StringConverter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#include <odbc/Exception.h>
#include <odbc/StringConverter.h>
#include <odbc/internal/Macros.h>
#include <odbc/internal/charset/Utf16.h>
#include <odbc/internal/charset/Utf8.h>
#include <cassert>
#include <cstring>
#include <sstream>
//------------------------------------------------------------------------------
using namespace std;
//------------------------------------------------------------------------------
NS_ODBC_START
//------------------------------------------------------------------------------
// StringConverter class
//------------------------------------------------------------------------------
// Converts a null-terminated UTF-8 string to UTF-16.
// Rejects a null input via ODBC_CHECK, then delegates to the range overload.
u16string StringConverter::utf8ToUtf16(const char* src)
{
    ODBC_CHECK(src != nullptr, "Input string must not be nullptr.");

    // A null end pointer asks the range overload to locate the terminating
    // NUL itself.
    const char* const unknownEnd = nullptr;
    return utf8ToUtf16(src, unknownEnd);
}
//------------------------------------------------------------------------------
// Converts the first srcLength bytes of a UTF-8 buffer to UTF-16.
// Rejects a null input via ODBC_CHECK, then delegates to the range overload.
u16string StringConverter::utf8ToUtf16(const char* src, size_t srcLength)
{
    ODBC_CHECK(src != nullptr, "Input string must not be nullptr.");

    const char* const first = src;
    const char* const last = first + srcLength;
    return utf8ToUtf16(first, last);
}
//------------------------------------------------------------------------------
// Converts the UTF-8 byte range [begin, end) to UTF-16.
// If end is nullptr the input is treated as null-terminated and its length is
// determined with strlen.
u16string StringConverter::utf8ToUtf16(const char* begin, const char* end)
{
    assert(begin != nullptr);

    const char* const last = (end != nullptr) ? end : begin + strlen(begin);

    // The sizing pass decodes the whole input once, so any decoding or
    // representability error is reported before the result is built.
    u16string result;
    result.reserve(utf8ToUtf16Length(begin, last));

    for (const char* pos = begin; pos < last; )
    {
        const pair<int, char32_t> decoded = utf8ToCodePoint(begin, pos, last);
        pos += decoded.first;

        const char32_t codePoint = decoded.second;
        assert(utf16::isRepresentable(codePoint));

        if (!utf16::needsSurrogatePair(codePoint))
        {
            result.push_back(codePoint);
            continue;
        }

        const pair<char16_t, char16_t> surrogates =
            utf16::encodeSurrogatePair(codePoint);
        result.push_back(surrogates.first);
        result.push_back(surrogates.second);
    }

    return result;
}
//------------------------------------------------------------------------------
// Decodes the UTF-8 sequence starting at curr.
//
// begin  Start of the whole input; only used to report the byte position of
//        an error.
// curr   Start of the sequence to decode; must satisfy begin <= curr < end.
// end    One past the last byte of the input.
//
// Returns a pair of (sequence length in bytes, decoded code point).
// An invalid lead byte, a sequence running past end, or an invalid sequence
// is reported through ODBC_FAIL (defined in Macros.h, not visible here;
// presumably throws odbc::Exception - confirm).
pair<int, char32_t> StringConverter::utf8ToCodePoint(
    const char* begin, const char* curr, const char* end)
{
    assert(begin != nullptr && end != nullptr);
    assert(begin <= curr && curr < end);

    int len = utf8::getSequenceLength(*curr);
    if (len == 1)
    {
        // Short-cut for the easy and common case. A one-byte sequence has its
        // top bit clear, so converting the char cannot sign-extend.
        return pair<int, char32_t>(len, *curr);
    }
    if (len == -1)
    {
        ODBC_FAIL("The string contains an invalid UTF-8 byte sequence at "
            "position " << (curr - begin) << ".");
    }

    // We have to make sure that we don't exceed the end of the string.
    // Compare against the number of remaining bytes instead of computing
    // curr + len: forming a pointer more than one past the end of the buffer
    // is undefined behaviour.
    if ((end - curr) < len)
    {
        ODBC_FAIL("The string contains an incomplete UTF-8 byte sequence at "
            "position " << (curr - begin) << ".");
    }

    if (!utf8::isValidSequence(len, curr))
    {
        ODBC_FAIL("The string contains an invalid UTF-8 byte sequence at "
            "position " << (curr - begin) << ".");
    }

    return pair<int, char32_t>(len, utf8::decode(len, curr));
}
//------------------------------------------------------------------------------
// Counts the UTF-16 code units needed to represent the UTF-8 byte range
// [begin, end). Every code point is decoded and checked for UTF-16
// representability; a non-representable code point is reported via
// ODBC_CHECK.
size_t StringConverter::utf8ToUtf16Length(const char* begin, const char* end)
{
    assert(begin != nullptr && end != nullptr);

    size_t units = 0;

    for (const char* pos = begin; pos < end; )
    {
        pair<int, char32_t> cp = utf8ToCodePoint(begin, pos, end);
        pos += cp.first;

        ODBC_CHECK(utf16::isRepresentable(cp.second),
            "The UTF-8 string contains codepoint U+" <<
            std::hex << (uint32_t)cp.second <<
            ", which cannot be represented in UTF-16.");

        // Supplementary-plane code points take two code units, all others one.
        if (utf16::needsSurrogatePair(cp.second))
            units += 2;
        else
            units += 1;
    }

    return units;
}
//------------------------------------------------------------------------------
NS_ODBC_END
46 changes: 46 additions & 0 deletions src/odbc/StringConverter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef ODBC_STRING_CONVERTER_H_INCLUDED
#define ODBC_STRING_CONVERTER_H_INCLUDED
//------------------------------------------------------------------------------
#include <odbc/Config.h>
#include <cstddef>
#include <utility>
#include <string>
//------------------------------------------------------------------------------
NS_ODBC_START
//------------------------------------------------------------------------------
/**
* Collection of static string conversion helpers (currently UTF-8 to UTF-16).
* The class cannot be instantiated.
*/
class ODBC_EXPORT StringConverter
{
public:
StringConverter() = delete;

/**
* Converts a null-terminated UTF-8 string to a UTF-16 string.
*
* A nullptr input, a malformed UTF-8 sequence, or a code point that cannot be
* represented in UTF-16 is reported as an error by the implementation
* (presumably by throwing odbc::Exception; confirm against Macros.h).
*
* @param src The null-terminated UTF-8 string to be converted. Must not be
* nullptr.
* @return The resulting UTF-16 string.
*/
static std::u16string utf8ToUtf16(const char* src);

/**
* Converts a UTF-8 string to a UTF-16 string.
*
* Errors are reported in the same way as for the null-terminated overload.
*
* @param src The UTF-8 string to be converted. Must not be nullptr.
* @param srcLength The length of the input string in bytes.
* @return The resulting UTF-16 string.
*/
static std::u16string utf8ToUtf16(const char* src, std::size_t srcLength);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should also add an overload just taking a const char* for null-terminated strings.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


private:
/**
* Converts the UTF-8 byte range [begin, end) to a UTF-16 string.
*
* @param begin Start of the input. Must not be nullptr.
* @param end One past the last input byte, or nullptr if the input is
* null-terminated.
* @return The resulting UTF-16 string.
*/
static std::u16string utf8ToUtf16(const char* begin, const char* end);

/**
* Decodes the single UTF-8 sequence starting at curr.
*
* @param begin Start of the whole input; used to compute the byte position
* reported in error messages.
* @param curr Start of the sequence to decode, with begin <= curr < end.
* @param end One past the last input byte.
* @return A pair holding the sequence length in bytes as first and the
* decoded code point as second.
*/
static std::pair<int, char32_t> utf8ToCodePoint(
const char* begin,
const char* curr,
const char* end);

/**
* Computes the number of UTF-16 code units needed to represent the UTF-8
* byte range [begin, end).
*
* @param begin Start of the input. Must not be nullptr.
* @param end One past the last input byte. Must not be nullptr.
* @return The number of UTF-16 code units.
*/
static std::size_t utf8ToUtf16Length(const char* begin, const char* end);
};
//------------------------------------------------------------------------------
NS_ODBC_END
//------------------------------------------------------------------------------
#endif
54 changes: 54 additions & 0 deletions src/odbc/internal/charset/Utf16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef ODBC_INTERNAL_CHARSET_UTF16_H_INCLUDED
#define ODBC_INTERNAL_CHARSET_UTF16_H_INCLUDED
//------------------------------------------------------------------------------
#include <cassert>
#include <utility>
#include <odbc/Config.h>
//------------------------------------------------------------------------------
NS_ODBC_START
//------------------------------------------------------------------------------
namespace utf16 {
//------------------------------------------------------------------------------
/**
 * Tells whether a code point can be encoded in UTF-16: it must not exceed
 * U+10FFFF and must not lie in the surrogate range U+D800..U+DFFF.
 *
 * @param c  The code point to check.
 * @return   True if the code point is representable in UTF-16, false
 *           otherwise.
 */
inline bool isRepresentable(char32_t c)
{
    if (c > 0x10FFFF)
        return false;

    const bool isSurrogate = (c >= 0xD800) && (c <= 0xDFFF);
    return !isSurrogate;
}
//------------------------------------------------------------------------------
/**
 * Tells whether a code point lies above the Basic Multilingual Plane and
 * therefore has to be encoded as a surrogate pair.
 *
 * @param c  The code point to check.
 * @return   True if a surrogate pair is needed, false otherwise.
 */
inline bool needsSurrogatePair(char32_t c)
{
    const char32_t lastBmpCodePoint = 0xFFFF;
    return c > lastBmpCodePoint;
}
//------------------------------------------------------------------------------
/**
 * Splits a supplementary-plane code point into its UTF-16 surrogate pair.
 *
 * @param c  Code point in the range U+10000 to U+10FFFF.
 * @return   A pair with the high surrogate as first and the low surrogate as
 *           second.
 */
inline std::pair<char16_t, char16_t> encodeSurrogatePair(char32_t c)
{
    assert((c >= 0x10000) && (c <= 0x10FFFF));

    // After removing the 0x10000 offset, 20 payload bits remain: the upper
    // ten go into the high surrogate, the lower ten into the low surrogate.
    const char32_t payload = c - 0x10000;
    const char16_t high = static_cast<char16_t>(0xD800 | (payload >> 10));
    const char16_t low = static_cast<char16_t>(0xDC00 | (payload & 0x3FF));
    return std::make_pair(high, low);
}
//------------------------------------------------------------------------------
} // namespace utf16
NS_ODBC_END
//------------------------------------------------------------------------------
#endif
130 changes: 130 additions & 0 deletions src/odbc/internal/charset/Utf8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#ifndef ODBC_INTERNAL_CHARSET_UTF8_H_INCLUDED
#define ODBC_INTERNAL_CHARSET_UTF8_H_INCLUDED
//------------------------------------------------------------------------------
#include <cassert>
#include <odbc/Config.h>
#include <odbc/internal/Macros.h>
//------------------------------------------------------------------------------
NS_ODBC_START
//------------------------------------------------------------------------------
namespace utf8 {
//------------------------------------------------------------------------------
/**
 * Derives the length of a UTF-8 sequence from its first byte.
 *
 * @param c  First byte of a UTF-8 sequence.
 * @return   The sequence length (1 to 4), or -1 if the byte cannot start a
 *           sequence.
 */
inline int getSequenceLength(char c)
{
    const unsigned char lead = static_cast<unsigned char>(c);

    // The high nibble of the lead byte determines the sequence length.
    switch (lead >> 4)
    {
    case 0x0: case 0x1: case 0x2: case 0x3:
    case 0x4: case 0x5: case 0x6: case 0x7:
        return 1;                           // 0xxxxxxx
    case 0xC: case 0xD:
        return 2;                           // 110xxxxx
    case 0xE:
        return 3;                           // 1110xxxx
    case 0xF:
        return (lead & 0x08) ? -1 : 4;      // only 11110xxx is a lead byte
    default:
        return -1;                          // 10xxxxxx is a continuation byte
    }
}
//------------------------------------------------------------------------------
/**
 * Checks if an UTF-8 sequence is valid.
 *
 * A sequence is accepted if the lead byte matches the given length, all
 * following bytes are continuation bytes (10xxxxxx), and the sequence is not
 * an overlong encoding (a code point encoded with more bytes than necessary,
 * which RFC 3629 forbids).
 *
 * The check does not reject encoded surrogates (U+D800..U+DFFF) or values
 * above U+10FFFF; callers have to check the decoded code point themselves.
 *
 * @param len  Length of the sequence. Must be in [1,4].
 * @param c    Pointer to the sequence start. At least len bytes must be
 *             readable.
 * @return     Returns true if the sequence is a valid UTF-8 sequence of the
 *             given length, false otherwise.
 */
inline bool isValidSequence(int len, const char* c)
{
    // Compare unsigned byte values so that the range checks below do not
    // depend on the signedness of char.
    const unsigned char* b = reinterpret_cast<const unsigned char*>(c);

    switch (len)
    {
    case 1:
        return ((b[0] & 0x80) == 0x00);
    case 2:
        // Lead bytes 0xC0 and 0xC1 can only encode U+0000..U+007F (overlong).
        return ((b[0] & 0xE0) == 0xC0) && (b[0] >= 0xC2)
            && ((b[1] & 0xC0) == 0x80);
    case 3:
        // 0xE0 followed by 0x80..0x9F encodes a value below U+0800 (overlong).
        return ((b[0] & 0xF0) == 0xE0) && ((b[1] & 0xC0) == 0x80)
            && !((b[0] == 0xE0) && (b[1] < 0xA0))
            && ((b[2] & 0xC0) == 0x80);
    case 4:
        // 0xF0 followed by 0x80..0x8F encodes a value below U+10000 (overlong).
        return ((b[0] & 0xF8) == 0xF0) && ((b[1] & 0xC0) == 0x80)
            && !((b[0] == 0xF0) && (b[1] < 0x90))
            && ((b[2] & 0xC0) == 0x80) && ((b[3] & 0xC0) == 0x80);
    }

    // Unsupported length: a programming error. Still return a value so that
    // builds with NDEBUG do not run off the end of a non-void function.
    assert(false);
    return false;
}
//------------------------------------------------------------------------------
/**
 * Decodes a two-byte UTF-8 sequence (110xxxxx 10xxxxxx).
 *
 * @param c  Pointer to the sequence start.
 * @return   Returns the decoded character.
 */
inline char32_t decode2(const char* c)
{
    assert(isValidSequence(2, c));

    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(c);

    // Five payload bits from the lead byte, then six from the trail byte.
    char32_t codePoint = bytes[0] & 0x1Fu;
    codePoint = (codePoint << 6) | (bytes[1] & 0x3Fu);
    return codePoint;
}
//------------------------------------------------------------------------------
/**
 * Decodes a three-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx).
 *
 * @param c  Pointer to the sequence start.
 * @return   Returns the decoded character.
 */
inline char32_t decode3(const char* c)
{
    assert(isValidSequence(3, c));

    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(c);

    // Four payload bits from the lead byte, then six from each trail byte.
    char32_t codePoint = bytes[0] & 0x0Fu;
    codePoint = (codePoint << 6) | (bytes[1] & 0x3Fu);
    codePoint = (codePoint << 6) | (bytes[2] & 0x3Fu);
    return codePoint;
}
//------------------------------------------------------------------------------
/**
 * Decodes a four-byte UTF-8 sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
 *
 * @param c  Pointer to the sequence start.
 * @return   Returns the decoded character.
 */
inline char32_t decode4(const char* c)
{
    assert(isValidSequence(4, c));

    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(c);

    // Three payload bits from the lead byte, then six from each trail byte.
    char32_t codePoint = bytes[0] & 0x07u;
    codePoint = (codePoint << 6) | (bytes[1] & 0x3Fu);
    codePoint = (codePoint << 6) | (bytes[2] & 0x3Fu);
    codePoint = (codePoint << 6) | (bytes[3] & 0x3Fu);
    return codePoint;
}
//------------------------------------------------------------------------------
/**
 * Decodes a UTF-8 sequence of the given length, which must be in [1, 4].
 *
 * The sequence is expected to be valid; the multi-byte decoders assert this
 * in debug builds.
 *
 * @param len  The length of the sequence. Must be in [1, 4].
 * @param c    Pointer to the sequence start.
 * @return     Returns the decoded character. If len is outside [1, 4] (a
 *             programming error that asserts in debug builds), U+FFFD is
 *             returned.
 */
inline char32_t decode(int len, const char* c)
{
    switch (len)
    {
    case 1:
        // Convert via unsigned char, like the multi-byte decoders, so that a
        // byte value can never be sign-extended into a huge code point.
        return static_cast<char32_t>(static_cast<unsigned char>(*c));
    case 2:
        return decode2(c);
    case 3:
        return decode3(c);
    case 4:
        return decode4(c);
    }

    // Unsupported length. Return the replacement character so that builds
    // with NDEBUG do not run off the end of a non-void function.
    assert(false);
    return 0xFFFD;
}
//------------------------------------------------------------------------------
} // namespace utf8
NS_ODBC_END
//------------------------------------------------------------------------------
#endif
3 changes: 3 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/src)

SET(OdbcCppTest_sources
EnvironmentTest.cpp
StringConverterTest.cpp
TestMain.cpp
TypesTest.cpp
internal/ParameterDataTest.cpp
internal/UtilInternalTest.cpp
internal/charset/Utf16Test.cpp
internal/charset/Utf8Test.cpp
)

ADD_EXECUTABLE(
Expand Down
Loading
0