Feature/escape unicode control chars by cpjulia · Pull Request #14805 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Feature/escape unicode control chars #14805

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Sep 30, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4bfa630
Added parser for retaining or escaping control and unicode characters…
cpjulia Sep 18, 2021
600deca
Merge branch 'devel' of https://github.com/arangodb/arangodb into fea…
cpjulia Sep 20, 2021
feb58b7
Added unicode escaping for 4 bytes representation, parsing for broken…
cpjulia Sep 20, 2021
616265e
Merge branch 'devel' of https://github.com/arangodb/arangodb into fea…
cpjulia Sep 20, 2021
f48f87a
Added more tests
cpjulia Sep 20, 2021
4cc1e64
Removed unused functions, updated CHANGELOG, removed unused include i…
cpjulia Sep 20, 2021
2448e9c
Resolved CHANGELOG conflict from merge with devel
cpjulia Sep 20, 2021
3c29365
Update tests/Logger/EscaperTest.cpp
cpjulia Sep 20, 2021
bfa05eb
Update lib/Logger/LoggerFeature.cpp
cpjulia Sep 20, 2021
0ca0d25
Update lib/Logger/LoggerFeature.h
cpjulia Sep 20, 2021
39f97e1
Update lib/Logger/Escaper.h
cpjulia Sep 20, 2021
7a3480f
Update lib/Logger/Escaper.cpp
cpjulia Sep 20, 2021
963921b
Update CHANGELOG
cpjulia Sep 20, 2021
015a188
Update CHANGELOG
cpjulia Sep 21, 2021
a007926
Update CHANGELOG
cpjulia Sep 21, 2021
2f8f3a2
Updated CHANGELOG
cpjulia Sep 21, 2021
3067a5a
Added more tests, updated CHANGELOG
cpjulia Sep 21, 2021
482e186
Update tests/Logger/EscaperTest.cpp
cpjulia Sep 22, 2021
0ea373f
Update tests/Logger/EscaperTest.cpp
cpjulia Sep 22, 2021
c9ad897
Update CHANGELOG
cpjulia Sep 22, 2021
832f569
Update CHANGELOG
cpjulia Sep 22, 2021
7950a3f
Update CHANGELOG
cpjulia Sep 22, 2021
7818ca2
Merge branch 'devel' of github.com:arangodb/arangodb into feature/esc…
jsteemann Sep 22, 2021
a3d9738
Merge branch 'devel' into feature/escape-unicode-control-chars
cpjulia Sep 27, 2021
d9385b1
Merge branch 'devel' of https://github.com/arangodb/arangodb into fea…
cpjulia Sep 29, 2021
432a737
Merge branch 'feature/escape-unicode-control-chars' of https://github…
cpjulia Sep 29, 2021
92f3cbf
Merge branch 'devel' into feature/escape-unicode-control-chars
mchacki Sep 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Added unicode escaping for 4 bytes representation, parsing for broken…
… unicode and more unit tests
  • Loading branch information
cpjulia committed Sep 20, 2021
commit feb58b7d085bea43bf6d9bf2f48b4e43a3121d38
71 changes: 40 additions & 31 deletions lib/Logger/Escaper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
////////////////////////////////////////////////////////////////////////////////

#include "Escaper.h"
#include <assert.h>

namespace arangodb {

Expand Down Expand Up @@ -63,18 +62,13 @@ void ControlCharsEscaper::writeCharIntoOutputBuffer(uint32_t c, char*& output, i
void UnicodeCharsRetainer::writeCharIntoOutputBuffer(uint32_t c, char*& output, int numBytes) {
if (numBytes == 2) {
uint16_t num1 = c & 0xffff;
uint8_t num2 = ((num1 >> 6) & 0x1f) | 0xc0;
uint8_t num3 = (num1 & 0x3f) | 0x80;
*output++ = num2;
*output++ = num3;
*output++ = ((num1 >> 6) & 0x1f) | 0xc0;
*output++ = (num1 & 0x3f) | 0x80;
} else if (numBytes == 3) {
uint16_t num1 = c & 0xffff;
uint8_t num2 = ((num1 >> 12) & 0x0f) | 0xe0;
uint8_t num3 = ((num1 >> 6) & 0x3f) | 0x80;
uint8_t num4 = (num1 & 0x3f) | 0x80;
*output++ = num2;
*output++ = num3;
*output++ = num4;
*output++ = ((num1 >> 12) & 0x0f) | 0xe0;
*output++ = ((num1 >> 6) & 0x3f) | 0x80;
*output++ = (num1 & 0x3f) | 0x80;
} else if (numBytes == 4) {
*output++ = ((c >> 18) & 0x07) | 0xF0;
*output++ = ((c >> 12) & 0x3f) | 0x80;
Expand All @@ -83,7 +77,7 @@ void UnicodeCharsRetainer::writeCharIntoOutputBuffer(uint32_t c, char*& output,
}
}

void UnicodeCharsEscaper::writeCharIntoOutputBuffer(uint32_t c, char*& output, int numBytes) {
void UnicodeCharsEscaper::writeCharHelper(uint16_t c, char*& output) {
*output++ = '\\';
*output++ = 'u';

Expand All @@ -98,14 +92,28 @@ void UnicodeCharsEscaper::writeCharIntoOutputBuffer(uint32_t c, char*& output, i
*output++ = (i4 < 10) ? ('0' + i4) : ('A' + i4 - 10);
}

/// Writes the escaped representation of code point `c` to `output` by
/// delegating to writeCharHelper(), which advances `output`.
///
/// @param c        decoded code point
/// @param output   write cursor into the destination buffer (moved forward)
/// @param numBytes length of the UTF-8 sequence `c` was decoded from
///
/// A code point that came from a 4-byte sequence does not fit into a single
/// 16-bit unit, so it is emitted as a UTF-16 surrogate pair: leading unit
/// first, trailing unit second. Every other code point is emitted as one
/// 16-bit unit.
void UnicodeCharsEscaper::writeCharIntoOutputBuffer(uint32_t c, char*& output, int numBytes) {
  if (numBytes != 4) {
    // value is narrowed to 16 bits, exactly as the helper's parameter type demands
    writeCharHelper(static_cast<uint16_t>(c), output);
    return;
  }

  // surrogate encoding works on the offset above the 16-bit range
  uint32_t const offset = c - 0x10000U;
  // upper 10 bits of the offset go into the leading (high) surrogate
  uint16_t const leading = static_cast<uint16_t>(((offset & 0xffc00U) >> 10) + 0xd800);
  // lower 10 bits of the offset go into the trailing (low) surrogate
  uint16_t const trailing = static_cast<uint16_t>((offset & 0x3ffU) + 0xdc00U);

  writeCharHelper(leading, output);
  writeCharHelper(trailing, output);
}

template <typename ControlCharHandler, typename UnicodeCharHandler>
size_t Escaper<ControlCharHandler, UnicodeCharHandler>::determineOutputBufferSize(std::string const& message) const {
size_t Escaper<ControlCharHandler, UnicodeCharHandler>::determineOutputBufferSize(
std::string const& message) const {
return message.size() * std::max(this->_controlHandler.maxCharLength(),
this->_unicodeHandler.maxCharLength());
}

template <typename ControlCharHandler, typename UnicodeCharHandler>
void Escaper<ControlCharHandler, UnicodeCharHandler>::writeIntoOutputBuffer(std::string const& message, char*& buffer) {
void Escaper<ControlCharHandler, UnicodeCharHandler>::writeIntoOutputBuffer(
std::string const& message, char*& buffer) {
unsigned char const* p = reinterpret_cast<unsigned char const*>(message.data());
unsigned char const* end = p + message.length();
while (p < end) {
Expand All @@ -125,38 +133,39 @@ void Escaper<ControlCharHandler, UnicodeCharHandler>::writeIntoOutputBuffer(std:
}
uint8_t d = (uint8_t) * (p + 1);
if ((d & 0xC0) == 0x80) {
this->_unicodeHandler.writeCharIntoOutputBuffer(((c & 0x1F) << 6) | (d & 0x3F), buffer, 2);
this->_unicodeHandler.writeCharIntoOutputBuffer(((c & 0x1F) << 6) | (d & 0x3F),
buffer, 2);
++p;
} else {
*buffer++ = '?';
break;
}
p++;
} else if (c < 240) {
} else if (c < 240) {
if ((p + 2) >= end) {
*buffer++ = '?';
break;
}
p++;
continue;
}
uint8_t d = (uint8_t) * (p + 1);
if ((d & 0xC0) == 0x80) {
++p;
uint8_t e = (uint8_t) * (p + 1);
if ((e & 0xC0) == 0x80) {
++p;
this->_unicodeHandler.writeCharIntoOutputBuffer(((c & 0x0F) << 12) | ((d & 0x3F) << 6) | (e & 0x3F), buffer, 3);
} else {
this->_unicodeHandler.writeCharIntoOutputBuffer(
((c & 0x0F) << 12) | ((d & 0x3F) << 6) | (e & 0x3F), buffer, 3);
} else {
*buffer++ = '?';
// break;
}
} else {
*buffer++ = '?';
// break;
}
p++;
} else if (c < 248) {
if ((p + 3) >= end) {
*buffer++ = '?';
break;
p++;
continue;
}
uint8_t d = (uint8_t) * (p + 1);
if ((d & 0xC0) == 0x80) {
Expand All @@ -165,21 +174,21 @@ void Escaper<ControlCharHandler, UnicodeCharHandler>::writeIntoOutputBuffer(std:
if ((e & 0xC0) == 0x80) {
++p;
uint8_t f = (uint8_t) * (p + 1);
if((f & 0xC0) == 0x80) {
if ((f & 0xC0) == 0x80) {
p++;
this->_unicodeHandler.writeCharIntoOutputBuffer(
((c & 0x07) << 18) | ((d & 0x3F) << 12) | ((e & 0x3F) << 6) | (f & 0x3F), buffer, 4);
} else {
this->_unicodeHandler.writeCharIntoOutputBuffer(((c & 0x07) << 18) |
((d & 0x3F) << 12) |
((e & 0x3F) << 6) |
(f & 0x3F),
buffer, 4);
} else {
*buffer++ = '?';
// break;
}
} else {
*buffer++ = '?';
//break;
}
} else {
*buffer++ = '?';
// break;
}
p++;
} else {
Expand Down
1 change: 1 addition & 0 deletions lib/Logger/Escaper.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ struct UnicodeCharsRetainer { //worst case 4 digits
/// Handler that writes code points as escape sequences: \u + 4 digits.
struct UnicodeCharsEscaper {
  /// Length of a single escape sequence; the Escaper multiplies the message
  /// size by this value when it determines the output buffer size.
  size_t maxCharLength() const {
    // backslash and 'u' (2 characters) followed by 4 digits
    size_t const escapeLength = 2 + 4;
    return escapeLength;
  }

  /// Writes the escaped form of code point `c` to `output`; `numBytes` is
  /// the length of the UTF-8 sequence the code point was decoded from.
  void writeCharIntoOutputBuffer(uint32_t c, char*& output, int numBytes);

  /// Writes one 16-bit unit `c` to `output` as a single \u escape sequence.
  void writeCharHelper(uint16_t c, char*& output);
};

class GeneralEscaper {
Expand Down
59 changes: 34 additions & 25 deletions tests/Logger/EscaperTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,9 @@
#include "gtest/gtest.h"

#include "Logger/Escaper.h"
#include "Logger/Logger.h"
#include "Logger/LogMacros.h"

#include <array>
#include <cstring>
#include <string>
#include <string.h>
#include <string>

#ifdef TRI_HAVE_UNISTD_H
#include <unistd.h>
Expand All @@ -45,26 +41,22 @@ using namespace arangodb;
// --SECTION-- test suite
// -----------------------------------------------------------------------------

template<typename EscaperType>
void verifyExpectedValues(std::string const& inputString, std::string const& expectedOutput, size_t expectedSize, EscaperType& escaper) {
//LOG_DEVEL << "verifyExpectedValues";
template <typename EscaperType>
void verifyExpectedValues(std::string const& inputString, std::string const& expectedOutput,
size_t expectedSize, EscaperType& escaper) {
size_t messageSize = escaper.determineOutputBufferSize(inputString);
LOG_DEVEL << "size " << messageSize;
EXPECT_EQ(messageSize, expectedSize);
auto buffer = std::make_unique<char[]>(messageSize);
char* output = buffer.get();
escaper.writeIntoOutputBuffer(inputString, output);
size_t outputBufferSize = output - buffer.get();
// LOG_DEVEL << "output size " << outputBufferSize;
std::string outputString(buffer.get(), outputBufferSize);
// LOG_DEVEL << "output " << outputString << " " << outputString.size();
EXPECT_EQ(outputString.compare(expectedOutput), 0);
EXPECT_EQ(outputString, expectedOutput);
}

TEST(EscaperTest, test_suppress_control_retain_unicode) {
Escaper<ControlCharsSuppressor, UnicodeCharsRetainer> escaper;
// LOG_DEVEL << "SuppressControlRetainUnicode";
verifyExpectedValues("€", "€", 12, escaper);
verifyExpectedValues(" € ", " € ", 24, escaper);
verifyExpectedValues("mötör", "mötör", 28, escaper);
Expand All @@ -75,13 +67,18 @@ TEST(EscaperTest, test_suppress_control_retain_unicode) {
verifyExpectedValues("犬\r", "犬 ", 16, escaper);
verifyExpectedValues("", "", 0, escaper);
verifyExpectedValues("a", "a", 4, escaper);
std::string validUnicode = "€";
verifyExpectedValues("𐍈", "𐍈", 16, escaper); //\uD800\uDF48
verifyExpectedValues("𐍈 ", "𐍈 ", 20, escaper); //\uD800\uDF48
std::string validUnicode = "€";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 4, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "? ", 8, escaper);
verifyExpectedValues("\x07", " ", 4, escaper);
verifyExpectedValues(std::string("\0", 1), " ", 4, escaper);
//invalid unicode: '\ufffe', '\U110000','\ud800', 'test\xFE'
}
verifyExpectedValues(std::string("\0", 1), " ", 4, escaper);
validUnicode = "𐍈";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 4, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "? ", 8, escaper);
// invalid unicode: '\ufffe', '\U110000','\ud800', 'test\xFE'
}

TEST(EscaperTest, test_suppress_control_escape_unicode) {
Escaper<ControlCharsSuppressor, UnicodeCharsEscaper> escaper;
Expand All @@ -95,12 +92,16 @@ TEST(EscaperTest, test_suppress_control_escape_unicode) {
verifyExpectedValues("犬\r", "\\u72AC ", 24, escaper);
verifyExpectedValues("", "", 0, escaper);
verifyExpectedValues("a", "a", 6, escaper);
std::string validUnicode = "€";
verifyExpectedValues("𐍈", "\\uD800\\uDF48", 24, escaper); //\uD800\uDF48
verifyExpectedValues("𐍈 ", "\\uD800\\uDF48 ", 30, escaper); //\uD800\uDF48
std::string validUnicode = "€";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 6, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "? ", 12, escaper);
validUnicode = "𐍈";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 6, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "? ", 12, escaper);
verifyExpectedValues("\x07", " ", 6, escaper);
verifyExpectedValues(std::string("\0", 1), " ", 6, escaper);

verifyExpectedValues(std::string("\0", 1), " ", 6, escaper);
}
TEST(EscaperTest, test_escape_control_retain_unicode) {
Escaper<ControlCharsEscaper, UnicodeCharsRetainer> escaper;
Expand All @@ -114,11 +115,16 @@ TEST(EscaperTest, test_escape_control_retain_unicode) {
verifyExpectedValues("犬\r", "犬\\r", 16, escaper);
verifyExpectedValues("", "", 0, escaper);
verifyExpectedValues("a", "a", 4, escaper);
std::string validUnicode = "€";
verifyExpectedValues("𐍈", "𐍈", 16, escaper); //\uD800\uDF48
verifyExpectedValues("𐍈 ", "𐍈 ", 20, escaper); //\uD800\uDF48
std::string validUnicode = "€";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 4, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "?\\n", 8, escaper);
validUnicode = "𐍈";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 4, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "?\\n", 8, escaper);
verifyExpectedValues("\x07", "\\x07", 4, escaper);
verifyExpectedValues(std::string("\0", 1), "\\x00", 4, escaper);
verifyExpectedValues(std::string("\0", 1), "\\x00", 4, escaper);
}
TEST(EscaperTest, test_escape_control_escape_unicode) {
Escaper<ControlCharsEscaper, UnicodeCharsEscaper> escaper;
Expand All @@ -132,11 +138,14 @@ TEST(EscaperTest, test_escape_control_escape_unicode) {
verifyExpectedValues("犬\r", "\\u72AC\\r", 24, escaper);
verifyExpectedValues("", "", 0, escaper);
verifyExpectedValues("a", "a", 6, escaper);
std::string validUnicode = "€";
verifyExpectedValues("𐍈", "\\uD800\\uDF48", 24, escaper); //\uD800\uDF48
verifyExpectedValues("𐍈 ", "\\uD800\\uDF48 ", 30, escaper); //\uD800\uDF48
std::string validUnicode = "€";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 6, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "?\\n", 12, escaper);
validUnicode = "𐍈";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 6, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "?\\n", 12, escaper);
verifyExpectedValues("\x07", "\\x07", 6, escaper);
verifyExpectedValues(std::string("\0", 1), "\\x00", 6, escaper);
verifyExpectedValues(std::string("\0", 1), "\\x00", 6, escaper);
}


0