8000 Feature/escape unicode control chars by cpjulia · Pull Request #14805 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Feature/escape unicode control chars #14805

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Sep 30, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4bfa630
Added parser for retaining or escaping control and unicode characters…
cpjulia Sep 18, 2021
600deca
Merge branch 'devel' of https://github.com/arangodb/arangodb into fea…
cpjulia Sep 20, 2021
feb58b7
Added unicode escaping for 4 bytes representation, parsing for broken…
cpjulia Sep 20, 2021
616265e
Merge branch 'devel' of https://github.com/arangodb/arangodb into fea…
cpjulia Sep 20, 2021
f48f87a
Added more tests
cpjulia Sep 20, 2021
4cc1e64
Removed unused functions, updated CHANGELOG, removed unused include i…
cpjulia Sep 20, 2021
2448e9c
Resolved CHANGELOG conflict from merge with devel
cpjulia Sep 20, 2021
3c29365
Update tests/Logger/EscaperTest.cpp
cpjulia Sep 20, 2021
bfa05eb
Update lib/Logger/LoggerFeature.cpp
cpjulia Sep 20, 2021
0ca0d25
Update lib/Logger/LoggerFeature.h
cpjulia Sep 20, 2021
39f97e1
Update lib/Logger/Escaper.h
cpjulia Sep 20, 2021
7a3480f
Update lib/Logger/Escaper.cpp
cpjulia Sep 20, 2021
963921b
Update CHANGELOG
cpjulia Sep 20, 2021
015a188
Update CHANGELOG
cpjulia Sep 21, 2021
a007926
Update CHANGELOG
cpjulia Sep 21, 2021
2f8f3a2
Updated CHANGELOG
cpjulia Sep 21, 2021
3067a5a
Added more tests, updated CHANGELOG
cpjulia Sep 21, 2021
482e186
Update tests/Logger/EscaperTest.cpp
cpjulia Sep 22, 2021
0ea373f
Update tests/Logger/EscaperTest.cpp
cpjulia Sep 22, 2021
c9ad897
Update CHANGELOG
cpjulia Sep 22, 2021
832f569
Update CHANGELOG
cpjulia Sep 22, 2021
7950a3f
Update CHANGELOG
cpjulia Sep 22, 2021
7818ca2
Merge branch 'devel' of github.com:arangodb/arangodb into feature/esc…
jsteemann Sep 22, 2021
a3d9738
Merge branch 'devel' into feature/escape-unicode-control-chars
cpjulia Sep 27, 2021
d9385b1
Merge branch 'devel' of https://github.com/arangodb/arangodb into fea…
cpjulia Sep 29, 2021
432a737
Merge branch 'feature/escape-unicode-control-chars' of https://github…
cpjulia Sep 29, 2021
92f3cbf
Merge branch 'devel' into feature/escape-unicode-control-chars
mchacki Sep 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10000
Prev Previous commit
Next Next commit
Added more tests
  • Loading branch information
cpjulia committed Sep 20, 2021
commit f48f87a225296eda836e4db5f0f8dfe27d95254c
60 changes: 31 additions & 29 deletions lib/Logger/Escaper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
////////////////////////////////////////////////////////////////////////////////

#include "Escaper.h"
#include "Basics/debugging.h"

namespace arangodb {

Expand Down Expand Up @@ -93,7 +94,8 @@ void UnicodeCharsEscaper::writeCharHelper(uint16_t c, char*& output) {
}

void UnicodeCharsEscaper::writeCharIntoOutputBuffer(uint32_t c, char*& output, int numBytes) {
if (numBytes == 4) {
if (numBytes == 4) { // when the unicode requires 4 bytes for representation, its code is escaped with surrogate pairs, the highest and the lowest bytes of the character
TRI_ASSERT(c >= 0x10000U);
c -= 0x10000U;
uint16_t high = (uint16_t) (((c & 0xffc00U) >> 10) + 0xd800);
writeCharHelper(high, output);
Expand All @@ -118,80 +120,80 @@ void Escaper<ControlCharHandler, UnicodeCharHandler>::writeIntoOutputBuffer(
unsigned char const* end = p + message.length();
while (p < end) {
unsigned char c = *p;
if (c < 128) {
if (c < 0x20) {
this->_controlHandler.writeCharIntoOutputBuffer(c, buffer, 1);
} else {
if (c < 128) { // the character is ASCII
if (c < 0x20 || c == 0x7f) { // the character is either control, which comprises codes until 32, or is DEL, which is not a visible character
this->_controlHandler.writeCharIntoOutputBuffer(c, buffer, 1); //retain or escape the control character
} else { // is a visible ascii character
*buffer++ = c;
}
// single byte
p++;
} else if (c < 224) {
if ((p + 1) >= end) {
} else if (c < 224) { // unicode which requires 2 bytes for representation
if ((p + 1) >= end) { // no next byte to represent it, so it's broken unicode
*buffer++ = '?';
break;
p++;
continue;
}
uint8_t d = (uint8_t) * (p + 1);
if ((d & 0xC0) == 0x80) {
if ((d & 0xC0) == 0x80) { // is within the rules for representing unicode characters for the second byte
this->_unicodeHandler.writeCharIntoOutputBuffer(((c & 0x1F) << 6) | (d & 0x3F),
buffer, 2);
buffer, 2); // retain or escape the unicode character represented by 2 bytes
++p;
} else {
} else { // the next byte is broken unicode
*buffer++ = '?';
}
p++;
} else if (c < 240) {
if ((p + 2) >= end) {
} else if (c < 240) { // unicode which requires 3 bytes for representation
if ((p + 2) >= end) { // there's no 2 other sequential bytes to represent the unicode character, so it's broken unicode
*buffer++ = '?';
p++;
continue;
}
uint8_t d = (uint8_t) * (p + 1);
if ((d & 0xC0) == 0x80) {
if ((d & 0xC0) == 0x80) { // second byte is within the rules for representing a unicode character that requires 3 bytes for representation
++p;
uint8_t e = (uint8_t) * (p + 1);
if ((e & 0xC0) == 0x80) {
if ((e & 0xC0) == 0x80) { // third byte is within the rules for representing a unicode character that requires 3 bytes for representation
++p;
this->_unicodeHandler.writeCharIntoOutputBuffer(
((c & 0x0F) << 12) | ((d & 0x3F) << 6) | (e & 0x3F), buffer, 3);
} else {
((c & 0x0F) << 12) | ((d & 0x3F) << 6) | (e & 0x3F), buffer, 3); // retain or escape the unicode character represented by 3 bytes
} else { // third byte is not within the rules for representing a unicode character
*buffer++ = '?';
}
} else {
} else { // second byte is not within the rules for representing a unicode character
*buffer++ = '?';
}
p++;
} else if (c < 248) {
if ((p + 3) >= end) {
} else if (c < 248) { // unicode which requires 4 bytes for representation
if ((p + 3) >= end) { // there's not 3 sequential bytes for representing this unicode character, so it's broken unicode
*buffer++ = '?';
p++;
continue;
}
uint8_t d = (uint8_t) * (p + 1);
if ((d & 0xC0) == 0x80) {
if ((d & 0xC0) == 0x80) { // second byte is within the rules for representing a unicode character that requires 4 bytes for representation
++p;
uint8_t e = (uint8_t) * (p + 1);
if ((e & 0xC0) == 0x80) {
if ((e & 0xC0) == 0x80) { // third byte is within the rules for representing a unicode character that requires 4 bytes for representation
++p;
uint8_t f = (uint8_t) * (p + 1);
if ((f & 0xC0) == 0x80) {
if ((f & 0xC0) == 0x80) { // fourth byte is within the rules for representing a unicode character that requires 4 bytes for representation
p++;
this->_unicodeHandler.writeCharIntoOutputBuffer(((c & 0x07) << 18) |
((d & 0x3F) << 12) |
((e & 0x3F) << 6) |
(f & 0x3F),
buffer, 4);
} else {
buffer, 4); // retain or escape the unicode character represented by 4 bytes
} else { // fourth byte is not within the rules for representing a unicode character
*buffer++ = '?';
}
} else {
} else { // third byte is not within the rules for representing a unicode character
*buffer++ = '?';
}
} else {
} else { // second byte is not within the rules for representing a unicode character
*buffer++ = '?';
}
p++;
} else {
} else { // broken unicode, is not ascii and not represented with 2, 3 or 4 bytes
*buffer++ = '?';
// invalid UTF-8 sequence
break;
Expand Down
37 changes: 32 additions & 5 deletions tests/Logger/EscaperTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@

#include "Logger/Escaper.h"

#include "Logger/LogMacros.h"

#include <string.h>
#include <string>

Expand All @@ -41,6 +43,22 @@ using namespace arangodb;
// --SECTION-- test suite
// -----------------------------------------------------------------------------

/// Test fixture that provides pre-built ASCII inputs to the escaper tests.
///
/// `asciiVisibleChars` holds every character with a code from 33 ('!') up to
/// and including 126 ('~'), in ascending order. `bigString` is made of
/// repeated copies of `asciiVisibleChars`, appended until its length is at
/// least 1000 characters.
class EscaperTest : public ::testing::Test {
 protected:
  std::string asciiVisibleChars;
  std::string bigString;

  EscaperTest() {
    // collect the characters '!' (33) through '~' (126)
    for (char ch = 33; ch <= 126; ++ch) {
      asciiVisibleChars.push_back(ch);
    }
    // keep appending whole copies until the 1000 character mark is reached
    while (bigString.size() < 1000) {
      bigString.append(asciiVisibleChars);
    }
  }
};

template <typename EscaperType>
void verifyExpectedValues(std::string const& inputString, std::string const& expectedOutput,
size_t expectedSize, EscaperType& escaper) {
Expand All @@ -55,8 +73,10 @@ void verifyExpectedValues(std::string const& inputString, std::string const& exp
EXPECT_EQ(outputString, expectedOutput);
}

TEST(EscaperTest, test_suppress_control_retain_unicode) {
TEST_F(EscaperTest, test_suppress_control_retain_unicode) {
Escaper<ControlCharsSuppressor, UnicodeCharsRetainer> escaper;
verifyExpectedValues(asciiVisibleChars, asciiVisibleChars, asciiVisibleChars.size()*4, escaper);
verifyExpectedValues(bigString, bigString, bigString.size()*4, escaper);
verifyExpectedValues("€", "€", 12, escaper);
verifyExpectedValues(" € ", " € ", 24, escaper);
verifyExpectedValues("mötör", "mötör", 28, escaper);
Expand All @@ -77,11 +97,12 @@ TEST(EscaperTest, test_suppress_control_retain_unicode) {
validUnicode = "𐍈";
verifyExpectedValues(validUnicode.substr(0, 1), "?", 4, escaper);
verifyExpectedValues(validUnicode.substr(0, 1) + "\n", "? ", 8, escaper);
// invalid unicode: '\ufffe', '\U110000','\ud800', 'test\xFE'
}

TEST(EscaperTest, test_suppress_control_escape_unicode) {
TEST_F(EscaperTest, test_suppress_control_escape_unicode) {
Escaper<ControlCharsSuppressor, UnicodeCharsEscaper> escaper;
verifyExpectedValues(asciiVisibleChars, asciiVisibleChars, asciiVisibleChars.size()*6, escaper);
verifyExpectedValues(bigString, bigString, bigString.size()*6, escaper);
verifyExpectedValues("€", "\\u20AC", 18, escaper);
verifyExpectedValues(" € ", " \\u20AC ", 36, escaper);
verifyExpectedValues("mötör", "m\\u00F6t\\u00F6r", 42, escaper);
Expand All @@ -103,8 +124,11 @@ TEST(EscaperTest, test_suppress_control_escape_unicode) {
verifyExpectedValues("\x07", " ", 6, escaper);
verifyExpectedValues(std::string("\0", 1), " ", 6, escaper);
}
TEST(EscaperTest, test_escape_control_retain_unicode) {

TEST_F(EscaperTest, test_escape_control_retain_unicode) {
Escaper<ControlCharsEscaper, UnicodeCharsRetainer> escaper;
verifyExpectedValues(asciiVisibleChars, asciiVisibleChars, asciiVisibleChars.size()*4, escaper);
verifyExpectedValues(bigString, bigString, bigString.size()*4, escaper);
verifyExpectedValues("€", "€", 12, escaper);
verifyExpectedValues(" € ", " € ", 24, escaper);
verifyExpectedValues("mötör", "mötör", 28, escaper);
Expand All @@ -126,8 +150,11 @@ TEST(EscaperTest, test_escape_control_retain_unicode) {
verifyExpectedValues("\x07", "\\x07", 4, escaper);
verifyExpectedValues(std::string("\0", 1), "\\x00", 4, escaper);
}
TEST(EscaperTest, test_escape_control_escape_unicode) {

TEST_F(EscaperTest, test_escape_control_escape_unicode) {
Escaper<ControlCharsEscaper, UnicodeCharsEscaper> escaper;
verifyExpectedValues(asciiVisibleChars, asciiVisibleChars, asciiVisibleChars.size()*6, escaper);
verifyExpectedValues(bigString, bigString, bigString.size()*6, escaper);
verifyExpectedValues("€", "\\u20AC", 18, escaper);
verifyExpectedValues(" € ", " \\u20AC ", 36, escaper);
verifyExpectedValues("mötör", "m\\u00F6t\\u00F6r", 42, escaper);
Expand Down
0