Added UTF-8 validation by Nostimo · Pull Request #13 · simdjson/simdjson-java · GitHub
[go: up one dir, main page]

Skip to content

Added UTF-8 validation #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 26, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Switched assertions to use AssertJ. Updated some of the tests to use SimdJsonParser::parse rather than the internal classes
  • Loading branch information
Nostimo committed Oct 1, 2023
commit e726341077f3673e6506f63db82e23eb8838e2f6
125 changes: 90 additions & 35 deletions src/test/java/org/simdjson/Utf8ValidatorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import java.util.Arrays;
import java.util.Objects;

import static org.junit.jupiter.api.Assertions.*;
import static org.assertj.core.api.Assertions.*;

class Utf8ValidatorTest {
private static final VectorSpecies<Byte> VECTOR_SPECIES = ByteVector.SPECIES_256;
Expand All @@ -21,15 +21,15 @@ void isAscii_true() {
byte[] bytes = new byte[]{'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'm', 'n', 'o', 'p', 'q'};
VectorMask<Byte> nonZeroBytes = VECTOR_SPECIES.indexInRange(0, bytes.length);

assertTrue(Utf8Validator.isAscii(ByteVector.fromArray(VECTOR_SPECIES, bytes, 0, nonZeroBytes)));
assertThat(Utf8Validator.isAscii(ByteVector.fromArray(VECTOR_SPECIES, bytes, 0, nonZeroBytes))).isTrue();
}

/** Verifies that {@code Utf8Validator.isAscii} returns false when one byte in the vector has its high bit set. */
@Test
void isAscii_false() {
// Fifteen ASCII letters followed by a single byte whose top bit is set (0b1000_0000).
byte[] bytes = new byte[]{'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'm', 'n', 'o', 'p', (byte) 0b1_0000000};
// Mask covering lane indices [0, bytes.length); it is passed to the masked load below because
// the array is shorter than the species used by this test.
VectorMask<Byte> nonZeroBytes = VECTOR_SPECIES.indexInRange(0, bytes.length);

// NOTE(review): this rendering shows the JUnit assertion and its AssertJ replacement back to back;
// both check the same condition on a freshly loaded vector.
assertFalse(Utf8Validator.isAscii(ByteVector.fromArray(VECTOR_SPECIES, bytes, 0, nonZeroBytes)));
assertThat(Utf8Validator.isAscii(ByteVector.fromArray(VECTOR_SPECIES, bytes, 0, nonZeroBytes))).isFalse();
}

/* ASCII / 1 BYTE TESTS */
Expand All @@ -41,7 +41,7 @@ void validate_allSevenBitValues_validAscii() {
allValidAscii[i] = (byte) i;
}

assertDoesNotThrow(() -> Utf8Validator.validate(allValidAscii));
assertThatCode(() -> Utf8Validator.validate(allValidAscii)).doesNotThrowAnyException();
}

@Test
Expand All @@ -53,9 +53,13 @@ void validate_allEightBitValues_invalidAscii() {
invalidAscii[index++] = (byte) eightBitVal;
}

SimdJsonParser parser = new SimdJsonParser();
for (int i = 0; i < 128; i += VECTOR_SPECIES.vectorByteSize()) {
byte[] vectorChunk = Arrays.copyOfRange(invalidAscii, i, i + VECTOR_SPECIES.vectorByteSize());
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(vectorChunk));

assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length))
.withMessage("Invalid UTF8");
}
}

Expand All @@ -76,10 +80,13 @@ void validate_continuationByteOutOfOrder_invalid() {
continuationByte++;
}


SimdJsonParser parser = new SimdJsonParser();
for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.length()) {
byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize());
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(vectorChunk));

assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length))
.withMessage("Invalid UTF8");
}
}

Expand All @@ -90,15 +97,21 @@ void validate_extraContinuationByte_2Byte_invalid() {
inputBytes[1] = (byte) 0b10_000000;
inputBytes[2] = (byte) 0b10_000000; // two byte lead should only have one continuation byte

assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}

/** Verifies that a two-byte lead byte with no continuation byte after it is rejected. */
@Test
void validate_continuationOneByteTooShort_2Byte_invalid() {
// The whole input is one byte: 0b110_00010, a two-byte sequence lead (110xxxxx) with nothing following it.
byte[] inputBytes = new byte[1];
inputBytes[0] = (byte) 0b110_00010;

// NOTE(review): this rendering shows the earlier direct call to Utf8Validator.validate next to the
// replacement that goes through SimdJsonParser.parse; the latter also pins the exception message.
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}

@Test
Expand All @@ -109,7 +122,10 @@ void validate_extraContinuationByte_3Byte_invalid() {
inputBytes[2] = (byte) 0b10_000000;
inputBytes[3] = (byte) 0b10_000000; // three byte lead should only have two continuation bytes

assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}

@Test
Expand All @@ -118,15 +134,21 @@ void validate_continuationOneByteTooShort_3Byte_invalid() {
inputBytes[0] = (byte) 0b1110_0000;
inputBytes[1] = (byte) 0b10_100000;

assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}

/** Verifies that a three-byte lead byte with neither of its two continuation bytes is rejected. */
@Test
void validate_continuationTwoBytesTooShort_3Byte_invalid() {
// The whole input is one byte: 0b1110_0000, a three-byte sequence lead (1110xxxx) with nothing following it.
byte[] inputBytes = new byte[1];
inputBytes[0] = (byte) 0b1110_0000;

// NOTE(review): this rendering shows the earlier direct call to Utf8Validator.validate next to the
// replacement that goes through SimdJsonParser.parse; the latter also pins the exception message.
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}

@Test
Expand All @@ -138,7 +160,10 @@ void validate_extraContinuationByte_4Byte_invalid() {
inputBytes[3] = (byte) 0b10_000000;
inputBytes[4] = (byte) 0b10_000000; // four byte lead should only have three continuation bytes

assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}

@Test
Expand All @@ -148,7 +173,10 @@ void validate_continuationOneByteTooShort_4Byte_invalid() {
inputBytes[1] = (byte) 0b10_010000;
inputBytes[2] = (byte) 0b10_000000;

assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}

@Test
Expand All @@ -157,15 +185,21 @@ void validate_continuationTwoBytesTooShort_4Byte_invalid() {
inputBytes[0] = (byte) 0b11110_000;
inputBytes[1] = (byte) 0b10_010000;

assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}

/** Verifies that a four-byte lead byte with none of its three continuation bytes is rejected. */
@Test
void validate_continuationThreeBytesTooShort_4Byte_invalid() {
// The whole input is one byte: 0b11110_000, a four-byte sequence lead (11110xxx) with nothing following it.
byte[] inputBytes = new byte[1];
inputBytes[0] = (byte) 0b11110_000;

// NOTE(review): this rendering shows the earlier direct call to Utf8Validator.validate next to the
// replacement that goes through SimdJsonParser.parse; the latter also pins the exception message.
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}


Expand Down Expand Up @@ -200,7 +234,7 @@ void validate_LatinCharacters_allValid() {
}
}

assertDoesNotThrow(() -> Utf8Validator.validate(inputBytes));
assertThatCode(() -> Utf8Validator.validate(inputBytes)).doesNotThrowAnyException();
}

@Test
Expand Down Expand Up @@ -228,9 +262,13 @@ void validate_overlong_2byte_invalid() {
}
}

SimdJsonParser parser = new SimdJsonParser();
for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.length()) {
byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize());
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(vectorChunk));

assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length))
.withMessage("Invalid UTF8");
}
}

Expand Down Expand Up @@ -265,9 +303,13 @@ void validate_overlong_3Byte_allInvalid() {
}
}

SimdJsonParser parser = new SimdJsonParser();
for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.length()) {
byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize());
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(vectorChunk));

assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length))
.withMessage("Invalid UTF8");
}
}

Expand Down Expand Up @@ -312,7 +354,7 @@ void validate_asiaticFirstSector_allValid() {
}
}

assertDoesNotThrow(() -> Utf8Validator.validate(inputBytes));
assertThatCode(() -> Utf8Validator.validate(inputBytes)).doesNotThrowAnyException();
}

/* code points in the range of U+D800 - U+DFFF (inclusive) are the surrogates for UTF-16.
Expand Down Expand Up @@ -345,9 +387,13 @@ void validate_surrogateCodePoints_invalid() {
}
}

SimdJsonParser parser = new SimdJsonParser();
for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.vectorByteSize()) {
byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize());
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(vectorChunk));

assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length))
.withMessage("Invalid UTF8");
}
}

Expand Down Expand Up @@ -387,7 +433,7 @@ void validate_asiaticSecondSector_allValid() {
}
}

assertDoesNotThrow(() -> Utf8Validator.validate(inputBytes));
assertThatCode(() -> Utf8Validator.validate(inputBytes)).doesNotThrowAnyException();
}


Expand Down Expand Up @@ -439,7 +485,7 @@ void validate_supplementary_allValid() {
}
}

assertDoesNotThrow(() -> Utf8Validator.validate(inputBytes));
assertThatCode(() -> Utf8Validator.validate(inputBytes)).doesNotThrowAnyException();
}

/* Overlong Test, the decoded character must be above U+FFFF / 11110_000 10_001111 10_111111 10_111111 */
Expand Down Expand Up @@ -476,10 +522,13 @@ void validate_overlong_4Byte_allInvalid() {
}
}


SimdJsonParser parser = new SimdJsonParser();
for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.vectorByteSize()) {
byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize());
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(vectorChunk));

assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length))
.withMessage("Invalid UTF8");
}
}

Expand Down Expand Up @@ -527,10 +576,13 @@ void validate_tooLarge_4Byte_allInvalid() {
}
}


SimdJsonParser parser = new SimdJsonParser();
for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.vectorByteSize()) {
byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize());
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(vectorChunk));

assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length))
.withMessage("Invalid UTF8");
}
}

Expand All @@ -545,7 +597,7 @@ void validate_continuationOneByteTooShort_2Byte_eof_invalid() {
inputBytes[vectorBytes - 1] = (byte) 0b110_00010;

ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, 0);
Utf8Validator.isIncomplete(utf8Vector);
assertThat(Utf8Validator.isIncomplete(utf8Vector)).isNotEqualTo(0L);
}

@Test
Expand All @@ -556,7 +608,7 @@ void validate_continuationOneByteTooShort_3Byte_eof_invalid() {
inputBytes[vectorBytes - 1] = (byte) 0b10_100000;

ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, 0);
assertNotEquals(0L, Utf8Validator.isIncomplete(utf8Vector));
assertThat(Utf8Validator.isIncomplete(utf8Vector)).isNotEqualTo(0L);
}

@Test
Expand All @@ -566,7 +618,7 @@ void validate_continuationTwoBytesTooShort_3Byte_eof_invalid() {
inputBytes[vectorBytes - 1] = (byte) 0b1110_0000;

ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, 0);
assertNotEquals(0L, Utf8Validator.isIncomplete(utf8Vector));
assertThat(Utf8Validator.isIncomplete(utf8Vector)).isNotEqualTo(0L);
}

@Test
Expand All @@ -579,7 +631,7 @@ void validate_continuationOneByteTooShort_4Byte_eof_invalid() {


ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, 0);
assertNotEquals(0L, Utf8Validator.isIncomplete(utf8Vector));
assertThat(Utf8Validator.isIncomplete(utf8Vector)).isNotEqualTo(0L);
}

@Test
Expand All @@ -590,7 +642,7 @@ void validate_continuationTwoBytesTooShort_4Byte_eof_invalid() {
inputBytes[vectorBytes - 1] = (byte) 0b10_010000;

ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, 0);
assertNotEquals(0L, Utf8Validator.isIncomplete(utf8Vector));
assertThat(Utf8Validator.isIncomplete(utf8Vector)).isNotEqualTo(0L);
}

@Test
Expand All @@ -600,7 +652,7 @@ void validate_continuationThreeBytesTooShort_4Byte_eof_invalid() {
inputBytes[vectorBytes - 1] = (byte) 0b11110_000;

ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, 0);
assertNotEquals(0L, Utf8Validator.isIncomplete(utf8Vector));
assertThat(Utf8Validator.isIncomplete(utf8Vector)).isNotEqualTo(0L);
}


Expand All @@ -610,12 +662,15 @@ void validate_continuationThreeBytesTooShort_4Byte_eof_invalid() {
@ValueSource(strings = {"/twitter.json", "/nhkworld.json", "/greek.txt", "/emoji-test.txt", "/amazon_cellphones.ndjson"})
// Runs the validator over the full contents of a classpath resource and expects no exception.
// inputFilePath is the resource path handed to getResourceAsStream; requireNonNull turns a
// missing resource into an immediate NullPointerException instead of a later failure.
void validate_utf8InputFiles_valid(String inputFilePath) throws IOException {
byte[] inputBytes = Objects.requireNonNull(Utf8ValidatorTest.class.getResourceAsStream(inputFilePath)).readAllBytes();
// NOTE(review): this rendering shows the JUnit assertion and its AssertJ replacement back to back;
// both assert that Utf8Validator.validate completes without throwing.
assertDoesNotThrow(() -> Utf8Validator.validate(inputBytes));
assertThatCode(() -> Utf8Validator.validate(inputBytes)).doesNotThrowAnyException();
}

/** Verifies that the contents of the {@code /malformed.txt} resource are rejected with a {@code JsonParsingException}. */
@Test
void validate_utf8InputFile_invalid() throws IOException {
// requireNonNull turns a missing resource into an immediate NullPointerException.
byte[] inputBytes = Objects.requireNonNull(Utf8ValidatorTest.class.getResourceAsStream("/malformed.txt")).readAllBytes();
// NOTE(review): this rendering shows the earlier direct call to Utf8Validator.validate next to the
// replacement that goes through SimdJsonParser.parse; the latter also pins the exception message.
assertThrowsExactly(JsonParsingException.class, () -> Utf8Validator.validate(inputBytes));
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
.withMessage("Invalid UTF8");
}
}
0