Added UTF-8 validation by Nostimo · Pull Request #13 · simdjson/simdjson-java · GitHub
[go: up one dir, main page]

Skip to content

Added UTF-8 validation #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 26, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
moved methods to private, updated tests to run all supported vector widths, code cleanup
  • Loading branch information
Nostimo committed Nov 15, 2023
commit 7c2b4c484a6d355a10033cdb1bc3120bbe028143
17 changes: 7 additions & 10 deletions src/main/java/org/simdjson/Utf8Validator.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@ public class Utf8Validator {
private static final ByteVector LOW_NIBBLE_MASK = ByteVector.broadcast(VECTOR_SPECIES, 0b0000_1111);
private static final ByteVector ALL_ASCII_MASK = ByteVector.broadcast(VECTOR_SPECIES, (byte) 0b1000_0000);


/**
* Validates that the input bytes are valid UTF-8.
*
* @param inputBytes the input bytes to validate
* @throws JsonParsingException if the input is not valid UTF8
*/
public static void validate(byte[] inputBytes) {
static void validate(byte[] inputBytes) {
long previousIncomplete = 0;
long errors = 0;
int previousFourUtf8Bytes = 0;
Expand All @@ -46,9 +45,7 @@ public static void validate(byte[] inputBytes) {
// if the input file doesn't align with the vector width, pad the missing bytes with zero
VectorMask<Byte> remainingBytes = VECTOR_SPECIES.indexInRange(idx, inputBytes.length);
ByteVector lastVectorChunk = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx, remainingBytes);
if (isAscii(lastVectorChunk)) {
errors |= previousIncomplete;
} else {
if (!isAscii(lastVectorChunk)) {
previousIncomplete = isIncomplete(lastVectorChunk);

var fourBytesPrevious = fourBytesPreviousSlice(lastVectorChunk, previousFourUtf8Bytes);
Expand Down Expand Up @@ -96,9 +93,9 @@ private static ByteVector firstTwoByteSequenceCheck(IntVector utf8Vector, IntVec
// low nibbles of the shifted input (e.g. 0xC3 & 0xF = 0x3)
ByteVector byte1LowNibbles = oneBytePrevious.and(LOW_NIBBLE_MASK);

ByteVector byte1HighState = byte2HighNibbles.selectFrom(LookupTable.byte2High);
ByteVector byte1LowState = byte1HighNibbles.selectFrom(LookupTable.byte1High);
ByteVector byte2HighState = byte1LowNibbles.selectFrom(LookupTable.byte1Low);
ByteVector byte1HighState = byte1HighNibbles.selectFrom(LookupTable.byte1High);
ByteVector byte1LowState = byte1LowNibbles.selectFrom(LookupTable.byte1Low);
ByteVector byte2HighState = byte2HighNibbles.selectFrom(LookupTable.byte2High);

return byte1HighState.and(byte1LowState).and(byte2HighState);
}
Expand Down Expand Up @@ -131,12 +128,12 @@ private static ByteVector getIncompleteCheck() {
return ByteVector.fromArray(VECTOR_SPECIES, eofArray, 0);
}

protected static long isIncomplete(ByteVector utf8Vector) {
private static long isIncomplete(ByteVector utf8Vector) {
return utf8Vector.compare(VectorOperators.UNSIGNED_GE, INCOMPLETE_CHECK).toLong();
}

// ASCII bytes never exceed 0111_1111, i.e. the high bit (1000_0000) is always clear
protected static boolean isAscii(ByteVector utf8Vector) {
private static boolean isAscii(ByteVector utf8Vector) {
return utf8Vector.and(ALL_ASCII_MASK).compare(VectorOperators.EQ, 0).allTrue();
}

Expand Down
6 changes: 3 additions & 3 deletions src/test/java/org/simdjson/Utf8ValidatorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import static org.assertj.core.api.Assertions.*;

class Utf8ValidatorTest {
private static final VectorSpecies<Byte> VECTOR_SPECIES = ByteVector.SPECIES_256;
private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.SPECIES;


/* ASCII / 1 BYTE TESTS */
Expand Down Expand Up @@ -482,14 +482,14 @@ void validate_continuationThreeBytesTooShort_4Byte_eof_invalid() {
@ParameterizedTest
@ValueSource(strings = {"/twitter.json", "/nhkworld.json"})
void validate_utf8InputFiles_valid(String inputFilePath) throws IOException {
byte[] inputBytes = Objects.requireNonNull(Utf8ValidatorTest.class.getResourceAsStream(inputFilePath)).readAllBytes();
byte[] inputBytes = TestUtils.loadTestFile(inputFilePath);
SimdJsonParser parser = new SimdJsonParser();
assertThatCode(() -> parser.parse(inputBytes, inputBytes.length)).doesNotThrowAnyException();
}

@Test
void validate_utf8InputFile_invalid() throws IOException {
byte[] inputBytes = Objects.requireNonNull(Utf8ValidatorTest.class.getResourceAsStream("/malformed.txt")).readAllBytes();
byte[] inputBytes = TestUtils.loadTestFile("/malformed.txt");
SimdJsonParser parser = new SimdJsonParser();
assertThatExceptionOfType(JsonParsingException.class)
.isThrownBy(() -> parser.parse(inputBytes, inputBytes.length))
Expand Down
0