8000 Added UTF-8 validation (#13) · aaron-ds/simdjson-java@bc5e14d · GitHub
[go: up one dir, main page]

Skip to content

Commit bc5e14d

Browse files
authored
Added UTF-8 validation (simdjson#13)
1 parent 5289fa0 commit bc5e14d

File tree

7 files changed

+799
-0
lines changed

7 files changed

+799
-0
lines changed

build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ dependencies {
5151
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.35'
5252
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'
5353
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
54+
jmhImplementation group: 'com.google.guava', name: 'guava', version: '32.1.2-jre'
5455
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion
5556

5657
testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package org.simdjson;
2+
3+
import com.google.common.base.Utf8;
4+
import org.openjdk.jmh.annotations.*;
5+
6+
import java.io.IOException;
7+
import java.io.InputStream;
8+
import java.util.concurrent.TimeUnit;
9+
10+
@State(Scope.Benchmark)
11+
@BenchmarkMode(Mode.Throughput)
12+
@OutputTimeUnit(TimeUnit.SECONDS)
13+
public class Utf8ValidatorBenchmark {
14+
@Param({"/twitter.json", "/gsoc-2018.json", "/github_events.json"})
15+
String fileName;
16+
byte[] bytes;
17+
18+
@Setup(Level.Trial)
19+
public void setup() throws IOException {
20+
try (InputStream is = Utf8ValidatorBenchmark.class.getResourceAsStream(fileName)) {
21+
bytes = is.readAllBytes();
22+
}
23+
}
24+
25+
@Benchmark
26+
public void utf8Validator() {
27+
Utf8Validator.validate(bytes);
28+
}
29+
30+
@Benchmark
31+
public boolean guava() {
32+
return Utf8.isWellFormed(bytes);
33+
}
34+
}

src/main/java/org/simdjson/SimdJsonParser.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ public SimdJsonParser(int capacity, int maxDepth) {
2626
}
2727

2828
public JsonValue parse(byte[] buffer, int len) {
29+
stage0(buffer);
2930
byte[] padded = padIfNeeded(buffer, len);
3031
reset(padded, len);
3132
stage1(padded);
@@ -47,6 +48,10 @@ private void reset(byte[] buffer, int len) {
4748
jsonIterator.reset();
4849
}
4950

51+
private void stage0(byte[] buffer) {
52+
Utf8Validator.validate(buffer);
53+
}
54+
5055
private void stage1(byte[] buffer) {
5156
while (reader.hasFullBlock()) {
5257
int blockIndex = reader.getBlockIndex();
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
package org.simdjson;
2+
3+
import jdk.incubator.vector.*;
4+
5+
import java.util.Arrays;
6+
7+
public class Utf8Validator {
8+
private static final VectorSpecies<Byte> VECTOR_SPECIES = ByteVector.SPECIES_256;
9+
private static final ByteVector INCOMPLETE_CHECK = getIncompleteCheck();
10+
private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(IntVector.SPECIES_256,
11+
IntVector.SPECIES_256.elementSize() - 1, 1, true);
12+
private static final ByteVector LOW_NIBBLE_MASK = ByteVector.broadcast(VECTOR_SPECIES, 0b0000_1111);
13+
private static final ByteVector ALL_ASCII_MASK = ByteVector.broadcast(VECTOR_SPECIES, (byte) 0b1000_0000);
14+
15+
/**
16+
* Validate the input bytes are valid UTF8
17+
*
18+
* @param inputBytes the input bytes to validate
19+
* @throws JsonParsingException if the input is not valid UTF8
20+
*/
21+
static void validate(byte[] inputBytes) {
22+
long previousIncomplete = 0;
23+
long errors = 0;
24+
int previousFourUtf8Bytes = 0;
25+
26+
int idx = 0;
27+
for (; idx < VECTOR_SPECIES.loopBound(inputBytes.length); idx += VECTOR_SPECIES.vectorByteSize()) {
28+
ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx);
29+
// ASCII fast path can bypass the checks that are only required for multibyte code points
30+
if (isAscii(utf8Vector)) {
31+
errors |= previousIncomplete;
32+
} else {
33+
previousIncomplete = isIncomplete(utf8Vector);
34+
35+
var fourBytesPrevious = fourBytesPreviousSlice(utf8Vector, previousFourUtf8Bytes);
36+
37+
ByteVector firstCheck = firstTwoByteSequenceCheck(utf8Vector.reinterpretAsInts(), fourBytesPrevious);
38+
ByteVector secondCheck = lastTwoByteSequenceCheck(utf8Vector.reinterpretAsInts(), fourBytesPrevious, firstCheck);
39+
40+
errors |= secondCheck.compare(VectorOperators.NE, 0).toLong();
41+
}
42+
previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(IntVector.SPECIES_256.length() - 1);
43+
}
44+
45+
// if the input file doesn't align with the vector width, pad the missing bytes with zero
46+
VectorMask<Byte> remainingBytes = VECTOR_SPECIES.indexInRange(idx, inputBytes.length);
47+
ByteVector lastVectorChunk = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx, remainingBytes);
48+
if (!isAscii(lastVectorChunk)) {
49+
previousIncomplete = isIncomplete(lastVectorChunk);
50+
51+
var fourBytesPrevious = fourBytesPreviousSlice(lastVectorChunk, previousFourUtf8Bytes);
52+
53+
ByteVector firstCheck = firstTwoByteSequenceCheck(lastVectorChunk.reinterpretAsInts(), fourBytesPrevious);
54+
ByteVector secondCheck = lastTwoByteSequenceCheck(lastVectorChunk.reinterpretAsInts(), fourBytesPrevious, firstCheck);
55+
56+
errors |= secondCheck.compare(VectorOperators.NE, 0).toLong();
57+
}
58+
59+
if ((errors | previousIncomplete) != 0) {
60+
throw new JsonParsingException("Invalid UTF8");
61+
}
62+
}
63+
64+
/* Shuffles the input forward by four bytes to make space for the previous four bytes.
65+
The previous three bytes are required for validation, pulling in the last integer will give the previous four bytes.
66+
The switch to integer vectors is to allow for integer shifting instead of the more expensive shuffle / slice operations */
67+
private static IntVector fourBytesPreviousSlice(ByteVector vectorChunk, int previousFourUtf8Bytes) {
68+
return vectorChunk.reinterpretAsInts()
69+
.rearrange(SHIFT_FOUR_BYTES_FORWARD)
70+
.withLane(0, previousFourUtf8Bytes);
71+
}
72+
73+
// works similar to previousUtf8Vector.slice(VECTOR_SPECIES.length() - numOfBytesToInclude, utf8Vector) but without the performance cost
74+
private static ByteVector previousVectorSlice(IntVector utf8Vector, IntVector fourBytesPrevious, int numOfPreviousBytes) {
75+
return utf8Vector
76+
.lanewise(VectorOperators.LSHL, Byte.SIZE * numOfPreviousBytes)
77+
.or(fourBytesPrevious.lanewise(VectorOperators.LSHR, Byte.SIZE * (4 - numOfPreviousBytes)))
78+
.reinterpretAsBytes();
79+
}
80+
81+
private static ByteVector firstTwoByteSequenceCheck(IntVector utf8Vector, IntVector fourBytesPrevious) {
82+
// shift the current input forward by 1 byte to include 1 byte from the previous input
83+
var oneBytePrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 1);
84+
85+
// high nibbles of the current input (e.g. 0xC3 >> 4 = 0xC)
86+
ByteVector byte2HighNibbles = utf8Vector.lanewise(VectorOperators.LSHR, 4)
87+
.reinterpretAsBytes().and(LOW_NIBBLE_MASK);
88+
89+
// high nibbles of the shifted input
90+
ByteVector byte1HighNibbles = oneBytePrevious.reinterpretAsInts().lanewise(VectorOperators.LSHR, 4)
91+
.reinterpretAsBytes().and(LOW_NIBBLE_MASK);
92+
93+
// low nibbles of the shifted input (e.g. 0xC3 & 0xF = 0x3)
94+
ByteVector byte1LowNibbles = oneBytePrevious.and(LOW_NIBBLE_MASK);
95+
96+
ByteVector byte1HighState = byte1HighNibbles.selectFrom(LookupTable.byte1High);
97+
ByteVector byte1LowState = byte1LowNibbles.selectFrom(LookupTable.byte1Low);
98+
ByteVector byte2HighState = byte2HighNibbles.selectFrom(LookupTable.byte2High);
99+
100+
return byte1HighState.and(byte1LowState).and(byte2HighState);
101+
}
102+
103+
// All remaining checks are invalid 3–4 byte sequences, which either have too many continuations bytes or not enough
104+
private static ByteVector lastTwoByteSequenceCheck(IntVector utf8Vector, IntVector fourBytesPrevious, ByteVector firstCheck) {
105+
// the minimum 3byte lead - 1110_0000 is always greater than the max 2byte lead - 110_11111
106+
ByteVector twoBytesPrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 2);
107+
VectorMask<Byte> is3ByteLead = twoBytesPrevious.compare(VectorOperators.UNSIGNED_GT, (byte) 0b110_11111);
108+
109+
// the minimum 4byte lead - 1111_0000 is always greater than the max 3byte lead - 1110_1111
110+
ByteVector threeBytesPrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 3);
111+
VectorMask<Byte> is4ByteLead = threeBytesPrevious.compare(VectorOperators.UNSIGNED_GT, (byte) 0b1110_1111);
112+
113+
// the firstCheck vector contains 0x80 values on continuation byte indexes
114+
// the 3/4 byte lead bytes should match up with these indexes and zero them out
115+
return firstCheck.add((byte) 0x80, is3ByteLead.or(is4ByteLead));
116+
}
117+
118+
/* checks that the previous vector isn't in an incomplete state.
119+
Previous vector is in an incomplete state if the last byte is smaller than 0xC0,
120+
or the second last byte is smaller than 0xE0, or the third last byte is smaller than 0xF0.*/
121+
private static ByteVector getIncompleteCheck() {
122+
int vectorBytes = VECTOR_SPECIES.vectorByteSize();
123+
byte[] eofArray = new byte[vectorBytes];
124+
Arrays.fill(eofArray, (byte) 255);
125+
eofArray[vectorBytes - 3] = (byte) 0xF0;
126+
eofArray[vectorBytes - 2] = (byte) 0xE0;
127+
eofArray[vectorBytes - 1] = (byte) 0xC0;
128+
return ByteVector.fromArray(VECTOR_SPECIES, eofArray, 0);
129+
}
130+
131+
private static long isIncomplete(ByteVector utf8Vector) {
132+
return utf8Vector.compare(VectorOperators.UNSIGNED_GE, INCOMPLETE_CHECK).toLong();
133+
}
134+
135+
// ASCII will never exceed 01111_1111
136+
private static boolean isAscii(ByteVector utf8Vector) {
137+
return utf8Vector.and(ALL_ASCII_MASK).compare(VectorOperators.EQ, 0).allTrue();
138+
}
139+
140+
private static class LookupTable {
141+
/* Bit 0 = Too Short (lead byte not followed by a continuation byte but by a lead/ASCII byte)
142+
e.g. 11______ 0_______
143+
11______ 11______ */
144+
static final byte TOO_SHORT = 1;
145+
146+
/* Bit 1 = Too Long (ASCII followed by continuation byte)
147+
e.g. 01111111 10_000000 */
148+
static final byte TOO_LONG = 1 << 1;
149+
150+
/* Bit 2 = Overlong 3-byte
151+
Any 3-byte sequence that could be represented by a shorter sequence
152+
Which is any sequence smaller than 1110_0000 10_100000 10_000000 */
153+
static final byte OVERLONG_3BYTE = 1 << 2;
154+
155+
/* Bit 3 = Too Large
156+
Any decoded codepoint greater than U+10FFFF
157+
e.g. 11110_100 10_010000 10_000000 10_000000 */
158+
static final byte TOO_LARGE = 1 << 3;
159+
160+
/* Bit 4 = Surrogate
161+
code points in the range of U+D800 - U+DFFF (inclusive) are the surrogates for UTF-16.
162+
These 2048 code points that are reserved for UTF-16 are disallowed in UTF-8
163+
e.g. 1110_1101 10_100000 10_000000 */
164+
static final byte SURROGATE = 1 << 4;
165+
166+
/* Bit 5 = Overlong 2-byte
167+
first valid two byte sequence: 110_00010 10_000000
168+
anything smaller is considered overlong as it would fit into a one byte sequence / ASCII */
169+
static final byte OVERLONG_2BYTE = 1 << 5;
170+
171+
/* Bit 6 = Too Large 1000
172+
Similar to TOO_LARGE, but for cases where the continuation byte's high nibble is 1000
173+
e.g. 11110_101 10_000000 10_000000 */
174+
static final byte TOO_LARGE_1000 = 1 << 6;
175+
176+
/* Bit 6 = Overlong 4-byte
177+
Any decoded code point below above U+FFFF / 11110_000 10_001111 10_111111 10_111111
178+
e.g. 11110_000 10_000000 10_000000 10_000000 */
179+
static final byte OVERLONG_4BYTE = 1 << 6;
180+
181+
/* Bit 7 = Two Continuations
182+
e.g. 10_000000 10_000000 */
183+
static final byte TWO_CONTINUATIONS = (byte) (1 << 7);
184+
185+
private final static ByteVector byte1High = getByte1HighLookup();
186+
private final static ByteVector byte1Low = getByte1LowLookup();
187+
private final static ByteVector byte2High = getByte2HighLookup();
188+
189+
private static ByteVector getByte1HighLookup() {
190+
byte[] byte1HighArray = new byte[]{
191+
/* ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table */
192+
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
193+
/* Continuation high nibble = 1000 -> 1011 */
194+
TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS,
195+
/* Two byte lead high nibble = 1100 -> 1101 */
196+
TOO_SHORT | OVERLONG_2BYTE, TOO_SHORT,
197+
/* Three byte lead high nibble = 1110 */
198+
TOO_SHORT | OVERLONG_3BYTE | SURROGATE,
199+
/* Four byte lead high nibble = 1111 */
200+
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4BYTE
201+
};
202+
203+
return alignArrayToVector(byte1HighArray);
204+
}
205+
206+
private static ByteVector alignArrayToVector(byte[] arrayValues) {
207+
// pad array with zeroes to align up with vector size
208+
byte[] alignedArray = new byte[VECTOR_SPECIES.vectorByteSize()];
209+
System.arraycopy(arrayValues, 0, alignedArray, 0, arrayValues.length);
210+
return ByteVector.fromArray(VECTOR_SPECIES, alignedArray, 0);
211+
}
212+
213+
private static ByteVector getByte1LowLookup() {
214+
final byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTINUATIONS;
215+
byte[] byte1LowArray = new byte[]{
216+
/* ASCII, two Byte lead and three byte lead low nibble = 0000 -> 1111,
217+
* Four byte lead low nibble = 0000 -> 0111
218+
* Continuation byte low nibble is inconsequential
219+
* Low nibble does not affect the states TOO_SHORT, TOO_LONG, TWO_CONTINUATIONS, so they will be carried over regardless */
220+
CARRY | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
221+
// 0001
222+
CARRY | OVERLONG_2BYTE,
223+
CARRY,
224+
CARRY,
225+
// 1111_0100 -> 1111 = TOO_LARGE range
226+
CARRY | TOO_LARGE,
227+
CARRY | TOO_LARGE | TOO_LARGE_1000,
228+
CARRY | TOO_LARGE | TOO_LARGE_1000,
229+
CARRY | TOO_LARGE | TOO_LARGE_1000,
230+
CARRY | TOO_LARGE | TOO_LARGE_1000,
231+
CARRY | TOO_LARGE | TOO_LARGE_1000,
232+
CARRY | TOO_LARGE | TOO_LARGE_1000,
233+
CARRY | TOO_LARGE | TOO_LARGE_1000,
234+
CARRY | TOO_LARGE | TOO_LARGE_1000,
235+
// 1110_1101
236+
CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
237+
CARRY | TOO_LARGE | TOO_LARGE_1000,
238+
CARRY | TOO_LARGE | TOO_LARGE_1000
239+
};
240+
241+
return alignArrayToVector(byte1LowArray);
242+
}
243+
244+
private static ByteVector getByte2HighLookup() {
245+
byte[] byte2HighArray = new byte[]{
246+
// ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table
247+
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
248+
// Continuation high nibble - 1000 -> 1011
249+
TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE | TOO_LARGE_1000,
250+
TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | TOO_LARGE,
251+
TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE,
252+
TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE,
253+
// 1100 -> 1111 = unexpected lead byte
254+
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
255+
};
256+
257+
return alignArrayToVector(byte2HighArray);
258+
}
259+
}
260+
}

0 commit comments

Comments
 (0)
0