|
| 1 | +package org.simdjson; |
| 2 | + |
| 3 | +import jdk.incubator.vector.*; |
| 4 | + |
| 5 | +import java.util.Arrays; |
| 6 | + |
| 7 | +public class Utf8Validator { |
| 8 | + private static final VectorSpecies<Byte> VECTOR_SPECIES = ByteVector.SPECIES_256; |
| 9 | + private static final ByteVector INCOMPLETE_CHECK = getIncompleteCheck(); |
| 10 | + private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(IntVector.SPECIES_256, |
| 11 | + IntVector.SPECIES_256.elementSize() - 1, 1, true); |
| 12 | + private static final ByteVector LOW_NIBBLE_MASK = ByteVector.broadcast(VECTOR_SPECIES, 0b0000_1111); |
| 13 | + private static final ByteVector ALL_ASCII_MASK = ByteVector.broadcast(VECTOR_SPECIES, (byte) 0b1000_0000); |
| 14 | + |
| 15 | + /** |
| 16 | + * Validate the input bytes are valid UTF8 |
| 17 | + * |
| 18 | + * @param inputBytes the input bytes to validate |
| 19 | + * @throws JsonParsingException if the input is not valid UTF8 |
| 20 | + */ |
| 21 | + static void validate(byte[] inputBytes) { |
| 22 | + long previousIncomplete = 0; |
| 23 | + long errors = 0; |
| 24 | + int previousFourUtf8Bytes = 0; |
| 25 | + |
| 26 | + int idx = 0; |
| 27 | + for (; idx < VECTOR_SPECIES.loopBound(inputBytes.length); idx += VECTOR_SPECIES.vectorByteSize()) { |
| 28 | + ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx); |
| 29 | + // ASCII fast path can bypass the checks that are only required for multibyte code points |
| 30 | + if (isAscii(utf8Vector)) { |
| 31 | + errors |= previousIncomplete; |
| 32 | + } else { |
| 33 | + previousIncomplete = isIncomplete(utf8Vector); |
| 34 | + |
| 35 | + var fourBytesPrevious = fourBytesPreviousSlice(utf8Vector, previousFourUtf8Bytes); |
| 36 | + |
| 37 | + ByteVector firstCheck = firstTwoByteSequenceCheck(utf8Vector.reinterpretAsInts(), fourBytesPrevious); |
| 38 | + ByteVector secondCheck = lastTwoByteSequenceCheck(utf8Vector.reinterpretAsInts(), fourBytesPrevious, firstCheck); |
| 39 | + |
| 40 | + errors |= secondCheck.compare(VectorOperators.NE, 0).toLong(); |
| 41 | + } |
| 42 | + previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(IntVector.SPECIES_256.length() - 1); |
| 43 | + } |
| 44 | + |
| 45 | + // if the input file doesn't align with the vector width, pad the missing bytes with zero |
| 46 | + VectorMask<Byte> remainingBytes = VECTOR_SPECIES.indexInRange(idx, inputBytes.length); |
| 47 | + ByteVector lastVectorChunk = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx, remainingBytes); |
| 48 | + if (!isAscii(lastVectorChunk)) { |
| 49 | + previousIncomplete = isIncomplete(lastVectorChunk); |
| 50 | + |
| 51 | + var fourBytesPrevious = fourBytesPreviousSlice(lastVectorChunk, previousFourUtf8Bytes); |
| 52 | + |
| 53 | + ByteVector firstCheck = firstTwoByteSequenceCheck(lastVectorChunk.reinterpretAsInts(), fourBytesPrevious); |
| 54 | + ByteVector secondCheck = lastTwoByteSequenceCheck(lastVectorChunk.reinterpretAsInts(), fourBytesPrevious, firstCheck); |
| 55 | + |
| 56 | + errors |= secondCheck.compare(VectorOperators.NE, 0).toLong(); |
| 57 | + } |
| 58 | + |
| 59 | + if ((errors | previousIncomplete) != 0) { |
| 60 | + throw new JsonParsingException("Invalid UTF8"); |
| 61 | + } |
| 62 | + } |
| 63 | + |
| 64 | + /* Shuffles the input forward by four bytes to make space for the previous four bytes. |
| 65 | + The previous three bytes are required for validation, pulling in the last integer will give the previous four bytes. |
| 66 | + The switch to integer vectors is to allow for integer shifting instead of the more expensive shuffle / slice operations */ |
| 67 | + private static IntVector fourBytesPreviousSlice(ByteVector vectorChunk, int previousFourUtf8Bytes) { |
| 68 | + return vectorChunk.reinterpretAsInts() |
| 69 | + .rearrange(SHIFT_FOUR_BYTES_FORWARD) |
| 70 | + .withLane(0, previousFourUtf8Bytes); |
| 71 | + } |
| 72 | + |
| 73 | + // works similar to previousUtf8Vector.slice(VECTOR_SPECIES.length() - numOfBytesToInclude, utf8Vector) but without the performance cost |
| 74 | + private static ByteVector previousVectorSlice(IntVector utf8Vector, IntVector fourBytesPrevious, int numOfPreviousBytes) { |
| 75 | + return utf8Vector |
| 76 | + .lanewise(VectorOperators.LSHL, Byte.SIZE * numOfPreviousBytes) |
| 77 | + .or(fourBytesPrevious.lanewise(VectorOperators.LSHR, Byte.SIZE * (4 - numOfPreviousBytes))) |
| 78 | + .reinterpretAsBytes(); |
| 79 | + } |
| 80 | + |
| 81 | + private static ByteVector firstTwoByteSequenceCheck(IntVector utf8Vector, IntVector fourBytesPrevious) { |
| 82 | + // shift the current input forward by 1 byte to include 1 byte from the previous input |
| 83 | + var oneBytePrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 1); |
| 84 | + |
| 85 | + // high nibbles of the current input (e.g. 0xC3 >> 4 = 0xC) |
| 86 | + ByteVector byte2HighNibbles = utf8Vector.lanewise(VectorOperators.LSHR, 4) |
| 87 | + .reinterpretAsBytes().and(LOW_NIBBLE_MASK); |
| 88 | + |
| 89 | + // high nibbles of the shifted input |
| 90 | + ByteVector byte1HighNibbles = oneBytePrevious.reinterpretAsInts().lanewise(VectorOperators.LSHR, 4) |
| 91 | + .reinterpretAsBytes().and(LOW_NIBBLE_MASK); |
| 92 | + |
| 93 | + // low nibbles of the shifted input (e.g. 0xC3 & 0xF = 0x3) |
| 94 | + ByteVector byte1LowNibbles = oneBytePrevious.and(LOW_NIBBLE_MASK); |
| 95 | + |
| 96 | + ByteVector byte1HighState = byte1HighNibbles.selectFrom(LookupTable.byte1High); |
| 97 | + ByteVector byte1LowState = byte1LowNibbles.selectFrom(LookupTable.byte1Low); |
| 98 | + ByteVector byte2HighState = byte2HighNibbles.selectFrom(LookupTable.byte2High); |
| 99 | + |
| 100 | + return byte1HighState.and(byte1LowState).and(byte2HighState); |
| 101 | + } |
| 102 | + |
| 103 | + // All remaining checks are invalid 3–4 byte sequences, which either have too many continuations bytes or not enough |
| 104 | + private static ByteVector lastTwoByteSequenceCheck(IntVector utf8Vector, IntVector fourBytesPrevious, ByteVector firstCheck) { |
| 105 | + // the minimum 3byte lead - 1110_0000 is always greater than the max 2byte lead - 110_11111 |
| 106 | + ByteVector twoBytesPrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 2); |
| 107 | + VectorMask<Byte> is3ByteLead = twoBytesPrevious.compare(VectorOperators.UNSIGNED_GT, (byte) 0b110_11111); |
| 108 | + |
| 109 | + // the minimum 4byte lead - 1111_0000 is always greater than the max 3byte lead - 1110_1111 |
| 110 | + ByteVector threeBytesPrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 3); |
| 111 | + VectorMask<Byte> is4ByteLead = threeBytesPrevious.compare(VectorOperators.UNSIGNED_GT, (byte) 0b1110_1111); |
| 112 | + |
| 113 | + // the firstCheck vector contains 0x80 values on continuation byte indexes |
| 114 | + // the 3/4 byte lead bytes should match up with these indexes and zero them out |
| 115 | + return firstCheck.add((byte) 0x80, is3ByteLead.or(is4ByteLead)); |
| 116 | + } |
| 117 | + |
| 118 | + /* checks that the previous vector isn't in an incomplete state. |
| 119 | + Previous vector is in an incomplete state if the last byte is smaller than 0xC0, |
| 120 | + or the second last byte is smaller than 0xE0, or the third last byte is smaller than 0xF0.*/ |
| 121 | + private static ByteVector getIncompleteCheck() { |
| 122 | + int vectorBytes = VECTOR_SPECIES.vectorByteSize(); |
| 123 | + byte[] eofArray = new byte[vectorBytes]; |
| 124 | + Arrays.fill(eofArray, (byte) 255); |
| 125 | + eofArray[vectorBytes - 3] = (byte) 0xF0; |
| 126 | + eofArray[vectorBytes - 2] = (byte) 0xE0; |
| 127 | + eofArray[vectorBytes - 1] = (byte) 0xC0; |
| 128 | + return ByteVector.fromArray(VECTOR_SPECIES, eofArray, 0); |
| 129 | + } |
| 130 | + |
| 131 | + private static long isIncomplete(ByteVector utf8Vector) { |
| 132 | + return utf8Vector.compare(VectorOperators.UNSIGNED_GE, INCOMPLETE_CHECK).toLong(); |
| 133 | + } |
| 134 | + |
| 135 | + // ASCII will never exceed 01111_1111 |
| 136 | + private static boolean isAscii(ByteVector utf8Vector) { |
| 137 | + return utf8Vector.and(ALL_ASCII_MASK).compare(VectorOperators.EQ, 0).allTrue(); |
| 138 | + } |
| 139 | + |
| 140 | + private static class LookupTable { |
| 141 | + /* Bit 0 = Too Short (lead byte not followed by a continuation byte but by a lead/ASCII byte) |
| 142 | + e.g. 11______ 0_______ |
| 143 | + 11______ 11______ */ |
| 144 | + static final byte TOO_SHORT = 1; |
| 145 | + |
| 146 | + /* Bit 1 = Too Long (ASCII followed by continuation byte) |
| 147 | + e.g. 01111111 10_000000 */ |
| 148 | + static final byte TOO_LONG = 1 << 1; |
| 149 | + |
| 150 | + /* Bit 2 = Overlong 3-byte |
| 151 | + Any 3-byte sequence that could be represented by a shorter sequence |
| 152 | + Which is any sequence smaller than 1110_0000 10_100000 10_000000 */ |
| 153 | + static final byte OVERLONG_3BYTE = 1 << 2; |
| 154 | + |
| 155 | + /* Bit 3 = Too Large |
| 156 | + Any decoded codepoint greater than U+10FFFF |
| 157 | + e.g. 11110_100 10_010000 10_000000 10_000000 */ |
| 158 | + static final byte TOO_LARGE = 1 << 3; |
| 159 | + |
| 160 | + /* Bit 4 = Surrogate |
| 161 | + code points in the range of U+D800 - U+DFFF (inclusive) are the surrogates for UTF-16. |
| 162 | + These 2048 code points that are reserved for UTF-16 are disallowed in UTF-8 |
| 163 | + e.g. 1110_1101 10_100000 10_000000 */ |
| 164 | + static final byte SURROGATE = 1 << 4; |
| 165 | + |
| 166 | + /* Bit 5 = Overlong 2-byte |
| 167 | + first valid two byte sequence: 110_00010 10_000000 |
| 168 | + anything smaller is considered overlong as it would fit into a one byte sequence / ASCII */ |
| 169 | + static final byte OVERLONG_2BYTE = 1 << 5; |
| 170 | + |
| 171 | + /* Bit 6 = Too Large 1000 |
| 172 | + Similar to TOO_LARGE, but for cases where the continuation byte's high nibble is 1000 |
| 173 | + e.g. 11110_101 10_000000 10_000000 */ |
| 174 | + static final byte TOO_LARGE_1000 = 1 << 6; |
| 175 | + |
| 176 | + /* Bit 6 = Overlong 4-byte |
| 177 | + Any decoded code point below above U+FFFF / 11110_000 10_001111 10_111111 10_111111 |
| 178 | + e.g. 11110_000 10_000000 10_000000 10_000000 */ |
| 179 | + static final byte OVERLONG_4BYTE = 1 << 6; |
| 180 | + |
| 181 | + /* Bit 7 = Two Continuations |
| 182 | + e.g. 10_000000 10_000000 */ |
| 183 | + static final byte TWO_CONTINUATIONS = (byte) (1 << 7); |
| 184 | + |
| 185 | + private final static ByteVector byte1High = getByte1HighLookup(); |
| 186 | + private final static ByteVector byte1Low = getByte1LowLookup(); |
| 187 | + private final static ByteVector byte2High = getByte2HighLookup(); |
| 188 | + |
| 189 | + private static ByteVector getByte1HighLookup() { |
| 190 | + byte[] byte1HighArray = new byte[]{ |
| 191 | + /* ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table */ |
| 192 | + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, |
| 193 | + /* Continuation high nibble = 1000 -> 1011 */ |
| 194 | + TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS, |
| 195 | + /* Two byte lead high nibble = 1100 -> 1101 */ |
| 196 | + TOO_SHORT | OVERLONG_2BYTE, TOO_SHORT, |
| 197 | + /* Three byte lead high nibble = 1110 */ |
| 198 | + TOO_SHORT | OVERLONG_3BYTE | SURROGATE, |
| 199 | + /* Four byte lead high nibble = 1111 */ |
| 200 | + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4BYTE |
| 201 | + }; |
| 202 | + |
| 203 | + return alignArrayToVector(byte1HighArray); |
| 204 | + } |
| 205 | + |
| 206 | + private static ByteVector alignArrayToVector(byte[] arrayValues) { |
| 207 | + // pad array with zeroes to align up with vector size |
| 208 | + byte[] alignedArray = new byte[VECTOR_SPECIES.vectorByteSize()]; |
| 209 | + System.arraycopy(arrayValues, 0, alignedArray, 0, arrayValues.length); |
| 210 | + return ByteVector.fromArray(VECTOR_SPECIES, alignedArray, 0); |
| 211 | + } |
| 212 | + |
| 213 | + private static ByteVector getByte1LowLookup() { |
| 214 | + final byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTINUATIONS; |
| 215 | + byte[] byte1LowArray = new byte[]{ |
| 216 | + /* ASCII, two Byte lead and three byte lead low nibble = 0000 -> 1111, |
| 217 | + * Four byte lead low nibble = 0000 -> 0111 |
| 218 | + * Continuation byte low nibble is inconsequential |
| 219 | + * Low nibble does not affect the states TOO_SHORT, TOO_LONG, TWO_CONTINUATIONS, so they will be carried over regardless */ |
| 220 | + CARRY | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, |
| 221 | + // 0001 |
| 222 | + CARRY | OVERLONG_2BYTE, |
| 223 | + CARRY, |
| 224 | + CARRY, |
| 225 | + // 1111_0100 -> 1111 = TOO_LARGE range |
| 226 | + CARRY | TOO_LARGE, |
| 227 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 228 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 229 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 230 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 231 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 232 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 233 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 234 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 235 | + // 1110_1101 |
| 236 | + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, |
| 237 | + CARRY | TOO_LARGE | TOO_LARGE_1000, |
| 238 | + CARRY | TOO_LARGE | TOO_LARGE_1000 |
| 239 | + }; |
| 240 | + |
| 241 | + return alignArrayToVector(byte1LowArray); |
| 242 | + } |
| 243 | + |
| 244 | + private static ByteVector getByte2HighLookup() { |
| 245 | + byte[] byte2HighArray = new byte[]{ |
| 246 | + // ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table |
| 247 | + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, |
| 248 | + // Continuation high nibble - 1000 -> 1011 |
| 249 | + TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE | TOO_LARGE_1000, |
| 250 | + TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | TOO_LARGE, |
| 251 | + TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE, |
| 252 | + TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE, |
| 253 | + // 1100 -> 1111 = unexpected lead byte |
| 254 | + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT |
| 255 | + }; |
| 256 | + |
| 257 | + return alignArrayToVector(byte2HighArray); |
| 258 | + } |
| 259 | + } |
| 260 | +} |
0 commit comments