diff --git a/build.gradle b/build.gradle index 8fb175d..60e5f4b 100644 --- a/build.gradle +++ b/build.gradle @@ -136,24 +136,20 @@ jmh { jvmArgsPrepend = [ '--add-modules=jdk.incubator.vector' ] - if (getBooleanProperty('jmh.profilersEnabled', false)) { - createDirIfDoesNotExist('./profilers') - if (OperatingSystem.current().isLinux()) { - def profilerList = [ - 'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('LD_LIBRARY_PATH') - ] - if (getBooleanProperty('jmh.jitLogEnabled', false)) { - createDirIfDoesNotExist('./profilers/perfasm') - profilerList += [ - 'perfasm:intelSyntax=true;saveLog=true;saveLogTo=./profilers/perfasm' - ] - } - profilers = profilerList - } else if (OperatingSystem.current().isMacOsX()) { - profilers = [ - 'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('DYLD_LIBRARY_PATH') - ] + if (OperatingSystem.current().isLinux()) { + def profilerList = [] + if (getBooleanProperty('jmh.asyncProfilerEnabled', false)) { + createDirIfDoesNotExist('./profilers/async') + profilerList += ['async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('LD_LIBRARY_PATH')] + } + if (getBooleanProperty('jmh.perfAsmEnabled', false)) { + createDirIfDoesNotExist('./profilers/perfasm') + profilerList += ['perfasm:intelSyntax=true;saveLog=true;saveLogTo=./profilers/perfasm'] + } + if (getBooleanProperty('jmh.perfEnabled', false)) { + profilerList += ['perf'] } + profilers = profilerList } if (project.hasProperty('jmh.includes')) { includes = [project.findProperty('jmh.includes')] diff --git a/src/jmh/java/org/simdjson/ParseAndSelectBenchmark.java b/src/jmh/java/org/simdjson/ParseAndSelectBenchmark.java index fcb056f..1cae0b1 100644 --- a/src/jmh/java/org/simdjson/ParseAndSelectBenchmark.java +++ b/src/jmh/java/org/simdjson/ParseAndSelectBenchmark.java @@ -39,7 +39,7 @@ public void setup() throws IOException { buffer = is.readAllBytes(); bufferPadded 
= padded(buffer); } - System.out.println("VectorSpecies = " + StructuralIndexer.BYTE_SPECIES); + System.out.println("VectorSpecies = " + VectorUtils.BYTE_SPECIES); } @Benchmark diff --git a/src/jmh/java/org/simdjson/SchemaBasedParseAndSelectBenchmark.java b/src/jmh/java/org/simdjson/SchemaBasedParseAndSelectBenchmark.java index a001e3f..cdd98f9 100644 --- a/src/jmh/java/org/simdjson/SchemaBasedParseAndSelectBenchmark.java +++ b/src/jmh/java/org/simdjson/SchemaBasedParseAndSelectBenchmark.java @@ -41,7 +41,7 @@ public void setup() throws IOException { buffer = is.readAllBytes(); bufferPadded = padded(buffer); } - System.out.println("VectorSpecies = " + StructuralIndexer.BYTE_SPECIES); + System.out.println("VectorSpecies = " + VectorUtils.BYTE_SPECIES); } @Benchmark diff --git a/src/jmh/java/org/simdjson/Utf8ValidatorBenchmark.java b/src/jmh/java/org/simdjson/Utf8ValidatorBenchmark.java index 51a6948..7661d38 100644 --- a/src/jmh/java/org/simdjson/Utf8ValidatorBenchmark.java +++ b/src/jmh/java/org/simdjson/Utf8ValidatorBenchmark.java @@ -1,7 +1,15 @@ package org.simdjson; import com.google.common.base.Utf8; -import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; import java.io.IOException; import java.io.InputStream; @@ -11,6 +19,7 @@ @BenchmarkMode(Mode.Throughput) @OutputTimeUnit(TimeUnit.SECONDS) public class Utf8ValidatorBenchmark { + @Param({"/twitter.json", "/gsoc-2018.json", "/github_events.json"}) String fileName; byte[] bytes; @@ -24,7 +33,7 @@ public void setup() throws IOException { @Benchmark public void utf8Validator() { - Utf8Validator.validate(bytes); + 
Utf8Validator.validate(bytes, bytes.length); } @Benchmark diff --git a/src/main/java/org/simdjson/BlockReader.java b/src/main/java/org/simdjson/BlockReader.java deleted file mode 100644 index 4567386..0000000 --- a/src/main/java/org/simdjson/BlockReader.java +++ /dev/null @@ -1,49 +0,0 @@ -package org.simdjson; - -import java.util.Arrays; - -class BlockReader { - - private static final byte SPACE = 0x20; - - private final int stepSize; - private final byte[] lastBlock; - private final byte[] spaces; - - private byte[] buffer; - private int len; - private int idx = 0; - private int lenMinusStep; - - BlockReader(int stepSize) { - this.stepSize = stepSize; - this.lastBlock = new byte[stepSize]; - this.spaces = new byte[stepSize]; - Arrays.fill(spaces, SPACE); - } - - void reset(byte[] buffer, int len) { - this.idx = 0; - this.len = len; - this.buffer = buffer; - this.lenMinusStep = len < stepSize ? 0 : len - stepSize; - } - - boolean hasFullBlock() { - return idx < lenMinusStep; - } - - byte[] remainder() { - System.arraycopy(spaces, 0, lastBlock, 0, lastBlock.length); - System.arraycopy(buffer, idx, lastBlock, 0, len - idx); - return lastBlock; - } - - void advance() { - idx += stepSize; - } - - int getBlockIndex() { - return idx; - } -} diff --git a/src/main/java/org/simdjson/CharactersClassifier.java b/src/main/java/org/simdjson/CharactersClassifier.java deleted file mode 100644 index 68b685c..0000000 --- a/src/main/java/org/simdjson/CharactersClassifier.java +++ /dev/null @@ -1,66 +0,0 @@ -package org.simdjson; - -import jdk.incubator.vector.ByteVector; -import jdk.incubator.vector.VectorShuffle; - -class CharactersClassifier { - - private static final byte LOW_NIBBLE_MASK = 0x0f; - - private static final ByteVector WHITESPACE_TABLE = - ByteVector.fromArray( - StructuralIndexer.BYTE_SPECIES, - repeat(new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4), - 0); - - private 
static final ByteVector OP_TABLE = - ByteVector.fromArray( - StructuralIndexer.BYTE_SPECIES, - repeat(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4), - 0); - - private static byte[] repeat(byte[] array, int n) { - byte[] result = new byte[n * array.length]; - for (int dst = 0; dst < result.length; dst += array.length) { - System.arraycopy(array, 0, result, dst, array.length); - } - return result; - } - - JsonCharacterBlock classify(ByteVector chunk0) { - VectorShuffle chunk0Low = extractLowNibble(chunk0).toShuffle(); - long whitespace = eq(chunk0, WHITESPACE_TABLE.rearrange(chunk0Low)); - ByteVector curlified0 = curlify(chunk0); - long op = eq(curlified0, OP_TABLE.rearrange(chunk0Low)); - return new JsonCharacterBlock(whitespace, op); - } - - JsonCharacterBlock classify(ByteVector chunk0, ByteVector chunk1) { - VectorShuffle chunk0Low = extractLowNibble(chunk0).toShuffle(); - VectorShuffle chunk1Low = extractLowNibble(chunk1).toShuffle(); - long whitespace = eq(chunk0, WHITESPACE_TABLE.rearrange(chunk0Low), chunk1, WHITESPACE_TABLE.rearrange(chunk1Low)); - ByteVector curlified0 = curlify(chunk0); - ByteVector curlified1 = curlify(chunk1); - long op = eq(curlified0, OP_TABLE.rearrange(chunk0Low), curlified1, OP_TABLE.rearrange(chunk1Low)); - return new JsonCharacterBlock(whitespace, op); - } - - private ByteVector extractLowNibble(ByteVector vector) { - return vector.and(LOW_NIBBLE_MASK); - } - - private ByteVector curlify(ByteVector vector) { - // turns [ into { and ] into } - return vector.or((byte) 0x20); - } - - private long eq(ByteVector chunk0, ByteVector mask0) { - return chunk0.eq(mask0).toLong(); - } - - private long eq(ByteVector chunk0, ByteVector mask0, ByteVector chunk1, ByteVector mask1) { - long r0 = chunk0.eq(mask0).toLong(); - long r1 = chunk1.eq(mask1).toLong(); - return r0 | (r1 << 32); - } -} diff --git a/src/main/java/org/simdjson/JsonCharacterBlock.java 
b/src/main/java/org/simdjson/JsonCharacterBlock.java deleted file mode 100644 index b99db20..0000000 --- a/src/main/java/org/simdjson/JsonCharacterBlock.java +++ /dev/null @@ -1,8 +0,0 @@ -package org.simdjson; - -record JsonCharacterBlock(long whitespace, long op) { - - long scalar() { - return ~(op | whitespace); - } -} diff --git a/src/main/java/org/simdjson/JsonStringBlock.java b/src/main/java/org/simdjson/JsonStringBlock.java deleted file mode 100644 index d806681..0000000 --- a/src/main/java/org/simdjson/JsonStringBlock.java +++ /dev/null @@ -1,12 +0,0 @@ -package org.simdjson; - -record JsonStringBlock(long quote, long inString) { - - long stringTail() { - return inString ^ quote; - } - - long nonQuoteInsideString(long mask) { - return mask & inString; - } -} diff --git a/src/main/java/org/simdjson/JsonStringScanner.java b/src/main/java/org/simdjson/JsonStringScanner.java deleted file mode 100644 index 6d856ac..0000000 --- a/src/main/java/org/simdjson/JsonStringScanner.java +++ /dev/null @@ -1,90 +0,0 @@ -package org.simdjson; - -import jdk.incubator.vector.ByteVector; - -class JsonStringScanner { - - private static final long EVEN_BITS_MASK = 0x5555555555555555L; - private static final long ODD_BITS_MASK = ~EVEN_BITS_MASK; - - private final ByteVector backslashMask; - private final ByteVector quoteMask; - - private long prevInString = 0; - private long prevEscaped = 0; - - JsonStringScanner() { - this.backslashMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '\\'); - this.quoteMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '"'); - } - - JsonStringBlock next(ByteVector chunk0) { - long backslash = eq(chunk0, backslashMask); - long escaped = findEscaped(backslash); - long quote = eq(chunk0, quoteMask) & ~escaped; - long inString = prefixXor(quote) ^ prevInString; - prevInString = inString >> 63; - return new JsonStringBlock(quote, inString); - } - - JsonStringBlock next(ByteVector chunk0, ByteVector chunk1) { - long 
backslash = eq(chunk0, chunk1, backslashMask); - long escaped = findEscaped(backslash); - long quote = eq(chunk0, chunk1, quoteMask) & ~escaped; - long inString = prefixXor(quote) ^ prevInString; - prevInString = inString >> 63; - return new JsonStringBlock(quote, inString); - } - - private long eq(ByteVector chunk0, ByteVector mask) { - long r = chunk0.eq(mask).toLong(); - return r; - } - - private long eq(ByteVector chunk0, ByteVector chunk1, ByteVector mask) { - long r0 = chunk0.eq(mask).toLong(); - long r1 = chunk1.eq(mask).toLong(); - return r0 | (r1 << 32); - } - - private long findEscaped(long backslash) { - if (backslash == 0) { - long escaped = prevEscaped; - prevEscaped = 0; - return escaped; - } - backslash &= ~prevEscaped; - long followsEscape = backslash << 1 | prevEscaped; - long oddSequenceStarts = backslash & ODD_BITS_MASK & ~followsEscape; - - long sequencesStartingOnEvenBits = oddSequenceStarts + backslash; - // Here, we check if the unsigned addition above caused an overflow. If that's the case, we store 1 in prevEscaped. - // The formula used to detect overflow was taken from 'Hacker's Delight, Second Edition' by Henry S. Warren, Jr., - // Chapter 2-13. - prevEscaped = ((oddSequenceStarts >>> 1) + (backslash >>> 1) + ((oddSequenceStarts & backslash) & 1)) >>> 63; - - long invertMask = sequencesStartingOnEvenBits << 1; - return (EVEN_BITS_MASK ^ invertMask) & followsEscape; - } - - private long prefixXor(long bitmask) { - bitmask ^= bitmask << 1; - bitmask ^= bitmask << 2; - bitmask ^= bitmask << 4; - bitmask ^= bitmask << 8; - bitmask ^= bitmask << 16; - bitmask ^= bitmask << 32; - return bitmask; - } - - void reset() { - prevInString = 0; - prevEscaped = 0; - } - - void finish() { - if (prevInString != 0) { - throw new JsonParsingException("Unclosed string. 
A string is opened, but never closed."); - } - } -} diff --git a/src/main/java/org/simdjson/SimdJsonParser.java b/src/main/java/org/simdjson/SimdJsonParser.java index a752bc1..707124c 100644 --- a/src/main/java/org/simdjson/SimdJsonParser.java +++ b/src/main/java/org/simdjson/SimdJsonParser.java @@ -2,12 +2,10 @@ public class SimdJsonParser { - private static final int STEP_SIZE = 64; private static final int PADDING = 64; private static final int DEFAULT_CAPACITY = 34 * 1024 * 1024; // we should be able to handle jsons <= 34MiB private static final int DEFAULT_MAX_DEPTH = 1024; - private final BlockReader reader; private final StructuralIndexer indexer; private final BitIndexes bitIndexes; private final JsonIterator jsonIterator; @@ -24,23 +22,20 @@ public SimdJsonParser(int capacity, int maxDepth) { jsonIterator = new JsonIterator(bitIndexes, stringBuffer, capacity, maxDepth, PADDING); schemaBasedJsonIterator = new SchemaBasedJsonIterator(bitIndexes, stringBuffer, PADDING); paddedBuffer = new byte[capacity]; - reader = new BlockReader(STEP_SIZE); indexer = new StructuralIndexer(bitIndexes); } public T parse(byte[] buffer, int len, Class expectedType) { - stage0(buffer); byte[] padded = padIfNeeded(buffer, len); - reset(padded, len); - stage1(padded); + reset(); + stage1(padded, len); return schemaBasedJsonIterator.walkDocument(padded, len, expectedType); } public JsonValue parse(byte[] buffer, int len) { - stage0(buffer); byte[] padded = padIfNeeded(buffer, len); - reset(padded, len); - stage1(padded); + reset(); + stage1(padded, len); return jsonIterator.walkDocument(padded, len); } @@ -52,25 +47,13 @@ private byte[] padIfNeeded(byte[] buffer, int len) { return buffer; } - private void reset(byte[] buffer, int len) { - indexer.reset(); - reader.reset(buffer, len); + private void reset() { bitIndexes.reset(); jsonIterator.reset(); } - private void stage0(byte[] buffer) { - Utf8Validator.validate(buffer); - } - - private void stage1(byte[] buffer) { - while 
(reader.hasFullBlock()) { - int blockIndex = reader.getBlockIndex(); - indexer.step(buffer, blockIndex, blockIndex); - reader.advance(); - } - indexer.step(reader.remainder(), 0, reader.getBlockIndex()); - reader.advance(); - indexer.finish(reader.getBlockIndex()); + private void stage1(byte[] buffer, int length) { + Utf8Validator.validate(buffer, length); + indexer.index(buffer, length); } } diff --git a/src/main/java/org/simdjson/StringParser.java b/src/main/java/org/simdjson/StringParser.java index c03e7eb..6452de9 100644 --- a/src/main/java/org/simdjson/StringParser.java +++ b/src/main/java/org/simdjson/StringParser.java @@ -9,7 +9,7 @@ class StringParser { private static final byte BACKSLASH = '\\'; private static final byte QUOTE = '"'; - private static final int BYTES_PROCESSED = StructuralIndexer.BYTE_SPECIES.vectorByteSize(); + private static final int BYTES_PROCESSED = VectorUtils.BYTE_SPECIES.vectorByteSize(); private static final int MIN_HIGH_SURROGATE = 0xD800; private static final int MAX_HIGH_SURROGATE = 0xDBFF; private static final int MIN_LOW_SURROGATE = 0xDC00; @@ -30,7 +30,7 @@ private int doParseString(byte[] buffer, int idx, byte[] stringBuffer, int offse int src = idx + 1; int dst = offset; while (true) { - ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.BYTE_SPECIES, buffer, src); + ByteVector srcVec = ByteVector.fromArray(VectorUtils.BYTE_SPECIES, buffer, src); srcVec.intoArray(stringBuffer, dst); long backslashBits = srcVec.eq(BACKSLASH).toLong(); long quoteBits = srcVec.eq(QUOTE).toLong(); diff --git a/src/main/java/org/simdjson/StructuralIndexer.java b/src/main/java/org/simdjson/StructuralIndexer.java index b2c4cbf..3720fda 100644 --- a/src/main/java/org/simdjson/StructuralIndexer.java +++ b/src/main/java/org/simdjson/StructuralIndexer.java @@ -1,121 +1,320 @@ package org.simdjson; import jdk.incubator.vector.ByteVector; -import jdk.incubator.vector.IntVector; -import jdk.incubator.vector.VectorShape; -import 
jdk.incubator.vector.VectorSpecies; +import jdk.incubator.vector.VectorShuffle; +import java.util.Arrays; + +import static jdk.incubator.vector.ByteVector.SPECIES_256; +import static jdk.incubator.vector.ByteVector.SPECIES_512; import static jdk.incubator.vector.VectorOperators.UNSIGNED_LE; class StructuralIndexer { - static final VectorSpecies INT_SPECIES; - static final VectorSpecies BYTE_SPECIES; - static final int N_CHUNKS; + private static final int VECTOR_BIT_SIZE = VectorUtils.BYTE_SPECIES.vectorBitSize(); + private static final int STEP_SIZE = 64; + private static final byte BACKSLASH = (byte) '\\'; + private static final byte QUOTE = (byte) '"'; + private static final byte SPACE = 0x20; + private static final byte LAST_CONTROL_CHARACTER = (byte) 0x1F; + private static final long EVEN_BITS_MASK = 0x5555555555555555L; + private static final long ODD_BITS_MASK = ~EVEN_BITS_MASK; + private static final byte LOW_NIBBLE_MASK = 0x0f; + private static final ByteVector WHITESPACE_TABLE = VectorUtils.repeat( + new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100} + ); + private static final ByteVector OP_TABLE = VectorUtils.repeat( + new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0} + ); + private static final byte[] LAST_BLOCK_SPACES = new byte[STEP_SIZE]; static { - String species = System.getProperty("org.simdjson.species", "preferred"); - switch (species) { - case "preferred" -> { - BYTE_SPECIES = ByteVector.SPECIES_PREFERRED; - INT_SPECIES = IntVector.SPECIES_PREFERRED; - } - case "512" -> { - BYTE_SPECIES = ByteVector.SPECIES_512; - INT_SPECIES = IntVector.SPECIES_512; - } - case "256" -> { - BYTE_SPECIES = ByteVector.SPECIES_256; - INT_SPECIES = IntVector.SPECIES_256; - } - default -> throw new IllegalArgumentException("Unsupported vector species: " + species); - } - N_CHUNKS = 64 / BYTE_SPECIES.vectorByteSize(); - assertSupportForSpecies(BYTE_SPECIES); - assertSupportForSpecies(INT_SPECIES); + 
Arrays.fill(LAST_BLOCK_SPACES, SPACE); } - private static void assertSupportForSpecies(VectorSpecies species) { - if (species.vectorShape() != VectorShape.S_256_BIT && species.vectorShape() != VectorShape.S_512_BIT) { - throw new IllegalArgumentException("Unsupported vector species: " + species); - } - } - - private final JsonStringScanner stringScanner; - private final CharactersClassifier classifier; private final BitIndexes bitIndexes; - - private long prevStructurals = 0; - private long unescapedCharsError = 0; - private long prevScalar = 0; + private final byte[] lastBlock = new byte[STEP_SIZE]; StructuralIndexer(BitIndexes bitIndexes) { - this.stringScanner = new JsonStringScanner(); - this.classifier = new CharactersClassifier(); this.bitIndexes = bitIndexes; } - void step(byte[] buffer, int offset, int blockIndex) { - switch (N_CHUNKS) { - case 1: step1(buffer, offset, blockIndex); break; - case 2: step2(buffer, offset, blockIndex); break; - default: throw new RuntimeException("Unsupported vector width: " + N_CHUNKS * 64); + void index(byte[] buffer, int length) { + bitIndexes.reset(); + switch (VECTOR_BIT_SIZE) { + case 256 -> index256(buffer, length); + case 512 -> index512(buffer, length); + default -> throw new UnsupportedOperationException("Unsupported vector width: " + VECTOR_BIT_SIZE * 64); } } - private void step1(byte[] buffer, int offset, int blockIndex) { - ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_512, buffer, offset); - JsonStringBlock strings = stringScanner.next(chunk0); - JsonCharacterBlock characters = classifier.classify(chunk0); - long unescaped = lteq(chunk0, (byte) 0x1F); - finishStep(characters, strings, unescaped, blockIndex); - } + private void index256(byte[] buffer, int length) { + long prevInString = 0; + long prevEscaped = 0; + long prevStructurals = 0; + long unescapedCharsError = 0; + long prevScalar = 0; - private void step2(byte[] buffer, int offset, int blockIndex) { - ByteVector chunk0 = 
ByteVector.fromArray(ByteVector.SPECIES_256, buffer, offset); - ByteVector chunk1 = ByteVector.fromArray(ByteVector.SPECIES_256, buffer, offset + 32); - JsonStringBlock strings = stringScanner.next(chunk0, chunk1); - JsonCharacterBlock characters = classifier.classify(chunk0, chunk1); - long unescaped = lteq(chunk0, chunk1, (byte) 0x1F); - finishStep(characters, strings, unescaped, blockIndex); - } + // Using SPECIES_512 here is not a mistake. Each iteration of the below loop processes two 256-bit chunks, + // so effectively it processes 512 bits at once. + int loopBound = SPECIES_512.loopBound(length); + int offset = 0; + int blockIndex = 0; + for (; offset < loopBound; offset += STEP_SIZE) { + ByteVector chunk0 = ByteVector.fromArray(SPECIES_256, buffer, offset); + ByteVector chunk1 = ByteVector.fromArray(SPECIES_256, buffer, offset + 32); + + // string scanning + long backslash0 = chunk0.eq(BACKSLASH).toLong(); + long backslash1 = chunk1.eq(BACKSLASH).toLong(); + long backslash = backslash0 | (backslash1 << 32); + + long escaped; + if (backslash == 0) { + escaped = prevEscaped; + prevEscaped = 0; + } else { + backslash &= ~prevEscaped; + long followsEscape = backslash << 1 | prevEscaped; + long oddSequenceStarts = backslash & ODD_BITS_MASK & ~followsEscape; + + long sequencesStartingOnEvenBits = oddSequenceStarts + backslash; + // Here, we check if the unsigned addition above caused an overflow. If that's the case, we store 1 in prevEscaped. + // The formula used to detect overflow was taken from 'Hacker's Delight, Second Edition' by Henry S. Warren, Jr., + // Chapter 2-13. 
+ prevEscaped = ((oddSequenceStarts >>> 1) + (backslash >>> 1) + ((oddSequenceStarts & backslash) & 1)) >>> 63; + + long invertMask = sequencesStartingOnEvenBits << 1; + escaped = (EVEN_BITS_MASK ^ invertMask) & followsEscape; + } + + long unescaped0 = chunk0.compare(UNSIGNED_LE, LAST_CONTROL_CHARACTER).toLong(); + long unescaped1 = chunk1.compare(UNSIGNED_LE, LAST_CONTROL_CHARACTER).toLong(); + long unescaped = unescaped0 | (unescaped1 << 32); + + long quote0 = chunk0.eq(QUOTE).toLong(); + long quote1 = chunk1.eq(QUOTE).toLong(); + long quote = (quote0 | (quote1 << 32)) & ~escaped; + + long inString = prefixXor(quote) ^ prevInString; + prevInString = inString >> 63; + + // characters classification + VectorShuffle chunk0Low = chunk0.and(LOW_NIBBLE_MASK).toShuffle(); + VectorShuffle chunk1Low = chunk1.and(LOW_NIBBLE_MASK).toShuffle(); + + long whitespace0 = chunk0.eq(WHITESPACE_TABLE.rearrange(chunk0Low)).toLong(); + long whitespace1 = chunk1.eq(WHITESPACE_TABLE.rearrange(chunk1Low)).toLong(); + long whitespace = whitespace0 | (whitespace1 << 32); + + ByteVector curlified0 = chunk0.or((byte) 0x20); + ByteVector curlified1 = chunk1.or((byte) 0x20); + long op0 = curlified0.eq(OP_TABLE.rearrange(chunk0Low)).toLong(); + long op1 = curlified1.eq(OP_TABLE.rearrange(chunk1Low)).toLong(); + long op = op0 | (op1 << 32); + + // finish + long scalar = ~(op | whitespace); + long nonQuoteScalar = scalar & ~quote; + long followsNonQuoteScalar = nonQuoteScalar << 1 | prevScalar; + prevScalar = nonQuoteScalar >>> 63; + long potentialScalarStart = scalar & ~followsNonQuoteScalar; + long potentialStructuralStart = op | potentialScalarStart; + bitIndexes.write(blockIndex, prevStructurals); + blockIndex += STEP_SIZE; + prevStructurals = potentialStructuralStart & ~(inString ^ quote); + unescapedCharsError |= unescaped & inString; + } + + byte[] remainder = remainder(buffer, length, blockIndex); + ByteVector chunk0 = ByteVector.fromArray(SPECIES_256, remainder, 0); + ByteVector chunk1 
= ByteVector.fromArray(SPECIES_256, remainder, 32); + + // string scanning + long backslash0 = chunk0.eq(BACKSLASH).toLong(); + long backslash1 = chunk1.eq(BACKSLASH).toLong(); + long backslash = backslash0 | (backslash1 << 32); + + long escaped; + if (backslash == 0) { + escaped = prevEscaped; + } else { + backslash &= ~prevEscaped; + long followsEscape = backslash << 1 | prevEscaped; + long oddSequenceStarts = backslash & ODD_BITS_MASK & ~followsEscape; + + long sequencesStartingOnEvenBits = oddSequenceStarts + backslash; + long invertMask = sequencesStartingOnEvenBits << 1; + escaped = (EVEN_BITS_MASK ^ invertMask) & followsEscape; + } + + long unescaped0 = chunk0.compare(UNSIGNED_LE, LAST_CONTROL_CHARACTER).toLong(); + long unescaped1 = chunk1.compare(UNSIGNED_LE, LAST_CONTROL_CHARACTER).toLong(); + long unescaped = unescaped0 | (unescaped1 << 32); + + long quote0 = chunk0.eq(QUOTE).toLong(); + long quote1 = chunk1.eq(QUOTE).toLong(); + long quote = (quote0 | (quote1 << 32)) & ~escaped; + + long inString = prefixXor(quote) ^ prevInString; + prevInString = inString >> 63; + + // characters classification + VectorShuffle chunk0Low = chunk0.and(LOW_NIBBLE_MASK).toShuffle(); + VectorShuffle chunk1Low = chunk1.and(LOW_NIBBLE_MASK).toShuffle(); - private void finishStep(JsonCharacterBlock characters, JsonStringBlock strings, long unescaped, int blockIndex) { - long scalar = characters.scalar(); - long nonQuoteScalar = scalar & ~strings.quote(); + long whitespace0 = chunk0.eq(WHITESPACE_TABLE.rearrange(chunk0Low)).toLong(); + long whitespace1 = chunk1.eq(WHITESPACE_TABLE.rearrange(chunk1Low)).toLong(); + long whitespace = whitespace0 | (whitespace1 << 32); + + ByteVector curlified0 = chunk0.or((byte) 0x20); + ByteVector curlified1 = chunk1.or((byte) 0x20); + long op0 = curlified0.eq(OP_TABLE.rearrange(chunk0Low)).toLong(); + long op1 = curlified1.eq(OP_TABLE.rearrange(chunk1Low)).toLong(); + long op = op0 | (op1 << 32); + + // finish + long scalar = ~(op | 
whitespace); + long nonQuoteScalar = scalar & ~quote; long followsNonQuoteScalar = nonQuoteScalar << 1 | prevScalar; - prevScalar = nonQuoteScalar >>> 63; long potentialScalarStart = scalar & ~followsNonQuoteScalar; - long potentialStructuralStart = characters.op() | potentialScalarStart; + long potentialStructuralStart = op | potentialScalarStart; + bitIndexes.write(blockIndex, prevStructurals); + blockIndex += STEP_SIZE; + prevStructurals = potentialStructuralStart & ~(inString ^ quote); + unescapedCharsError |= unescaped & inString; bitIndexes.write(blockIndex, prevStructurals); - prevStructurals = potentialStructuralStart & ~strings.stringTail(); - unescapedCharsError |= strings.nonQuoteInsideString(unescaped); + bitIndexes.finish(); + if (prevInString != 0) { + throw new JsonParsingException("Unclosed string. A string is opened, but never closed."); + } + if (unescapedCharsError != 0) { + throw new JsonParsingException("Unescaped characters. Within strings, there are characters that should be escaped."); + } } - private long lteq(ByteVector chunk0, byte scalar) { - return chunk0.compare(UNSIGNED_LE, scalar).toLong(); - } + private void index512(byte[] buffer, int length) { + long prevInString = 0; + long prevEscaped = 0; + long prevStructurals = 0; + long unescapedCharsError = 0; + long prevScalar = 0; - private long lteq(ByteVector chunk0, ByteVector chunk1, byte scalar) { - long r0 = chunk0.compare(UNSIGNED_LE, scalar).toLong(); - long r1 = chunk1.compare(UNSIGNED_LE, scalar).toLong(); - return r0 | (r1 << 32); - } + int loopBound = SPECIES_512.loopBound(length); + int offset = 0; + int blockIndex = 0; + for (; offset < loopBound; offset += STEP_SIZE) { + ByteVector chunk = ByteVector.fromArray(SPECIES_512, buffer, offset); + + // string scanning + long backslash = chunk.eq(BACKSLASH).toLong(); + + long escaped; + if (backslash == 0) { + escaped = prevEscaped; + prevEscaped = 0; + } else { + backslash &= ~prevEscaped; + long followsEscape = backslash << 1 | 
prevEscaped; + long oddSequenceStarts = backslash & ODD_BITS_MASK & ~followsEscape; + + long sequencesStartingOnEvenBits = oddSequenceStarts + backslash; + // Here, we check if the unsigned addition above caused an overflow. If that's the case, we store 1 in prevEscaped. + // The formula used to detect overflow was taken from 'Hacker's Delight, Second Edition' by Henry S. Warren, Jr., + // Chapter 2-13. + prevEscaped = ((oddSequenceStarts >>> 1) + (backslash >>> 1) + ((oddSequenceStarts & backslash) & 1)) >>> 63; + + long invertMask = sequencesStartingOnEvenBits << 1; + escaped = (EVEN_BITS_MASK ^ invertMask) & followsEscape; + } + + long unescaped = chunk.compare(UNSIGNED_LE, LAST_CONTROL_CHARACTER).toLong(); + long quote = chunk.eq(QUOTE).toLong() & ~escaped; + long inString = prefixXor(quote) ^ prevInString; + prevInString = inString >> 63; + + // characters classification + VectorShuffle chunkLow = chunk.and(LOW_NIBBLE_MASK).toShuffle(); + long whitespace = chunk.eq(WHITESPACE_TABLE.rearrange(chunkLow)).toLong(); + ByteVector curlified = chunk.or((byte) 0x20); + long op = curlified.eq(OP_TABLE.rearrange(chunkLow)).toLong(); - void finish(int blockIndex) { + // finish + long scalar = ~(op | whitespace); + long nonQuoteScalar = scalar & ~quote; + long followsNonQuoteScalar = nonQuoteScalar << 1 | prevScalar; + prevScalar = nonQuoteScalar >>> 63; + long potentialScalarStart = scalar & ~followsNonQuoteScalar; + long potentialStructuralStart = op | potentialScalarStart; + bitIndexes.write(blockIndex, prevStructurals); + blockIndex += STEP_SIZE; + prevStructurals = potentialStructuralStart & ~(inString ^ quote); + unescapedCharsError |= unescaped & inString; + } + + byte[] remainder = remainder(buffer, length, blockIndex); + ByteVector chunk = ByteVector.fromArray(SPECIES_512, remainder, 0); + + // string scanning + long backslash = chunk.eq(BACKSLASH).toLong(); + + long escaped; + if (backslash == 0) { + escaped = prevEscaped; + } else { + backslash &= ~prevEscaped; 
+ long followsEscape = backslash << 1 | prevEscaped; + long oddSequenceStarts = backslash & ODD_BITS_MASK & ~followsEscape; + + long sequencesStartingOnEvenBits = oddSequenceStarts + backslash; + long invertMask = sequencesStartingOnEvenBits << 1; + escaped = (EVEN_BITS_MASK ^ invertMask) & followsEscape; + } + + long unescaped = chunk.compare(UNSIGNED_LE, LAST_CONTROL_CHARACTER).toLong(); + long quote = chunk.eq(QUOTE).toLong() & ~escaped; + long inString = prefixXor(quote) ^ prevInString; + prevInString = inString >> 63; + + // characters classification + VectorShuffle chunkLow = chunk.and(LOW_NIBBLE_MASK).toShuffle(); + long whitespace = chunk.eq(WHITESPACE_TABLE.rearrange(chunkLow)).toLong(); + ByteVector curlified = chunk.or((byte) 0x20); + long op = curlified.eq(OP_TABLE.rearrange(chunkLow)).toLong(); + + // finish + long scalar = ~(op | whitespace); + long nonQuoteScalar = scalar & ~quote; + long followsNonQuoteScalar = nonQuoteScalar << 1 | prevScalar; + long potentialScalarStart = scalar & ~followsNonQuoteScalar; + long potentialStructuralStart = op | potentialScalarStart; + bitIndexes.write(blockIndex, prevStructurals); + blockIndex += STEP_SIZE; + prevStructurals = potentialStructuralStart & ~(inString ^ quote); + unescapedCharsError |= unescaped & inString; bitIndexes.write(blockIndex, prevStructurals); bitIndexes.finish(); - - stringScanner.finish(); + if (prevInString != 0) { + throw new JsonParsingException("Unclosed string. A string is opened, but never closed."); + } if (unescapedCharsError != 0) { throw new JsonParsingException("Unescaped characters. 
Within strings, there are characters that should be escaped."); } } - void reset() { - stringScanner.reset(); - prevStructurals = 0; - unescapedCharsError = 0; - prevScalar = 0; + private byte[] remainder(byte[] buffer, int length, int idx) { + System.arraycopy(LAST_BLOCK_SPACES, 0, lastBlock, 0, lastBlock.length); + System.arraycopy(buffer, idx, lastBlock, 0, length - idx); + return lastBlock; + } + + private static long prefixXor(long bitmask) { + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; } } diff --git a/src/main/java/org/simdjson/Utf8Validator.java b/src/main/java/org/simdjson/Utf8Validator.java index e4d9c63..7645fd1 100644 --- a/src/main/java/org/simdjson/Utf8Validator.java +++ b/src/main/java/org/simdjson/Utf8Validator.java @@ -1,261 +1,250 @@ package org.simdjson; -import jdk.incubator.vector.*; +import jdk.incubator.vector.ByteVector; +import jdk.incubator.vector.IntVector; +import jdk.incubator.vector.VectorMask; +import jdk.incubator.vector.VectorShuffle; import java.util.Arrays; -class Utf8Validator { +import static jdk.incubator.vector.VectorOperators.EQ; +import static jdk.incubator.vector.VectorOperators.LSHL; +import static jdk.incubator.vector.VectorOperators.LSHR; +import static jdk.incubator.vector.VectorOperators.NE; +import static jdk.incubator.vector.VectorOperators.UNSIGNED_GE; +import static jdk.incubator.vector.VectorOperators.UNSIGNED_GT; +import static jdk.incubator.vector.VectorShuffle.iota; +import static org.simdjson.VectorUtils.BYTE_SPECIES; +import static org.simdjson.VectorUtils.INT_SPECIES; - private static final VectorSpecies VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES; - private static final ByteVector INCOMPLETE_CHECK = getIncompleteCheck(); - private static final VectorShuffle SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(StructuralIndexer.INT_SPECIES, - 
StructuralIndexer.INT_SPECIES.elementSize() - 1, 1, true); - private static final ByteVector LOW_NIBBLE_MASK = ByteVector.broadcast(VECTOR_SPECIES, 0b0000_1111); - private static final ByteVector ALL_ASCII_MASK = ByteVector.broadcast(VECTOR_SPECIES, (byte) 0b1000_0000); +class Utf8Validator { - /** - * Validate the input bytes are valid UTF8 - * - * @param inputBytes the input bytes to validate - * @throws JsonParsingException if the input is not valid UTF8 - */ - static void validate(byte[] inputBytes) { + // Leading byte not followed by a continuation byte but by another leading or ASCII byte, e.g. 11______ 0_______, 11______ 11______ + private static final byte TOO_SHORT = 1; + // ASCII followed by continuation byte e.g. 01111111 10_000000. + private static final byte TOO_LONG = 1 << 1; + // Any 3-byte sequence that could be represented by a shorter sequence (any sequence smaller than 1110_0000 10_100000 10_000000). + private static final byte OVERLONG_3BYTE = 1 << 2; + // Any decoded code point greater than U+10FFFF. e.g. 11110_100 10_010000 10_000000 10_000000. + private static final byte TOO_LARGE = 1 << 3; + // Code points in the range of U+D800 - U+DFFF (inclusive) are the surrogates for UTF-16. + // These 2048 code points that are reserved for UTF-16 are disallowed in UTF-8, e.g. 1110_1101 10_100000 10_000000. + private static final byte SURROGATE = 1 << 4; + // First valid 2-byte sequence: 110_00010 10_000000. Anything smaller is considered overlong as it fits into a 1-byte sequence. + private static final byte OVERLONG_2BYTE = 1 << 5; + // Similar to TOO_LARGE, but for cases where the continuation byte's high nibble is 1000, e.g. 11110_101 10_000000 10_000000. + private static final byte TOO_LARGE_1000 = 1 << 6; + // Any 4-byte sequence that decodes to a code point at or below U+FFFF, e.g. 11110_000 10_000000 10_000000 10_000000. + private static final byte OVERLONG_4BYTE = 1 << 6; + // An example: 10_000000 10_000000. 
+ private static final byte TWO_CONTINUATIONS = (byte) (1 << 7); + private static final byte MAX_2_LEADING_BYTE = (byte) 0b110_11111; + private static final byte MAX_3_LEADING_BYTE = (byte) 0b1110_1111; + private static final int TWO_BYTES_SIZE = Byte.SIZE * 2; + private static final int THREE_BYTES_SIZE = Byte.SIZE * 3; + private static final ByteVector BYTE_1_HIGH_LOOKUP = createByte1HighLookup(); + private static final ByteVector BYTE_1_LOW_LOOKUP = createByte1LowLookup(); + private static final ByteVector BYTE_2_HIGH_LOOKUP = createByte2HighLookup(); + private static final ByteVector INCOMPLETE_CHECK = createIncompleteCheck(); + private static final byte LOW_NIBBLE_MASK = 0b0000_1111; + private static final byte ALL_ASCII_MASK = (byte) 0b1000_0000; + private static final VectorShuffle FOUR_BYTES_FORWARD_SHIFT = iota(INT_SPECIES, INT_SPECIES.elementSize() - 1, 1, true); + private static final int STEP_SIZE = BYTE_SPECIES.vectorByteSize(); + + static void validate(byte[] buffer, int length) { long previousIncomplete = 0; long errors = 0; int previousFourUtf8Bytes = 0; - int idx = 0; - for (; idx < VECTOR_SPECIES.loopBound(inputBytes.length); idx += VECTOR_SPECIES.vectorByteSize()) { - ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx); - // ASCII fast path can bypass the checks that are only required for multibyte code points - if (isAscii(utf8Vector)) { + int loopBound = BYTE_SPECIES.loopBound(length); + int offset = 0; + for (; offset < loopBound; offset += STEP_SIZE) { + ByteVector chunk = ByteVector.fromArray(BYTE_SPECIES, buffer, offset); + IntVector chunkAsInts = chunk.reinterpretAsInts(); + // ASCII fast path can bypass the checks that are only required for multibyte code points. 
+ if (chunk.and(ALL_ASCII_MASK).compare(EQ, 0).allTrue()) { errors |= previousIncomplete; } else { - previousIncomplete = isIncomplete(utf8Vector); - - var fourBytesPrevious = fourBytesPreviousSlice(utf8Vector, previousFourUtf8Bytes); - - ByteVector firstCheck = firstTwoByteSequenceCheck(utf8Vector.reinterpretAsInts(), fourBytesPrevious); - ByteVector secondCheck = lastTwoByteSequenceCheck(utf8Vector.reinterpretAsInts(), fourBytesPrevious, firstCheck); - - errors |= secondCheck.compare(VectorOperators.NE, 0).toLong(); + previousIncomplete = chunk.compare(UNSIGNED_GE, INCOMPLETE_CHECK).toLong(); + // Shift the input forward by four bytes to make space for the previous four bytes. + // The previous three bytes are required for validation, pulling in the last integer + // will give the previous four bytes. The switch to integer vectors is to allow for + // integer shifting instead of the more expensive shuffle / slice operations. + IntVector chunkWithPreviousFourBytes = chunkAsInts + .rearrange(FOUR_BYTES_FORWARD_SHIFT) + .withLane(0, previousFourUtf8Bytes); + // Shift the current input forward by one byte to include one byte from the previous chunk. 
+ ByteVector previousOneByte = chunkAsInts + .lanewise(LSHL, Byte.SIZE) + .or(chunkWithPreviousFourBytes.lanewise(LSHR, THREE_BYTES_SIZE)) + .reinterpretAsBytes(); + ByteVector byte2HighNibbles = chunkAsInts.lanewise(LSHR, 4) + .reinterpretAsBytes() + .and(LOW_NIBBLE_MASK); + ByteVector byte1HighNibbles = previousOneByte.reinterpretAsInts() + .lanewise(LSHR, 4) + .reinterpretAsBytes() + .and(LOW_NIBBLE_MASK); + ByteVector byte1LowNibbles = previousOneByte.and(LOW_NIBBLE_MASK); + ByteVector byte1HighState = byte1HighNibbles.selectFrom(BYTE_1_HIGH_LOOKUP); + ByteVector byte1LowState = byte1LowNibbles.selectFrom(BYTE_1_LOW_LOOKUP); + ByteVector byte2HighState = byte2HighNibbles.selectFrom(BYTE_2_HIGH_LOOKUP); + ByteVector firstCheck = byte1HighState.and(byte1LowState).and(byte2HighState); + // All remaining checks are for invalid 3 and 4-byte sequences, which either have too many + // continuation bytes or not enough. + ByteVector previousTwoBytes = chunkAsInts + .lanewise(LSHL, TWO_BYTES_SIZE) + .or(chunkWithPreviousFourBytes.lanewise(LSHR, TWO_BYTES_SIZE)) + .reinterpretAsBytes(); + // The minimum leading byte of 3-byte sequences is always greater than the maximum leading byte of 2-byte sequences. + VectorMask is3ByteLead = previousTwoBytes.compare(UNSIGNED_GT, MAX_2_LEADING_BYTE); + ByteVector previousThreeBytes = chunkAsInts + .lanewise(LSHL, THREE_BYTES_SIZE) + .or(chunkWithPreviousFourBytes.lanewise(LSHR, Byte.SIZE)) + .reinterpretAsBytes(); + // The minimum leading byte of 4-byte sequences is always greater than the maximum leading byte of 3-byte sequences. + VectorMask is4ByteLead = previousThreeBytes.compare(UNSIGNED_GT, MAX_3_LEADING_BYTE); + // The firstCheck vector contains 0x80 values on continuation byte indexes. + // The leading bytes of 3 and 4-byte sequences should match up with these indexes and zero them out. 
+ ByteVector secondCheck = firstCheck.add((byte) 0x80, is3ByteLead.or(is4ByteLead)); + errors |= secondCheck.compare(NE, 0).toLong(); } - previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(StructuralIndexer.INT_SPECIES.length() - 1); + previousFourUtf8Bytes = chunkAsInts.lane(INT_SPECIES.length() - 1); } - // if the input file doesn't align with the vector width, pad the missing bytes with zero - VectorMask remainingBytes = VECTOR_SPECIES.indexInRange(idx, inputBytes.length); - ByteVector lastVectorChunk = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx, remainingBytes); - if (!isAscii(lastVectorChunk)) { - previousIncomplete = isIncomplete(lastVectorChunk); - - var fourBytesPrevious = fourBytesPreviousSlice(lastVectorChunk, previousFourUtf8Bytes); - - ByteVector firstCheck = firstTwoByteSequenceCheck(lastVectorChunk.reinterpretAsInts(), fourBytesPrevious); - ByteVector secondCheck = lastTwoByteSequenceCheck(lastVectorChunk.reinterpretAsInts(), fourBytesPrevious, firstCheck); - - errors |= secondCheck.compare(VectorOperators.NE, 0).toLong(); + // If the input file doesn't align with the vector width, pad the missing bytes with zeros. + VectorMask remainingBytes = BYTE_SPECIES.indexInRange(offset, length); + ByteVector chunk = ByteVector.fromArray(BYTE_SPECIES, buffer, offset, remainingBytes); + if (!chunk.and(ALL_ASCII_MASK).compare(EQ, 0).allTrue()) { + IntVector chunkAsInts = chunk.reinterpretAsInts(); + previousIncomplete = chunk.compare(UNSIGNED_GE, INCOMPLETE_CHECK).toLong(); + // Shift the input forward by four bytes to make space for the previous four bytes. + // The previous three bytes are required for validation, pulling in the last integer + // will give the previous four bytes. The switch to integer vectors is to allow for + // integer shifting instead of the more expensive shuffle / slice operations. 
+ IntVector chunkWithPreviousFourBytes = chunkAsInts + .rearrange(FOUR_BYTES_FORWARD_SHIFT) + .withLane(0, previousFourUtf8Bytes); + // Shift the current input forward by one byte to include one byte from the previous chunk. + ByteVector previousOneByte = chunkAsInts + .lanewise(LSHL, Byte.SIZE) + .or(chunkWithPreviousFourBytes.lanewise(LSHR, THREE_BYTES_SIZE)) + .reinterpretAsBytes(); + ByteVector byte2HighNibbles = chunkAsInts.lanewise(LSHR, 4) + .reinterpretAsBytes() + .and(LOW_NIBBLE_MASK); + ByteVector byte1HighNibbles = previousOneByte.reinterpretAsInts() + .lanewise(LSHR, 4) + .reinterpretAsBytes() + .and(LOW_NIBBLE_MASK); + ByteVector byte1LowNibbles = previousOneByte.and(LOW_NIBBLE_MASK); + ByteVector byte1HighState = byte1HighNibbles.selectFrom(BYTE_1_HIGH_LOOKUP); + ByteVector byte1LowState = byte1LowNibbles.selectFrom(BYTE_1_LOW_LOOKUP); + ByteVector byte2HighState = byte2HighNibbles.selectFrom(BYTE_2_HIGH_LOOKUP); + ByteVector firstCheck = byte1HighState.and(byte1LowState).and(byte2HighState); + // All remaining checks are for invalid 3 and 4-byte sequences, which either have too many + // continuation bytes or not enough. + ByteVector previousTwoBytes = chunkAsInts + .lanewise(LSHL, TWO_BYTES_SIZE) + .or(chunkWithPreviousFourBytes.lanewise(LSHR, TWO_BYTES_SIZE)) + .reinterpretAsBytes(); + // The minimum leading byte of 3-byte sequences is always greater than the maximum leading byte of 2-byte sequences. + VectorMask is3ByteLead = previousTwoBytes.compare(UNSIGNED_GT, MAX_2_LEADING_BYTE); + ByteVector previousThreeBytes = chunkAsInts + .lanewise(LSHL, THREE_BYTES_SIZE) + .or(chunkWithPreviousFourBytes.lanewise(LSHR, Byte.SIZE)) + .reinterpretAsBytes(); + // The minimum leading byte of 4-byte sequences is always greater than the maximum leading byte of 3-byte sequences. + VectorMask is4ByteLead = previousThreeBytes.compare(UNSIGNED_GT, MAX_3_LEADING_BYTE); + // The firstCheck vector contains 0x80 values on continuation byte indexes. 
+ // The leading bytes of 3 and 4-byte sequences should match up with these indexes and zero them out. + ByteVector secondCheck = firstCheck.add((byte) 0x80, is3ByteLead.or(is4ByteLead)); + errors |= secondCheck.compare(NE, 0).toLong(); } if ((errors | previousIncomplete) != 0) { - throw new JsonParsingException("Invalid UTF8"); + throw new JsonParsingException("The input is not valid UTF-8"); } } - /* Shuffles the input forward by four bytes to make space for the previous four bytes. - The previous three bytes are required for validation, pulling in the last integer will give the previous four bytes. - The switch to integer vectors is to allow for integer shifting instead of the more expensive shuffle / slice operations */ - private static IntVector fourBytesPreviousSlice(ByteVector vectorChunk, int previousFourUtf8Bytes) { - return vectorChunk.reinterpretAsInts() - .rearrange(SHIFT_FOUR_BYTES_FORWARD) - .withLane(0, previousFourUtf8Bytes); - } - - // works similar to previousUtf8Vector.slice(VECTOR_SPECIES.length() - numOfBytesToInclude, utf8Vector) but without the performance cost - private static ByteVector previousVectorSlice(IntVector utf8Vector, IntVector fourBytesPrevious, int numOfPreviousBytes) { - return utf8Vector - .lanewise(VectorOperators.LSHL, Byte.SIZE * numOfPreviousBytes) - .or(fourBytesPrevious.lanewise(VectorOperators.LSHR, Byte.SIZE * (4 - numOfPreviousBytes))) - .reinterpretAsBytes(); - } - - private static ByteVector firstTwoByteSequenceCheck(IntVector utf8Vector, IntVector fourBytesPrevious) { - // shift the current input forward by 1 byte to include 1 byte from the previous input - var oneBytePrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 1); - - // high nibbles of the current input (e.g. 
0xC3 >> 4 = 0xC) - ByteVector byte2HighNibbles = utf8Vector.lanewise(VectorOperators.LSHR, 4) - .reinterpretAsBytes().and(LOW_NIBBLE_MASK); - - // high nibbles of the shifted input - ByteVector byte1HighNibbles = oneBytePrevious.reinterpretAsInts().lanewise(VectorOperators.LSHR, 4) - .reinterpretAsBytes().and(LOW_NIBBLE_MASK); - - // low nibbles of the shifted input (e.g. 0xC3 & 0xF = 0x3) - ByteVector byte1LowNibbles = oneBytePrevious.and(LOW_NIBBLE_MASK); - - ByteVector byte1HighState = byte1HighNibbles.selectFrom(LookupTable.byte1High); - ByteVector byte1LowState = byte1LowNibbles.selectFrom(LookupTable.byte1Low); - ByteVector byte2HighState = byte2HighNibbles.selectFrom(LookupTable.byte2High); - - return byte1HighState.and(byte1LowState).and(byte2HighState); - } - - // All remaining checks are invalid 3–4 byte sequences, which either have too many continuations bytes or not enough - private static ByteVector lastTwoByteSequenceCheck(IntVector utf8Vector, IntVector fourBytesPrevious, ByteVector firstCheck) { - // the minimum 3byte lead - 1110_0000 is always greater than the max 2byte lead - 110_11111 - ByteVector twoBytesPrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 2); - VectorMask is3ByteLead = twoBytesPrevious.compare(VectorOperators.UNSIGNED_GT, (byte) 0b110_11111); - - // the minimum 4byte lead - 1111_0000 is always greater than the max 3byte lead - 1110_1111 - ByteVector threeBytesPrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 3); - VectorMask is4ByteLead = threeBytesPrevious.compare(VectorOperators.UNSIGNED_GT, (byte) 0b1110_1111); - - // the firstCheck vector contains 0x80 values on continuation byte indexes - // the 3/4 byte lead bytes should match up with these indexes and zero them out - return firstCheck.add((byte) 0x80, is3ByteLead.or(is4ByteLead)); - } - - /* checks that the previous vector isn't in an incomplete state. 
- Previous vector is in an incomplete state if the last byte is smaller than 0xC0, - or the second last byte is smaller than 0xE0, or the third last byte is smaller than 0xF0.*/ - private static ByteVector getIncompleteCheck() { - int vectorBytes = VECTOR_SPECIES.vectorByteSize(); - byte[] eofArray = new byte[vectorBytes]; + private static ByteVector createIncompleteCheck() { + // Previous vector is in an incomplete state if the last byte is smaller than 0xC0, + // or the second last byte is smaller than 0xE0, or the third last byte is smaller than 0xF0. + int vectorByteSize = BYTE_SPECIES.vectorByteSize(); + byte[] eofArray = new byte[vectorByteSize]; Arrays.fill(eofArray, (byte) 255); - eofArray[vectorBytes - 3] = (byte) 0xF0; - eofArray[vectorBytes - 2] = (byte) 0xE0; - eofArray[vectorBytes - 1] = (byte) 0xC0; - return ByteVector.fromArray(VECTOR_SPECIES, eofArray, 0); + eofArray[vectorByteSize - 3] = (byte) 0xF0; + eofArray[vectorByteSize - 2] = (byte) 0xE0; + eofArray[vectorByteSize - 1] = (byte) 0xC0; + return ByteVector.fromArray(BYTE_SPECIES, eofArray, 0); } - private static long isIncomplete(ByteVector utf8Vector) { - return utf8Vector.compare(VectorOperators.UNSIGNED_GE, INCOMPLETE_CHECK).toLong(); + private static ByteVector createByte1HighLookup() { + byte[] byte1HighArray = new byte[]{ + // ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // Continuation high nibble = 1000 -> 1011 + TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS, + // Two byte lead high nibble = 1100 -> 1101 + TOO_SHORT | OVERLONG_2BYTE, TOO_SHORT, + // Three byte lead high nibble = 1110 + TOO_SHORT | OVERLONG_3BYTE | SURROGATE, + // Four byte lead high nibble = 1111 + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4BYTE + }; + return alignArrayToVector(byte1HighArray); } - // ASCII will never exceed 01111_1111 - private static boolean isAscii(ByteVector 
utf8Vector) { - return utf8Vector.and(ALL_ASCII_MASK).compare(VectorOperators.EQ, 0).allTrue(); + private static ByteVector createByte1LowLookup() { + final byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTINUATIONS; + byte[] byte1LowArray = new byte[]{ + // ASCII, two byte lead and three byte leading low nibble = 0000 -> 1111, + // Four byte lead low nibble = 0000 -> 0111. + // Continuation byte low nibble is inconsequential + // Low nibble does not affect the states TOO_SHORT, TOO_LONG, TWO_CONTINUATIONS, so they will + // be carried over regardless. + CARRY | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, + // 0001 + CARRY | OVERLONG_2BYTE, + CARRY, + CARRY, + // 1111_0100 -> 1111 = TOO_LARGE range + CARRY | TOO_LARGE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // 1110_1101 + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + }; + return alignArrayToVector(byte1LowArray); } - private static class LookupTable { - /* Bit 0 = Too Short (lead byte not followed by a continuation byte but by a lead/ASCII byte) - e.g. 11______ 0_______ - 11______ 11______ */ - static final byte TOO_SHORT = 1; - - /* Bit 1 = Too Long (ASCII followed by continuation byte) - e.g. 01111111 10_000000 */ - static final byte TOO_LONG = 1 << 1; - - /* Bit 2 = Overlong 3-byte - Any 3-byte sequence that could be represented by a shorter sequence - Which is any sequence smaller than 1110_0000 10_100000 10_000000 */ - static final byte OVERLONG_3BYTE = 1 << 2; - - /* Bit 3 = Too Large - Any decoded codepoint greater than U+10FFFF - e.g. 
11110_100 10_010000 10_000000 10_000000 */ - static final byte TOO_LARGE = 1 << 3; - - /* Bit 4 = Surrogate - code points in the range of U+D800 - U+DFFF (inclusive) are the surrogates for UTF-16. - These 2048 code points that are reserved for UTF-16 are disallowed in UTF-8 - e.g. 1110_1101 10_100000 10_000000 */ - static final byte SURROGATE = 1 << 4; - - /* Bit 5 = Overlong 2-byte - first valid two byte sequence: 110_00010 10_000000 - anything smaller is considered overlong as it would fit into a one byte sequence / ASCII */ - static final byte OVERLONG_2BYTE = 1 << 5; - - /* Bit 6 = Too Large 1000 - Similar to TOO_LARGE, but for cases where the continuation byte's high nibble is 1000 - e.g. 11110_101 10_000000 10_000000 */ - static final byte TOO_LARGE_1000 = 1 << 6; - - /* Bit 6 = Overlong 4-byte - Any decoded code point below above U+FFFF / 11110_000 10_001111 10_111111 10_111111 - e.g. 11110_000 10_000000 10_000000 10_000000 */ - static final byte OVERLONG_4BYTE = 1 << 6; - - /* Bit 7 = Two Continuations - e.g. 
10_000000 10_000000 */ - static final byte TWO_CONTINUATIONS = (byte) (1 << 7); - - private final static ByteVector byte1High = getByte1HighLookup(); - private final static ByteVector byte1Low = getByte1LowLookup(); - private final static ByteVector byte2High = getByte2HighLookup(); - - private static ByteVector getByte1HighLookup() { - byte[] byte1HighArray = new byte[]{ - /* ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table */ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - /* Continuation high nibble = 1000 -> 1011 */ - TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS, - /* Two byte lead high nibble = 1100 -> 1101 */ - TOO_SHORT | OVERLONG_2BYTE, TOO_SHORT, - /* Three byte lead high nibble = 1110 */ - TOO_SHORT | OVERLONG_3BYTE | SURROGATE, - /* Four byte lead high nibble = 1111 */ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4BYTE - }; - - return alignArrayToVector(byte1HighArray); - } - - private static ByteVector alignArrayToVector(byte[] arrayValues) { - // pad array with zeroes to align up with vector size - byte[] alignedArray = new byte[VECTOR_SPECIES.vectorByteSize()]; - System.arraycopy(arrayValues, 0, alignedArray, 0, arrayValues.length); - return ByteVector.fromArray(VECTOR_SPECIES, alignedArray, 0); - } - - private static ByteVector getByte1LowLookup() { - final byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTINUATIONS; - byte[] byte1LowArray = new byte[]{ - /* ASCII, two Byte lead and three byte lead low nibble = 0000 -> 1111, - * Four byte lead low nibble = 0000 -> 0111 - * Continuation byte low nibble is inconsequential - * Low nibble does not affect the states TOO_SHORT, TOO_LONG, TWO_CONTINUATIONS, so they will be carried over regardless */ - CARRY | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, - // 0001 - CARRY | OVERLONG_2BYTE, - CARRY, - CARRY, - // 1111_0100 -> 1111 = TOO_LARGE range - CARRY | TOO_LARGE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | 
TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // 1110_1101 - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 - }; - - return alignArrayToVector(byte1LowArray); - } - - private static ByteVector getByte2HighLookup() { - byte[] byte2HighArray = new byte[]{ - // ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - // Continuation high nibble - 1000 -> 1011 - TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE | TOO_LARGE_1000, - TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | TOO_LARGE, - TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE, - TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE, - // 1100 -> 1111 = unexpected lead byte - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - }; + private static ByteVector createByte2HighLookup() { + byte[] byte2HighArray = new byte[]{ + // ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + // Continuation high nibble - 1000 -> 1011 + TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE | TOO_LARGE_1000, + TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | TOO_LARGE, + TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE, + TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE, + // 1100 -> 1111 = unexpected leading byte + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + }; + return alignArrayToVector(byte2HighArray); + } - return alignArrayToVector(byte2HighArray); - } + private static 
ByteVector alignArrayToVector(byte[] arrayValues) { + // Pad array with zeroes to align up with vector size. + byte[] alignedArray = new byte[BYTE_SPECIES.vectorByteSize()]; + System.arraycopy(arrayValues, 0, alignedArray, 0, arrayValues.length); + return ByteVector.fromArray(BYTE_SPECIES, alignedArray, 0); } } diff --git a/src/main/java/org/simdjson/VectorUtils.java b/src/main/java/org/simdjson/VectorUtils.java new file mode 100644 index 0000000..7a1ce8f --- /dev/null +++ b/src/main/java/org/simdjson/VectorUtils.java @@ -0,0 +1,48 @@ +package org.simdjson; + +import jdk.incubator.vector.ByteVector; +import jdk.incubator.vector.IntVector; +import jdk.incubator.vector.VectorShape; +import jdk.incubator.vector.VectorSpecies; + +class VectorUtils { + + static final VectorSpecies INT_SPECIES; + static final VectorSpecies BYTE_SPECIES; + + static { + String species = System.getProperty("org.simdjson.species", "preferred"); + switch (species) { + case "preferred" -> { + BYTE_SPECIES = ByteVector.SPECIES_PREFERRED; + INT_SPECIES = IntVector.SPECIES_PREFERRED; + assertSupportForSpecies(BYTE_SPECIES); + assertSupportForSpecies(INT_SPECIES); + } + case "512" -> { + BYTE_SPECIES = ByteVector.SPECIES_512; + INT_SPECIES = IntVector.SPECIES_512; + } + case "256" -> { + BYTE_SPECIES = ByteVector.SPECIES_256; + INT_SPECIES = IntVector.SPECIES_256; + } + default -> throw new IllegalArgumentException("Unsupported vector species: " + species); + } + } + + private static void assertSupportForSpecies(VectorSpecies species) { + if (species.vectorShape() != VectorShape.S_256_BIT && species.vectorShape() != VectorShape.S_512_BIT) { + throw new IllegalArgumentException("Unsupported vector species: " + species); + } + } + + static ByteVector repeat(byte[] array) { + int n = BYTE_SPECIES.vectorByteSize() / 4; + byte[] result = new byte[n * array.length]; + for (int dst = 0; dst < result.length; dst += array.length) { + System.arraycopy(array, 0, result, dst, array.length); + } + return 
ByteVector.fromArray(BYTE_SPECIES, result, 0); + } +} diff --git a/src/test/java/org/simdjson/ArrayParsingTest.java b/src/test/java/org/simdjson/ArrayParsingTest.java index 5481569..ef738f2 100644 --- a/src/test/java/org/simdjson/ArrayParsingTest.java +++ b/src/test/java/org/simdjson/ArrayParsingTest.java @@ -11,7 +11,7 @@ import static org.assertj.core.api.Assertions.fail; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.toUtf8; +import static org.simdjson.testutils.TestUtils.toUtf8; import static org.simdjson.testutils.SimdJsonAssertions.assertThat; public class ArrayParsingTest { diff --git a/src/test/java/org/simdjson/ArraySchemaBasedParsingTest.java b/src/test/java/org/simdjson/ArraySchemaBasedParsingTest.java index e743b87..28e1f1f 100644 --- a/src/test/java/org/simdjson/ArraySchemaBasedParsingTest.java +++ b/src/test/java/org/simdjson/ArraySchemaBasedParsingTest.java @@ -21,9 +21,8 @@ import java.util.Set; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; import static org.simdjson.testutils.SimdJsonAssertions.assertThat; +import static org.simdjson.testutils.TestUtils.toUtf8; public class ArraySchemaBasedParsingTest { @@ -483,7 +482,7 @@ public void emptyJson() { public void passedLengthSmallerThanNullLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("null")); + byte[] json = toUtf8("null"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 3, Boolean[].class)); diff --git a/src/test/java/org/simdjson/BenchmarkCorrectnessTest.java b/src/test/java/org/simdjson/BenchmarkCorrectnessTest.java index 06a1719..4ab21f8 100644 --- a/src/test/java/org/simdjson/BenchmarkCorrectnessTest.java +++ b/src/test/java/org/simdjson/BenchmarkCorrectnessTest.java @@ -11,9 +11,8 @@ import java.util.Set; import static 
org.assertj.core.api.Assertions.assertThat; -import static org.simdjson.TestUtils.loadTestFile; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; +import static org.simdjson.testutils.TestUtils.loadTestFile; +import static org.simdjson.testutils.TestUtils.toUtf8PaddedWithSpaces; public class BenchmarkCorrectnessTest { @@ -74,7 +73,7 @@ public void numberParserTest(String input, Double expected) { // given Tape tape = new Tape(100); NumberParser numberParser = new NumberParser(); - byte[] numberUtf8Bytes = toUtf8(padWithSpaces(input)); + byte[] numberUtf8Bytes = toUtf8PaddedWithSpaces(input); // when numberParser.parseNumber(numberUtf8Bytes, 0, tape); diff --git a/src/test/java/org/simdjson/BlockReaderTest.java b/src/test/java/org/simdjson/BlockReaderTest.java deleted file mode 100644 index 1e587cf..0000000 --- a/src/test/java/org/simdjson/BlockReaderTest.java +++ /dev/null @@ -1,79 +0,0 @@ -package org.simdjson; - -import org.junit.jupiter.api.Test; - -import java.util.Arrays; - -import static org.assertj.core.api.Assertions.assertThat; - -public class BlockReaderTest { - - @Test - public void iterateOverEntireBuffer() { - // given - int stepSize = 64; - int fullBlockCount = 2; - byte[] buffer = new byte[fullBlockCount * stepSize + stepSize / 2]; - Arrays.fill(buffer, (byte) 'a'); - BlockReader reader = new BlockReader(stepSize); - reader.reset(buffer, buffer.length); - - // when / then - for (int i = 0; i < fullBlockCount; i++) { - assertThat(reader.hasFullBlock()).isTrue(); - assertThat(reader.getBlockIndex()).isEqualTo(i * stepSize); - reader.advance(); - assertThat(reader.getBlockIndex()).isEqualTo((i + 1) * stepSize); - } - assertThat(reader.hasFullBlock()).isFalse(); - byte[] remainder = reader.remainder(); - assertThat(remainder.length).isEqualTo(stepSize); - } - - @Test - public void lastBlockIsTreatedAsRemainder() { - // given - int stepSize = 64; - int blockCount = 2; - byte[] buffer = new byte[blockCount * 
stepSize]; - Arrays.fill(buffer, (byte) 'a'); - BlockReader reader = new BlockReader(stepSize); - reader.reset(buffer, buffer.length); - assertThat(reader.hasFullBlock()).isTrue(); - - // when - reader.advance(); - - // then - assertThat(reader.hasFullBlock()).isFalse(); - byte[] remainder = reader.remainder(); - assertThat(remainder.length).isEqualTo(stepSize); - for (int i = 0; i < remainder.length; i++) { - assertThat(remainder[i]).isEqualTo(buffer[i]); - } - } - - @Test - public void remainderShouldBeFilledWithSpaces() { - // given - int stepSize = 64; - byte[] buffer = new byte[stepSize / 2]; - Arrays.fill(buffer, (byte) 'a'); - BlockReader reader = new BlockReader(stepSize); - reader.reset(buffer, buffer.length); - assertThat(reader.hasFullBlock()).isFalse(); - - // when - byte[] remainder = reader.remainder(); - - // then - assertThat(remainder.length).isEqualTo(stepSize); - for (int i = 0; i < remainder.length; i++) { - if (i < buffer.length) { - assertThat(remainder[i]).isEqualTo(buffer[i]); - } else { - assertThat(remainder[i]).isEqualTo((byte) 0x20); - } - } - } -} diff --git a/src/test/java/org/simdjson/BooleanParsingTest.java b/src/test/java/org/simdjson/BooleanParsingTest.java index 37979c5..47d4d9e 100644 --- a/src/test/java/org/simdjson/BooleanParsingTest.java +++ b/src/test/java/org/simdjson/BooleanParsingTest.java @@ -8,9 +8,8 @@ import java.util.Iterator; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; import static org.simdjson.testutils.SimdJsonAssertions.assertThat; +import static org.simdjson.testutils.TestUtils.toUtf8; public class BooleanParsingTest { @@ -95,7 +94,7 @@ public void arrayOfBooleans() { public void passedLengthSmallerThanTrueLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("true")); + byte[] json = toUtf8("true"); // when JsonParsingException ex = 
assertThrows(JsonParsingException.class, () -> parser.parse(json, 3)); @@ -109,7 +108,7 @@ public void passedLengthSmallerThanTrueLength() { public void passedLengthSmallerThanFalseLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("false")); + byte[] json = toUtf8("false"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 4)); diff --git a/src/test/java/org/simdjson/BooleanSchemaBasedParsingTest.java b/src/test/java/org/simdjson/BooleanSchemaBasedParsingTest.java index 353a73f..033f7cf 100644 --- a/src/test/java/org/simdjson/BooleanSchemaBasedParsingTest.java +++ b/src/test/java/org/simdjson/BooleanSchemaBasedParsingTest.java @@ -17,8 +17,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; +import static org.simdjson.testutils.TestUtils.toUtf8; public class BooleanSchemaBasedParsingTest { @@ -552,7 +551,7 @@ public void emptyJson(Class expectedType) { public void passedLengthSmallerThanTrueLength(Class expectedType) { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("true")); + byte[] json = toUtf8("true"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 3, expectedType)); @@ -567,7 +566,7 @@ public void passedLengthSmallerThanTrueLength(Class expectedType) { public void passedLengthSmallerThanFalseLength(Class expectedType) { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("false")); + byte[] json = toUtf8("false"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 4, expectedType)); @@ -581,7 +580,7 @@ public void passedLengthSmallerThanFalseLength(Class expectedType) { public void 
passedLengthSmallerThanNullLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("null")); + byte[] json = toUtf8("null"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 3, Boolean.class)); diff --git a/src/test/java/org/simdjson/CharactersClassifierTest.java b/src/test/java/org/simdjson/CharactersClassifierTest.java deleted file mode 100644 index ce5a369..0000000 --- a/src/test/java/org/simdjson/CharactersClassifierTest.java +++ /dev/null @@ -1,70 +0,0 @@ -package org.simdjson; - -import org.junit.jupiter.api.Test; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.assertj.core.api.Assertions.assertThat; -import static org.simdjson.TestUtils.chunk; - -public class CharactersClassifierTest { - - @Test - public void classifiesOperators() { - // given - CharactersClassifier classifier = new CharactersClassifier(); - String str = "a{bc}1:2,3[efg]aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - - // when - JsonCharacterBlock block = classify(classifier, str); - - // then - assertThat(block.op()).isEqualTo(0x4552); - assertThat(block.whitespace()).isEqualTo(0); - } - - @Test - public void classifiesControlCharactersAsOperators() { - // given - CharactersClassifier classifier = new CharactersClassifier(); - String str = new String(new byte[] { - 'a', 'a', 'a', 0x1a, 'a', 0x0c, 'a', 'a', // 0x1a = , 0x0c = - 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', - 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', - 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', - 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', - 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', - 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', - 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a' - }, UTF_8); - - // when - JsonCharacterBlock block = classify(classifier, str); - - // then - assertThat(block.op()).isEqualTo(0x28); - assertThat(block.whitespace()).isEqualTo(0); - } - - @Test - public void classifiesWhitespaces() { - // given - 
CharactersClassifier classifier = new CharactersClassifier(); - String str = "a bc\t1\n2\r3efgaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - - // when - JsonCharacterBlock block = classify(classifier, str); - - // then - assertThat(block.whitespace()).isEqualTo(0x152); - assertThat(block.op()).isEqualTo(0); - } - - private JsonCharacterBlock classify(CharactersClassifier classifier, String str) { - return switch (StructuralIndexer.N_CHUNKS) { - case 1 -> classifier.classify(chunk(str, 0)); - case 2 -> classifier.classify(chunk(str, 0), chunk(str, 1)); - default -> throw new RuntimeException("Unsupported chunk count: " + StructuralIndexer.N_CHUNKS); - }; - } - -} diff --git a/src/test/java/org/simdjson/FloatingPointNumberSchemaBasedParsingTest.java b/src/test/java/org/simdjson/FloatingPointNumberSchemaBasedParsingTest.java index 75cc6ae..0315055 100644 --- a/src/test/java/org/simdjson/FloatingPointNumberSchemaBasedParsingTest.java +++ b/src/test/java/org/simdjson/FloatingPointNumberSchemaBasedParsingTest.java @@ -32,8 +32,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; +import static org.simdjson.testutils.TestUtils.toUtf8; public class FloatingPointNumberSchemaBasedParsingTest { @@ -1271,7 +1270,7 @@ public void emptyJson(Class expectedType) { public void passedLengthSmallerThanNullLength(Class expectedType) { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("null")); + byte[] json = toUtf8("null"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 3, expectedType)); @@ -1286,7 +1285,7 @@ public void passedLengthSmallerThanNullLength(Class expectedType) { public void passedLengthSmallerThanNumberLength(Class expectedType) { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = 
toUtf8(padWithSpaces("1.234")); + byte[] json = toUtf8("1.234"); // when Object value = parser.parse(json, 3, expectedType); diff --git a/src/test/java/org/simdjson/IntegralNumberSchemaBasedParsingTest.java b/src/test/java/org/simdjson/IntegralNumberSchemaBasedParsingTest.java index 041c725..12e0fb1 100644 --- a/src/test/java/org/simdjson/IntegralNumberSchemaBasedParsingTest.java +++ b/src/test/java/org/simdjson/IntegralNumberSchemaBasedParsingTest.java @@ -36,8 +36,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; +import static org.simdjson.testutils.TestUtils.toUtf8; public class IntegralNumberSchemaBasedParsingTest { @@ -753,7 +752,7 @@ public void emptyJson(Class expectedType) { public void passedLengthSmallerThanNullLength(Class expectedType) { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("null")); + byte[] json = toUtf8("null"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 3, expectedType)); @@ -768,7 +767,7 @@ public void passedLengthSmallerThanNullLength(Class expectedType) { public void passedLengthSmallerThanNumberLength(Class expectedType) { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("1234")); + byte[] json = toUtf8("1234"); // when Object value = parser.parse(json, 2, expectedType); diff --git a/src/test/java/org/simdjson/JsonStringScannerTest.java b/src/test/java/org/simdjson/JsonStringScannerTest.java deleted file mode 100644 index 9c23440..0000000 --- a/src/test/java/org/simdjson/JsonStringScannerTest.java +++ /dev/null @@ -1,141 +0,0 @@ -package org.simdjson; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -import static 
org.assertj.core.api.Assertions.assertThat; -import static org.simdjson.TestUtils.chunk; -import static org.simdjson.TestUtils.padWithSpaces; - -public class JsonStringScannerTest { - - @Test - public void testUnquotedString() { - // given - JsonStringScanner stringScanner = new JsonStringScanner(); - String str = padWithSpaces("abc 123"); - - // when - JsonStringBlock block = next(stringScanner, str); - - // then - assertThat(block.quote()).isEqualTo(0); - } - - @Test - public void testQuotedString() { - // given - JsonStringScanner stringScanner = new JsonStringScanner(); - String str = padWithSpaces("\"abc 123\""); - - // when - JsonStringBlock block = next(stringScanner, str); - - // then - assertThat(block.quote()).isEqualTo(0x101); - } - - @Test - public void testStartingQuotes() { - // given - JsonStringScanner stringScanner = new JsonStringScanner(); - String str = padWithSpaces("\"abc 123"); - - // when - JsonStringBlock block = next(stringScanner, str); - - // then - assertThat(block.quote()).isEqualTo(0x1); - } - - @Test - public void testQuotedStringSpanningMultipleBlocks() { - // given - JsonStringScanner stringScanner = new JsonStringScanner(); - String str0 = "abc \"a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 b0 b1 b2 b3 b4 b5 b6 b7 b8 b9"; - String str1 = " c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 d0 d1 d2 d3 d4 d5 d6 d7 d8 d\" def"; - - // when - JsonStringBlock firstBlock = next(stringScanner, str0); - JsonStringBlock secondBlock = next(stringScanner, str1); - - // then - assertThat(firstBlock.quote()).isEqualTo(0x10); - assertThat(secondBlock.quote()).isEqualTo(0x800000000000000L); - } - - @ParameterizedTest - @ValueSource(strings = { - "abc \\\"123", // abc \"123 - "abc \\\\\\\"123" // abc \\\"123 - }) - public void testEscapedQuote(String str) { - // given - JsonStringScanner stringScanner = new JsonStringScanner(); - String padded = padWithSpaces(str); - - // when - JsonStringBlock block = next(stringScanner, padded); - - // then - 
assertThat(block.quote()).isEqualTo(0); - } - - @Test - public void testEscapedQuoteSpanningMultipleBlocks() { - // given - JsonStringScanner stringScanner = new JsonStringScanner(); - String str0 = "a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 c0 \\"; - String str1 = padWithSpaces("\"def"); - - // when - JsonStringBlock firstBlock = next(stringScanner, str0); - JsonStringBlock secondBlock = next(stringScanner, str1); - - // then - assertThat(firstBlock.quote()).isEqualTo(0); - assertThat(secondBlock.quote()).isEqualTo(0); - } - - @ParameterizedTest - @ValueSource(strings = { - "abc \\\\\"123", // abc \\"123 - "abc \\\\\\\\\"123" // abc \\\\"123 - }) - public void testUnescapedQuote(String str) { - // given - JsonStringScanner stringScanner = new JsonStringScanner(); - String padded = padWithSpaces(str); - - // when - JsonStringBlock block = next(stringScanner, padded); - - // then - assertThat(block.quote()).isEqualTo(0x1L << str.indexOf('"')); - } - - @Test - public void testUnescapedQuoteSpanningMultipleBlocks() { - // given - JsonStringScanner stringScanner = new JsonStringScanner(); - String str0 = padWithSpaces("a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 c0 \\"); - String str1 = padWithSpaces("\\\"abc"); - - // when - JsonStringBlock firstBlock = next(stringScanner, str0); - JsonStringBlock secondBlock = next(stringScanner, str1); - - // then - assertThat(firstBlock.quote()).isEqualTo(0); - assertThat(secondBlock.quote()).isEqualTo(0x2); - } - - private JsonStringBlock next(JsonStringScanner scanner, String str) { - return switch (StructuralIndexer.N_CHUNKS) { - case 1 -> scanner.next(chunk(str, 0)); - case 2 -> scanner.next(chunk(str, 0), chunk(str, 1)); - default -> throw new RuntimeException("Unsupported chunk count: " + StructuralIndexer.N_CHUNKS); - }; - } -} diff --git a/src/test/java/org/simdjson/NullParsingTest.java b/src/test/java/org/simdjson/NullParsingTest.java index cc2cbc4..2345f04 100644 --- 
a/src/test/java/org/simdjson/NullParsingTest.java +++ b/src/test/java/org/simdjson/NullParsingTest.java @@ -8,8 +8,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; +import static org.simdjson.testutils.TestUtils.toUtf8; public class NullParsingTest { @@ -94,7 +93,7 @@ public void arrayOfNulls() { public void passedLengthSmallerThanNullLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("null")); + byte[] json = toUtf8("null"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 3)); diff --git a/src/test/java/org/simdjson/NumberParsingTest.java b/src/test/java/org/simdjson/NumberParsingTest.java index 1599add..2f7b64e 100644 --- a/src/test/java/org/simdjson/NumberParsingTest.java +++ b/src/test/java/org/simdjson/NumberParsingTest.java @@ -14,9 +14,8 @@ import java.util.Iterator; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; import static org.simdjson.testutils.SimdJsonAssertions.assertThat; +import static org.simdjson.testutils.TestUtils.toUtf8; public class NumberParsingTest { @@ -636,7 +635,7 @@ public void arrayOfNumbers() { public void passedLengthSmallerThanNumberLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("1234")); + byte[] json = toUtf8("1234"); // when JsonValue value = parser.parse(json, 2); diff --git a/src/test/java/org/simdjson/ObjectParsingTest.java b/src/test/java/org/simdjson/ObjectParsingTest.java index 3aa94c7..76bd3a0 100644 --- a/src/test/java/org/simdjson/ObjectParsingTest.java +++ b/src/test/java/org/simdjson/ObjectParsingTest.java @@ -7,7 +7,7 @@ import java.util.Map; import static 
org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.toUtf8; +import static org.simdjson.testutils.TestUtils.toUtf8; import static org.simdjson.testutils.SimdJsonAssertions.assertThat; public class ObjectParsingTest { diff --git a/src/test/java/org/simdjson/ObjectSchemaBasedParsingTest.java b/src/test/java/org/simdjson/ObjectSchemaBasedParsingTest.java index c19265c..071c77d 100644 --- a/src/test/java/org/simdjson/ObjectSchemaBasedParsingTest.java +++ b/src/test/java/org/simdjson/ObjectSchemaBasedParsingTest.java @@ -32,9 +32,8 @@ import java.util.List; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; import static org.simdjson.testutils.SimdJsonAssertions.assertThat; +import static org.simdjson.testutils.TestUtils.toUtf8; public class ObjectSchemaBasedParsingTest { @@ -595,7 +594,7 @@ public void emptyJson() { public void passedLengthSmallerThanNullLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("null")); + byte[] json = toUtf8("null"); // when JsonParsingException ex = assertThrows( diff --git a/src/test/java/org/simdjson/StringParsingTest.java b/src/test/java/org/simdjson/StringParsingTest.java index 580a5a7..5d80fa1 100644 --- a/src/test/java/org/simdjson/StringParsingTest.java +++ b/src/test/java/org/simdjson/StringParsingTest.java @@ -13,10 +13,9 @@ import static org.apache.commons.text.StringEscapeUtils.unescapeJava; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.fail; -import static org.simdjson.TestUtils.loadTestFile; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; import static org.simdjson.testutils.SimdJsonAssertions.assertThat; +import static org.simdjson.testutils.TestUtils.loadTestFile; +import static org.simdjson.testutils.TestUtils.toUtf8; 
public class StringParsingTest { @@ -264,7 +263,7 @@ public void arrayOfStrings() { public void passedLengthSmallerThanStringLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("\"aaaaa\"")); + byte[] json = toUtf8("\"aaaaa\""); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 6)); diff --git a/src/test/java/org/simdjson/StringSchemaBasedParsingTest.java b/src/test/java/org/simdjson/StringSchemaBasedParsingTest.java index 52ebcb0..7c771cd 100644 --- a/src/test/java/org/simdjson/StringSchemaBasedParsingTest.java +++ b/src/test/java/org/simdjson/StringSchemaBasedParsingTest.java @@ -27,8 +27,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.fail; -import static org.simdjson.TestUtils.padWithSpaces; -import static org.simdjson.TestUtils.toUtf8; +import static org.simdjson.testutils.TestUtils.toUtf8; public class StringSchemaBasedParsingTest { @@ -1331,7 +1330,7 @@ public void emptyJson(Class expectedType) { public void passedLengthSmallerThanNullLength(Class expectedType) { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("null")); + byte[] json = toUtf8("null"); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 3, expectedType)); @@ -1345,7 +1344,7 @@ public void passedLengthSmallerThanNullLength(Class expectedType) { public void passedLengthSmallerThanStringLength() { // given SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8(padWithSpaces("\"aaaaa\"")); + byte[] json = toUtf8("\"aaaaa\""); // when JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, 3, String.class)); diff --git a/src/test/java/org/simdjson/StructuralIndexerTest.java b/src/test/java/org/simdjson/StructuralIndexerTest.java new file 
mode 100644 index 0000000..3d65792 --- /dev/null +++ b/src/test/java/org/simdjson/StructuralIndexerTest.java @@ -0,0 +1,278 @@ +package org.simdjson; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.simdjson.testutils.TestUtils.toUtf8; + +public class StructuralIndexerTest { + + @Test + public void unquotedString() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + String input = "abc 123"; + + // when + indexer.index(toUtf8(input), len(input)); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(4); + assertThat(bitIndexes.isEnd()).isTrue(); + } + + @Test + public void quotedString() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + String input = "\"abc 123\""; + + // when + indexer.index(toUtf8(input), len(input)); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.isEnd()).isTrue(); + } + + @Test + public void unclosedString() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + String input = "\"abc 123"; + + // when + JsonParsingException ex = assertThrows( + JsonParsingException.class, + () -> indexer.index(toUtf8(input), len(input)) + ); + + // then + assertThat(ex) + .hasMessage("Unclosed string. 
A string is opened, but never closed."); + } + + @Test + public void quotedStringSpanningMultipleBlocks() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + String input = "abc \"a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 d0 d1 d2 d3 d4 d5 d6 d7 d8 d\" def"; + + // when + indexer.index(toUtf8(input), len(input)); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(4); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(125); + assertThat(bitIndexes.isEnd()).isTrue(); + } + + @ParameterizedTest + @ValueSource(strings = { + "abc \\\"123", // abc \"123 + "abc \\\\\\\"123" // abc \\\"123 + }) + public void escapedQuote(String input) { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + + // when + indexer.index(toUtf8(input), len(input)); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(4); + assertThat(bitIndexes.isEnd()).isTrue(); + } + + @Test + public void escapedQuoteSpanningMultipleBlocks() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + String input = "a0ba1ca2ca3ca4ca5ca6ca7ca8ca9cb0cb1cb2cb3cb4cb5cb6cb7cb8cb9cc0 \\\"def"; + + // when + indexer.index(toUtf8(input), len(input)); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(63); + assertThat(bitIndexes.isEnd()).isTrue(); + } + + @ParameterizedTest + @ValueSource(strings = { + "abc \\\\\"123", // abc \\"123 + "abc \\\\\\\\\"123" // abc \\\\"123 + }) + public void unescapedQuote(String input) { + // given + 
BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + + // when + JsonParsingException ex = assertThrows( + JsonParsingException.class, + () -> indexer.index(toUtf8(input), len(input)) + ); + + // then + assertThat(ex) + .hasMessage("Unclosed string. A string is opened, but never closed."); + } + + @Test + public void unescapedQuoteSpanningMultipleBlocks() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + String input = "a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 c0 \\\\\"abc"; + + // when + JsonParsingException ex = assertThrows( + JsonParsingException.class, + () -> indexer.index(toUtf8(input), len(input)) + ); + + // then + assertThat(ex) + .hasMessage("Unclosed string. A string is opened, but never closed."); + } + + @Test + public void operatorsClassification() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + String input = "a{bc}1:2,3[efg]aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + + // when + indexer.index(toUtf8(input), len(input)); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(1); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(2); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(4); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(5); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(6); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(7); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(8); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(9); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(10); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(11); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(14); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(15); + 
assertThat(bitIndexes.isEnd()).isTrue(); + } + + @Test + public void controlCharactersClassification() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + byte[] input = new byte[] { + 'a', 'a', 'a', 0x1a, 'a', 0x0c, 'a', 'a', // 0x1a = , 0x0c = + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a' + }; + + // when + indexer.index(input, input.length); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(3); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(4); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(5); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(6); + assertThat(bitIndexes.isEnd()).isTrue(); + } + + @Test + public void whitespacesClassification() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + String input = "a bc\t1\n2\r3efgaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + + // when + indexer.index(toUtf8(input), len(input)); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(2); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(5); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(7); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(9); + assertThat(bitIndexes.isEnd()).isTrue(); + } + + @ParameterizedTest + @ValueSource(strings = { + "aaaaaaaaaaaaaaa", // 120 bits + "aaaaaaaaaaaaaaaa", // 128 bits + "aaaaaaaaaaaaaaaaa", // 136 bits + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 248 bits + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 256 bits 
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 264 bits + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 504 bits + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 512 bits + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 520 bits + }) + public void inputLengthCloseToVectorWidth(String input) { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + + // when + indexer.index(toUtf8(input), len(input)); + + // then + assertThat(bitIndexes.isEnd()).isFalse(); + assertThat(bitIndexes.getAndAdvance()).isEqualTo(0); + assertThat(bitIndexes.isEnd()).isTrue(); + } + + @Test + public void emptyInput() { + // given + BitIndexes bitIndexes = new BitIndexes(1024); + StructuralIndexer indexer = new StructuralIndexer(bitIndexes); + + // when + indexer.index(toUtf8(""), 0); + + // then + assertThat(bitIndexes.isEnd()).isTrue(); + } + + private static int len(String input) { + return input.getBytes(UTF_8).length; + } +} diff --git a/src/test/java/org/simdjson/TestUtils.java b/src/test/java/org/simdjson/TestUtils.java deleted file mode 100644 index 8d63221..0000000 --- a/src/test/java/org/simdjson/TestUtils.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.simdjson; - -import jdk.incubator.vector.ByteVector; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; - -import static java.nio.charset.StandardCharsets.UTF_8; - -class TestUtils { - - static String padWithSpaces(String str) { - byte[] strBytes = toUtf8(str); - byte[] padded = new byte[strBytes.length + 64]; - Arrays.fill(padded, (byte) ' '); - System.arraycopy(strBytes, 0, padded, 0, strBytes.length); - return new String(padded, UTF_8); - } - - static ByteVector chunk(String str, int n) { - return ByteVector.fromArray(StructuralIndexer.BYTE_SPECIES, str.getBytes(UTF_8), n * StructuralIndexer.BYTE_SPECIES.vectorByteSize()); - } - - static byte[] toUtf8(String 
str) { - return str.getBytes(UTF_8); - } - - static byte[] loadTestFile(String name) throws IOException { - try (InputStream is = TestUtils.class.getResourceAsStream(name)) { - return is.readAllBytes(); - } - } -} diff --git a/src/test/java/org/simdjson/Utf8ValidationTest.java b/src/test/java/org/simdjson/Utf8ValidationTest.java new file mode 100644 index 0000000..d89734e --- /dev/null +++ b/src/test/java/org/simdjson/Utf8ValidationTest.java @@ -0,0 +1,449 @@ +package org.simdjson; + +import org.junit.jupiter.api.Test; +import org.simdjson.testutils.TestUtils; + +import java.io.IOException; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatCode; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.simdjson.testutils.TestUtils.toHexString; +import static org.simdjson.testutils.Utf8TestData.randomUtf8ByteArray; +import static org.simdjson.testutils.Utf8TestData.randomUtf8ByteArrayIncluding; +import static org.simdjson.testutils.Utf8TestData.randomUtf8ByteArrayEndedWith; +import static org.simdjson.testutils.Utf8TestData.utf8Sequences; + +public class Utf8ValidationTest { + + @Test + public void valid() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArray(); + + try { + // when + parser.parse(input, input.length); + } catch (JsonParsingException ex) { + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessageNotContaining("The input is not valid UTF-8"); + } + } + + @Test + public void invalidAscii() { + // given + SimdJsonParser parser = new SimdJsonParser(); + for (int invalidAsciiByte = 128; invalidAsciiByte <= 255; invalidAsciiByte++) { + byte[] input = randomUtf8ByteArrayIncluding((byte) invalidAsciiByte); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + 
assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + } + + @Test + public void continuationByteWithoutPrecedingLeadingByte() { + // given + SimdJsonParser parser = new SimdJsonParser(); + for (int continuationByte = 0b10_000000; continuationByte <= 0b10_111111; continuationByte++) { + byte[] input = randomUtf8ByteArrayIncluding((byte) continuationByte); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + } + + @Test + public void twoByteSequenceWithTwoContinuationBytes() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding( + (byte) 0b110_00010, + (byte) 0b10_000000, + (byte) 0b10_000000 + ); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void twoByteSequenceWithoutContinuationBytes() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding((byte) 0b110_00010); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void twoByteSequenceWithoutContinuationBytesAtTheEnd() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayEndedWith((byte) 0b110_00010); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> 
parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void threeByteSequenceWithThreeContinuationBytes() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding( + (byte) 0b1110_0000, + (byte) 0b10_100000, + (byte) 0b10_000000, + (byte) 0b10_000000 + ); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void threeByteSequenceWithOneContinuationByte() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding( + (byte) 0b1110_0000, + (byte) 0b10_100000 + ); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void threeByteSequenceWithoutContinuationBytes() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding((byte) 0b1110_0000); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void threeByteSequenceWithOneContinuationByteAtTheEnd() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayEndedWith( + (byte) 0b1110_0000, + (byte) 0b10_100000 + ); + + // when + JsonParsingException ex = 
assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void threeByteSequenceWithoutContinuationBytesAtTheEnd() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayEndedWith((byte) 0b1110_0000); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void fourByteSequenceWithFourContinuationBytes() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding( + (byte) 0b11110_000, + (byte) 0b10_010000, + (byte) 0b10_000000, + (byte) 0b10_000000, + (byte) 0b10_000000 + ); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void fourByteSequenceWithTwoContinuationBytes() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding( + (byte) 0b11110_000, + (byte) 0b10_010000, + (byte) 0b10_000000 + ); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void fourByteSequenceWithOneContinuationByte() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding( + (byte) 0b11110_000, + (byte) 
0b10_010000 + ); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void fourByteSequenceWithoutContinuationBytes() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayIncluding((byte) 0b11110_000); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void fourByteSequenceWithTwoContinuationBytesAtTheEnd() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayEndedWith( + (byte) 0b11110_000, + (byte) 0b10_010000, + (byte) 0b10_000000 + ); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void fourByteSequenceWithOneContinuationByteAtTheEnd() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayEndedWith( + (byte) 0b11110_000, + (byte) 0b10_010000 + ); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void fourByteSequenceWithoutContinuationBytesAtTheEnd() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = randomUtf8ByteArrayEndedWith((byte) 0b11110_000); + + 
// when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for input: %s.", toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + + @Test + public void overlongTwoByteSequence() { + // given + SimdJsonParser parser = new SimdJsonParser(); + List sequences = utf8Sequences(0x0000, 0x007F, 2); + + for (byte[] sequence : sequences) { + byte[] input = randomUtf8ByteArrayIncluding(sequence); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for sequence: %s and input: %s.", toHexString(sequence), toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + } + + @Test + public void overlongThreeByteSequence() { + // given + SimdJsonParser parser = new SimdJsonParser(); + List sequences = utf8Sequences(0x0000, 0x07FF, 3); + + for (byte[] sequence : sequences) { + byte[] input = randomUtf8ByteArrayIncluding(sequence); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for sequence: %s and input: %s.", toHexString(sequence), toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + } + + @Test + public void surrogateCodePoints() { + // given + SimdJsonParser parser = new SimdJsonParser(); + List sequences = utf8Sequences(0xD800, 0xDFFF, 3); + + for (byte[] sequence : sequences) { + byte[] input = randomUtf8ByteArrayIncluding(sequence); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for sequence: %s and input: %s.", toHexString(sequence), toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + 
} + } + + @Test + public void overlongFourByteSequence() { + // given + SimdJsonParser parser = new SimdJsonParser(); + List sequences = utf8Sequences(0x0000, 0xFFFF, 4); + + for (byte[] sequence : sequences) { + byte[] input = randomUtf8ByteArrayIncluding(sequence); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for sequence: %s and input: %s.", toHexString(sequence), toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + } + + @Test + public void tooLargeFourByteSequence() { + // given + SimdJsonParser parser = new SimdJsonParser(); + List sequences = utf8Sequences(0x110000, 0x110400, 4); + + for (byte[] sequence : sequences) { + byte[] input = randomUtf8ByteArrayIncluding(sequence); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .overridingErrorMessage("Failed for sequence: %s and input: %s.", toHexString(sequence), toHexString(input)) + .hasMessage("The input is not valid UTF-8"); + } + } + + @Test + public void validTestFile() throws IOException { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = TestUtils.loadTestFile("/nhkworld.json"); + + // when / then + assertThatCode(() -> parser.parse(input, input.length)).doesNotThrowAnyException(); + } + + @Test + public void invalidTestFile() throws IOException { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] input = TestUtils.loadTestFile("/malformed.txt"); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(input, input.length)); + + // then + assertThat(ex) + .hasMessage("The input is not valid UTF-8"); + } +} diff --git a/src/test/java/org/simdjson/Utf8ValidatorTest.java b/src/test/java/org/simdjson/Utf8ValidatorTest.java deleted file mode 100644 index 
995323b..0000000 --- a/src/test/java/org/simdjson/Utf8ValidatorTest.java +++ /dev/null @@ -1,496 +0,0 @@ -package org.simdjson; - -import jdk.incubator.vector.VectorSpecies; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -import java.io.IOException; -import java.util.Arrays; - -import static org.assertj.core.api.Assertions.*; - -class Utf8ValidatorTest { - private static final VectorSpecies VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES; - - - /* ASCII / 1 BYTE TESTS */ - - @Test - void validate_allEightBitValues_invalidAscii() { - byte[] invalidAscii = new byte[128]; - - int index = 0; - for (int eightBitVal = 255; eightBitVal >= 128; eightBitVal--) { - invalidAscii[index++] = (byte) eightBitVal; - } - - SimdJsonParser parser = new SimdJsonParser(); - for (int i = 0; i < 128; i += VECTOR_SPECIES.vectorByteSize()) { - byte[] vectorChunk = Arrays.copyOfRange(invalidAscii, i, i + VECTOR_SPECIES.vectorByteSize()); - - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length)) - .withMessage("Invalid UTF8"); - } - } - - - /* CONTINUATION BYTE TESTS */ - - // continuation byte is never valid without a preceding leader byte - @Test - void validate_continuationByteOutOfOrder_invalid() { - byte minContinuationByte = (byte) 0b10_000000; - byte maxContinuationByte = (byte) 0b10_111111; - byte[] inputBytes = new byte[64]; - int index = 0; - - byte continuationByte = minContinuationByte; - while (continuationByte <= maxContinuationByte) { - inputBytes[index++] = continuationByte; - continuationByte++; - } - - SimdJsonParser parser = new SimdJsonParser(); - for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.length()) { - byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize()); - - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> 
parser.parse(vectorChunk, vectorChunk.length)) - .withMessage("Invalid UTF8"); - } - } - - @Test - void validate_extraContinuationByte_2Byte_invalid() { - byte[] inputBytes = new byte[3]; - inputBytes[0] = (byte) 0b110_00010; - inputBytes[1] = (byte) 0b10_000000; - inputBytes[2] = (byte) 0b10_000000; // two byte lead should only have one continuation byte - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationOneByteTooShort_2Byte_invalid() { - byte[] inputBytes = new byte[1]; - inputBytes[0] = (byte) 0b110_00010; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_extraContinuationByte_3Byte_invalid() { - byte[] inputBytes = new byte[4]; - inputBytes[0] = (byte) 0b1110_0000; - inputBytes[1] = (byte) 0b10_100000; - inputBytes[2] = (byte) 0b10_000000; - inputBytes[3] = (byte) 0b10_000000; // three byte lead should only have two continuation bytes - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationOneByteTooShort_3Byte_invalid() { - byte[] inputBytes = new byte[2]; - inputBytes[0] = (byte) 0b1110_0000; - inputBytes[1] = (byte) 0b10_100000; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationTwoBytesTooShort_3Byte_invalid() { - byte[] inputBytes = new byte[1]; - inputBytes[0] = (byte) 0b1110_0000; - - SimdJsonParser parser = 
new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_extraContinuationByte_4Byte_invalid() { - byte[] inputBytes = new byte[5]; - inputBytes[0] = (byte) 0b11110_000; - inputBytes[1] = (byte) 0b10_010000; - inputBytes[2] = (byte) 0b10_000000; - inputBytes[3] = (byte) 0b10_000000; - inputBytes[4] = (byte) 0b10_000000; // four byte lead should only have three continuation bytes - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationOneByteTooShort_4Byte_invalid() { - byte[] inputBytes = new byte[3]; - inputBytes[0] = (byte) 0b11110_000; - inputBytes[1] = (byte) 0b10_010000; - inputBytes[2] = (byte) 0b10_000000; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationTwoBytesTooShort_4Byte_invalid() { - byte[] inputBytes = new byte[2]; - inputBytes[0] = (byte) 0b11110_000; - inputBytes[1] = (byte) 0b10_010000; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationThreeBytesTooShort_4Byte_invalid() { - byte[] inputBytes = new byte[1]; - inputBytes[0] = (byte) 0b11110_000; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - - /* 2 BYTE / LATIN TESTS */ - - @Test - void validate_overlong_2byte_invalid() { - 
byte minLeaderByte = (byte) 0b110_00000; - byte maxLeaderByte = (byte) 0b110_00001; - byte minContinuationByte = (byte) 0b10_000000; - byte maxContinuationByte = (byte) 0b10_111111; - - /* 7 bit code points in 2 byte utf8 is invalid - 2 to the power of 7 = 128 code points * 2 bytes = 256 bytes */ - byte[] inputBytes = new byte[256]; - int index = 0; - - byte leaderByte = minLeaderByte; - byte continuationByte = minContinuationByte; - while (leaderByte <= maxLeaderByte) { - inputBytes[index++] = leaderByte; - inputBytes[index++] = continuationByte; - if (continuationByte == maxContinuationByte) { - leaderByte++; - continuationByte = minContinuationByte; - } else { - continuationByte++; - } - } - - SimdJsonParser parser = new SimdJsonParser(); - for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.length()) { - byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize()); - - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length)) - .withMessage("Invalid UTF8"); - } - } - - - /* 3 BYTE / Asiatic TESTS */ - - /* first valid three byte character: 1110_0000 10_100000 10_000000 - anything smaller is invalid as it would fit into 11 bits (two byte utf8) */ - @Test - void validate_overlong_3Byte_allInvalid() { - byte minLeaderByte = (byte) 0b1110_0000; - byte firstValidContinuationByte = (byte) 0b10_100000; - byte minContinuationByte = (byte) 0b10_000000; - byte maxContinuationByte = (byte) 0b10_111111; - - // 2 to the power of 11 = 2048 code points * 3 bytes = 6144 - byte[] inputBytes = new byte[6144]; - int index = 0; - - byte firstContinuationByte = minContinuationByte; - byte secondContinuationByte = minContinuationByte; - while (firstContinuationByte < firstValidContinuationByte) { - inputBytes[index++] = minLeaderByte; - inputBytes[index++] = firstContinuationByte; - inputBytes[index++] = secondContinuationByte; - - if (secondContinuationByte == maxContinuationByte) { 
- secondContinuationByte = minContinuationByte; - firstContinuationByte++; - } else { - secondContinuationByte++; - } - } - - SimdJsonParser parser = new SimdJsonParser(); - for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.length()) { - byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize()); - - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length)) - .withMessage("Invalid UTF8"); - } - } - - /* code points in the range of U+D800 - U+DFFF (inclusive) are the surrogates for UTF-16. - These 2048 code points that are reserved for UTF-16 are disallowed in UTF-8 - 1101 1000 0000 0000 -> 1101 1111 1111 1111 */ - @Test - void validate_surrogateCodePoints_invalid() { - final byte leaderByte = (byte) 0b1101_1110; - final byte minContinuationByte = (byte) 0b10_000000; - final byte maxContinuationByte = (byte) 0b10_111111; - final byte minFirstContinuationByte = (byte) 0b10_100000; - - byte firstContinuationByte = minFirstContinuationByte; - byte secondContinuationByte = minContinuationByte; - - // 2048 invalid code points * 3 bytes = 6144 bytes - byte[] inputBytes = new byte[6144]; - int index = 0; - - while (firstContinuationByte <= maxContinuationByte) { - inputBytes[index++] = leaderByte; - inputBytes[index++] = firstContinuationByte; - inputBytes[index++] = secondContinuationByte; - - if (secondContinuationByte == maxContinuationByte) { - firstContinuationByte++; - secondContinuationByte = minContinuationByte; - } else { - secondContinuationByte++; - } - } - - SimdJsonParser parser = new SimdJsonParser(); - for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.vectorByteSize()) { - byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize()); - - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length)) - .withMessage("Invalid UTF8"); - } - } - - - /* 4 BYTE / 
Supplementary TESTS */ - - /* Overlong Test, the decoded character must be above U+FFFF / 11110_000 10_001111 10_111111 10_111111 */ - @Test - void validate_overlong_4Byte_allInvalid() { - byte leaderByte = (byte) 0b11110_000; - byte minContinuationByte = (byte) 0b10_000000; - byte maxContinuationByte = (byte) 0b10_111111; - byte maxFirstContinuationByte = (byte) 0b10_001111; - - // 2 to the power of 16 = 65536 valid code points * 4 bytes = 262144 bytes - byte[] inputBytes = new byte[262144]; - int index = 0; - - byte firstContinuationByte = minContinuationByte; - byte secondContinuationByte = minContinuationByte; - byte thirdContinuationByte = minContinuationByte; - while (firstContinuationByte <= maxFirstContinuationByte) { - inputBytes[index++] = leaderByte; - inputBytes[index++] = firstContinuationByte; - inputBytes[index++] = secondContinuationByte; - inputBytes[index++] = thirdContinuationByte; - - if (thirdContinuationByte == maxContinuationByte) { - if (secondContinuationByte == maxContinuationByte) { - firstContinuationByte++; - secondContinuationByte = minContinuationByte; - } else { - secondContinuationByte++; - } - thirdContinuationByte = minContinuationByte; - } else { - thirdContinuationByte++; - } - } - - SimdJsonParser parser = new SimdJsonParser(); - for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.vectorByteSize()) { - byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize()); - - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length)) - .withMessage("Invalid UTF8"); - } - } - - /* last valid four byte character: 11110_100 10_001111 10_111111 10_111111 - Any code point greater than U+10FFFF will result in a TOO_LARGE error */ - @Test - void validate_tooLarge_4Byte_allInvalid() { - byte minLeaderByte = (byte) 0b11110_100; - byte maxLeaderByte = (byte) 0b11111_111; - byte minContinuationByte = (byte) 0b10_000000; - byte maxContinuationByte = 
(byte) 0b10_111111; - byte minFirstContinuationByte = (byte) 0b10_010000; - - - byte leaderByte = minLeaderByte; - byte firstContinuationByte = minFirstContinuationByte; - byte secondContinuationByte = minContinuationByte; - byte thirdContinuationByte = minContinuationByte; - - int codePoints = 0x3FFFFF - 0x110000 + 1; - byte[] inputBytes = new byte[codePoints * 4]; - int index = 0; - - while (leaderByte <= maxLeaderByte) { - inputBytes[index++] = leaderByte; - inputBytes[index++] = firstContinuationByte; - inputBytes[index++] = secondContinuationByte; - inputBytes[index++] = thirdContinuationByte; - - if (thirdContinuationByte == maxContinuationByte) { - if (secondContinuationByte == maxContinuationByte) { - if (firstContinuationByte == maxContinuationByte) { - leaderByte++; - firstContinuationByte = minContinuationByte; - } else { - firstContinuationByte++; - } - secondContinuationByte = minContinuationByte; - } else { - secondContinuationByte++; - } - thirdContinuationByte = minContinuationByte; - } else { - thirdContinuationByte++; - } - } - - SimdJsonParser parser = new SimdJsonParser(); - for (int i = 0; i < inputBytes.length; i += VECTOR_SPECIES.vectorByteSize()) { - byte[] vectorChunk = Arrays.copyOfRange(inputBytes, i, i + VECTOR_SPECIES.vectorByteSize()); - - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(vectorChunk, vectorChunk.length)) - .withMessage("Invalid UTF8"); - } - } - - /* check that the data stream does not terminate with an incomplete code point - We just have to check that the last byte in the last vector is strictly smaller than 0xC0 (using an unsigned comparison) - that the second last byte is strictly smaller than 0xE0 - the third last byte is strictly smaller than 0xF0 */ - @Test - void validate_continuationOneByteTooShort_2Byte_eof_invalid() { - int vectorBytes = VECTOR_SPECIES.vectorByteSize(); - byte[] inputBytes = new byte[vectorBytes]; - inputBytes[vectorBytes - 1] = (byte) 0b110_00010; - - 
SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationOneByteTooShort_3Byte_eof_invalid() { - int vectorBytes = VECTOR_SPECIES.vectorByteSize(); - byte[] inputBytes = new byte[vectorBytes]; - inputBytes[vectorBytes - 2] = (byte) 0b1110_0000; - inputBytes[vectorBytes - 1] = (byte) 0b10_100000; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationTwoBytesTooShort_3Byte_eof_invalid() { - int vectorBytes = VECTOR_SPECIES.vectorByteSize(); - byte[] inputBytes = new byte[vectorBytes]; - inputBytes[vectorBytes - 1] = (byte) 0b1110_0000; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationOneByteTooShort_4Byte_eof_invalid() { - int vectorBytes = VECTOR_SPECIES.vectorByteSize(); - byte[] inputBytes = new byte[vectorBytes]; - inputBytes[vectorBytes - 3] = (byte) 0b11110_000; - inputBytes[vectorBytes - 2] = (byte) 0b10_010000; - inputBytes[vectorBytes - 1] = (byte) 0b10_000000; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationTwoBytesTooShort_4Byte_eof_invalid() { - int vectorBytes = VECTOR_SPECIES.vectorByteSize(); - byte[] inputBytes = new byte[vectorBytes]; - inputBytes[vectorBytes - 2] = (byte) 0b11110_000; - inputBytes[vectorBytes - 1] = (byte) 0b10_010000; - - SimdJsonParser parser = new SimdJsonParser(); - 
assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - @Test - void validate_continuationThreeBytesTooShort_4Byte_eof_invalid() { - int vectorBytes = VECTOR_SPECIES.vectorByteSize(); - byte[] inputBytes = new byte[vectorBytes]; - inputBytes[vectorBytes - 1] = (byte) 0b11110_000; - - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } - - - /* file tests */ - - @ParameterizedTest - @ValueSource(strings = {"/twitter.json", "/nhkworld.json"}) - void validate_utf8InputFiles_valid(String inputFilePath) throws IOException { - byte[] inputBytes = TestUtils.loadTestFile(inputFilePath); - SimdJsonParser parser = new SimdJsonParser(); - assertThatCode(() -> parser.parse(inputBytes, inputBytes.length)).doesNotThrowAnyException(); - } - - @Test - void validate_utf8InputFile_invalid() throws IOException { - byte[] inputBytes = TestUtils.loadTestFile("/malformed.txt"); - SimdJsonParser parser = new SimdJsonParser(); - assertThatExceptionOfType(JsonParsingException.class) - .isThrownBy(() -> parser.parse(inputBytes, inputBytes.length)) - .withMessage("Invalid UTF8"); - } -} \ No newline at end of file diff --git a/src/test/java/org/simdjson/testutils/TestUtils.java b/src/test/java/org/simdjson/testutils/TestUtils.java new file mode 100644 index 0000000..aba84c2 --- /dev/null +++ b/src/test/java/org/simdjson/testutils/TestUtils.java @@ -0,0 +1,40 @@ +package org.simdjson.testutils; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +import static java.nio.charset.StandardCharsets.UTF_8; + +public class TestUtils { + + public static byte[] toUtf8PaddedWithSpaces(String str) { + byte[] strBytes = toUtf8(str); + byte[] padded = new byte[strBytes.length + 64]; + Arrays.fill(padded, (byte) ' '); + 
System.arraycopy(strBytes, 0, padded, 0, strBytes.length); + return padded; + } + + public static byte[] toUtf8(String str) { + return str.getBytes(UTF_8); + } + + public static byte[] loadTestFile(String name) throws IOException { + try (InputStream is = TestUtils.class.getResourceAsStream(name)) { + return is.readAllBytes(); + } + } + + public static String toHexString(byte[] array) { + var sb = new StringBuilder("["); + for (int i = 0; i < array.length; i++) { + sb.append(String.format("%02X", array[i])); + if (i < array.length - 1) { + sb.append(" "); + } + } + sb.append("]"); + return sb.toString(); + } +} diff --git a/src/test/java/org/simdjson/testutils/Utf8TestData.java b/src/test/java/org/simdjson/testutils/Utf8TestData.java new file mode 100644 index 0000000..5d9d348 --- /dev/null +++ b/src/test/java/org/simdjson/testutils/Utf8TestData.java @@ -0,0 +1,62 @@ +package org.simdjson.testutils; + +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.commons.lang3.RandomUtils; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +public class Utf8TestData { + + /** + * Generates UTF-8 sequences from the provided range. Each sequence is of the given length. + * Note that when the length is greater than necessary for a given code point, this function + * produces sequences that are invalid UTF-8. This is a useful property when one wants to + * generate overlong encodings for testing purposes. 
+     */
+    public static List<byte[]> utf8Sequences(int from, int to, int length) {
+        // Fail with a descriptive message instead of NegativeArraySizeException /
+        // ArrayIndexOutOfBoundsException when there is no room for a leading byte.
+        if (length < 1) {
+            throw new IllegalArgumentException("length must be at least 1, got: " + length);
+        }
+        List<byte[]> result = new ArrayList<>();
+        for (int i = from; i <= to; i++) {
+            byte[] bytes = new byte[length];
+            int current = i;
+            // Continuation bytes are filled from the last one backwards: each carries the
+            // next six low-order bits of the code point behind the 10xxxxxx marker.
+            for (int byteIdx = length - 1; byteIdx >= 1; byteIdx--) {
+                bytes[byteIdx] = (byte) (0b1000_0000 | (current & 0b0011_1111));
+                current = current >>> 6;
+            }
+            // Leading byte: the arithmetic shift of 0x80000000 replicates the sign bit, so the
+            // low byte of the shifted value starts with `length` one-bits (0xC0, 0xE0, 0xF0 for
+            // lengths 2, 3, 4). The remaining high-order bits of the code point are OR-ed in.
+            // The mask is five bits wide for every length, so a code point too large for the
+            // requested length may spill into the marker bits; callers are expected to pass
+            // ranges that fit.
+            bytes[0] = (byte) ((0x80000000 >> (24 + length - 1)) | (current & 0b0001_1111));
+            result.add(bytes);
+        }
+        return result;
+    }
+
+    /**
+     * Returns the UTF-8 encoding of a random string of 1 to 1000 chars.
+     * The byte length of the result can be larger than the number of chars.
+     */
+    public static byte[] randomUtf8ByteArray() {
+        return randomUtf8ByteArray(1, 1000);
+    }
+
+    /**
+     * Returns a byte array consisting of random UTF-8 bytes, followed by {@code sequence},
+     * followed by more random UTF-8 bytes. The prefix and the suffix are each the encoding
+     * of a random string of 0 to 500 chars, so either of them may be empty.
+     *
+     * @param sequence bytes to embed verbatim; they are not validated in any way
+     * @return a new array: prefix + sequence + suffix
+     */
+    public static byte[] randomUtf8ByteArrayIncluding(byte... sequence) {
+        byte[] prefix = randomUtf8ByteArray(0, 500);
+        byte[] suffix = randomUtf8ByteArray(0, 500);
+        byte[] result = new byte[prefix.length + sequence.length + suffix.length];
+        System.arraycopy(prefix, 0, result, 0, prefix.length);
+        System.arraycopy(sequence, 0, result, prefix.length, sequence.length);
+        System.arraycopy(suffix, 0, result, prefix.length + sequence.length, suffix.length);
+        return result;
+    }
+
+    /**
+     * Returns a byte array consisting of random UTF-8 bytes (the encoding of a random string
+     * of 0 to 1000 chars, possibly empty) with {@code sequence} appended as the final bytes.
+     *
+     * @param sequence bytes to append verbatim; they are not validated in any way
+     * @return a new array: random prefix + sequence
+     */
+    public static byte[] randomUtf8ByteArrayEndedWith(byte... sequence) {
+        byte[] array = randomUtf8ByteArray(0, 1000);
+        byte[] result = new byte[array.length + sequence.length];
+        System.arraycopy(array, 0, result, 0, array.length);
+        System.arraycopy(sequence, 0, result, array.length, sequence.length);
+        return result;
+    }
+
+    /**
+     * Encodes a random string as UTF-8. The string length is drawn from
+     * {@code minChars..maxChars}; the {@code + 1} below assumes the upper bound of
+     * {@code RandomUtils.nextInt} is exclusive.
+     */
+    private static byte[] randomUtf8ByteArray(int minChars, int maxChars) {
+        int stringLen = RandomUtils.nextInt(minChars, maxChars + 1);
+        var string = RandomStringUtils.random(stringLen);
+        return string.getBytes(StandardCharsets.UTF_8);
+    }
+}