@@ -34,15 +34,7 @@ struct structural_parser {
34
34
structural_iterator structurals;
35
35
uint32_t depth;
36
36
37
- really_inline structural_parser (
38
- size_t len,
39
- parser &_doc_parser
40
- ) : structurals(_doc_parser, len), depth{0 } {}
41
-
42
- really_inline structural_parser (
43
- parser &_doc_parser,
44
- uint32_t _depth
45
- ) : structurals(_doc_parser, 0 ), depth{_depth} {}
37
+ really_inline structural_parser (parser &_doc_parser, uint32_t _depth=0 ) : structurals(_doc_parser), depth{_depth} {}
46
38
47
39
really_inline parser &doc_parser () {
48
40
return structurals.doc_parser ;
@@ -168,21 +160,21 @@ struct structural_parser {
168
160
return false ;
169
161
}
170
162
171
- WARN_UNUSED really_inline bool parse_root_atom () {
163
+ WARN_UNUSED really_inline bool parse_root_atom (size_t len ) {
172
164
switch (structurals.current_char ()) {
173
165
case 't' :
174
166
log_value (" true" );
175
- if (!atomparsing::is_valid_true_atom (structurals.current (), structurals. remaining_len ())) { return true ; }
167
+ if (!atomparsing::is_valid_true_atom (structurals.current (), remaining_len (len ))) { return true ; }
176
168
write_tape (0 , internal::tape_type::TRUE_VALUE);
177
169
return false ;
178
170
case 'f' :
179
171
log_value (" false" );
180
- if (!atomparsing::is_valid_false_atom (structurals.current (), structurals. remaining_len ())) { return true ; }
172
+ if (!atomparsing::is_valid_false_atom (structurals.current (), remaining_len (len ))) { return true ; }
181
173
write_tape (0 , internal::tape_type::FALSE_VALUE);
182
174
return false ;
183
175
case 'n' :
184
176
log_value (" null" );
185
- if (!atomparsing::is_valid_null_atom (structurals.current (), structurals. remaining_len ())) { return true ; }
177
+ if (!atomparsing::is_valid_null_atom (structurals.current (), remaining_len (len ))) { return true ; }
186
178
write_tape (0 , internal::tape_type::NULL_VALUE);
187
179
return false ;
188
180
default :
@@ -212,7 +204,7 @@ struct structural_parser {
212
204
}
213
205
}
214
206
215
- WARN_UNUSED really_inline bool parse_root_value () {
207
+ WARN_UNUSED really_inline bool parse_root_value (size_t len ) {
216
208
// Parse the root value of the document. This is similar to parse_value(), but atoms and numbers
217
209
// in particular get special treatment because those parsers normally rely on being in an object
218
210
// or array (and thus the buffer having at least whitespace ] } or , after them).
@@ -224,18 +216,16 @@ struct structural_parser {
224
216
case '"' :
225
217
return parse_string ();
226
218
case 't' : case 'f' : case 'n' :
227
- return parse_root_atom ();
219
+ return parse_root_atom (len );
228
220
case '0' : case '1' : case '2' : case '3' : case '4' :
229
221
case '5' : case '6' : case '7' : case '8' : case '9' :
230
- return
231
- structurals.with_space_terminated_copy ([&](const uint8_t *copy, size_t idx) {
232
- return parse_number (&copy[idx], false );
233
- });
222
+ return with_space_terminated_copy (len, [&](const uint8_t *copy, size_t idx) {
223
+ return parse_number (&copy[idx], false );
224
+ });
234
225
case '-' :
235
- return
236
- structurals.with_space_terminated_copy ([&](const uint8_t *copy, size_t idx) {
237
- return parse_number (&copy[idx], true );
238
- });
226
+ return with_space_terminated_copy (len, [&](const uint8_t *copy, size_t idx) {
227
+ return parse_number (&copy[idx], true );
228
+ });
239
229
default :
240
230
log_error (" Document starts with a non-value character" );
241
231
return true ;
@@ -333,7 +323,7 @@ struct structural_parser {
333
323
334
324
WARN_UNUSED really_inline error_code finish () {
335
325
// the string might not be NULL terminated.
336
- if ( !structurals.at_end (doc_parser (). n_structural_indexes ) ) {
326
+ if ( !structurals.at_end () ) {
337
327
log_error (" More than one JSON value at the root of the document, or extra characters at the end of the JSON!" );
338
328
return on_error (TAPE_ERROR);
339
329
}
@@ -418,6 +408,36 @@ struct structural_parser {
418
408
return structurals.advance_char ();
419
409
}
420
410
411
+ really_inline size_t remaining_len (size_t len) {
412
+ return len - structurals.idx ;
413
+ }
414
+
415
+ template <typename F>
416
+ really_inline bool with_space_terminated_copy (size_t len, const F& f) {
417
+ /* *
418
+ * We need to make a copy to make sure that the string is space terminated.
419
+ * This is not about padding the input, which should already be padded up
420
+ * to len + SIMDJSON_PADDING. However, we have no control at this stage
421
+ * on how the padding was done. What if the input string was padded with nulls?
422
+ * It is quite common for an input string to have an extra null character (C string).
423
+ * We do not want to allow 9\0 (where \0 is the null character) inside a JSON
424
+ * document, but the string "9\0" by itself is fine. So we make a copy and
425
+ * pad the input with spaces when we know that there is just one input element.
426
+ * This copy is relatively expensive, but it will almost never be called in
427
+ * practice unless you are in the strange scenario where you have many JSON
428
+ * documents made of single atoms.
429
+ */
430
+ char *copy = static_cast <char *>(malloc (len + SIMDJSON_PADDING));
431
+ if (copy == nullptr ) {
432
+ return true ;
433
+ }
434
+ memcpy (copy, doc_parser ().parsing_buf , len);
435
+ memset (copy + len, ' ' , SIMDJSON_PADDING);
436
+ bool result = f (reinterpret_cast <const uint8_t *>(copy), structurals.idx );
437
+ free (copy);
438
+ return result;
439
+ }
440
+
421
441
really_inline void log_value (const char *type) {
422
442
logger::log_line (structurals, " " , type, " " );
423
443
}
@@ -442,8 +462,7 @@ struct structural_parser {
442
462
};
443
463
444
464
struct streaming_structural_parser : structural_parser {
445
- really_inline streaming_structural_parser (size_t len, parser &_doc_parser)
446
- : structural_parser(len, _doc_parser) {}
465
+ really_inline streaming_structural_parser (parser &_doc_parser) : structural_parser(_doc_parser) {}
447
466
448
467
// override to add streaming
449
468
WARN_UNUSED really_inline error_code start (UNUSED size_t len) {
@@ -461,7 +480,7 @@ struct streaming_structural_parser: structural_parser {
461
480
462
481
// override to add streaming
463
482
WARN_UNUSED really_inline error_code finish () {
464
- if ( structurals.past_end (doc_parser (). n_structural_indexes ) ) {
483
+ if ( structurals.past_end () ) {
465
484
log_error (" IMPOSSIBLE: past the end of the JSON!" );
466
485
return on_error (TAPE_ERROR);
467
486
}
@@ -470,7 +489,7 @@ struct streaming_structural_parser: structural_parser {
470
489
log_error (" Unclosed objects or arrays!" );
471
490
return on_error (TAPE_ERROR);
472
491
}
473
- bool finished = structurals.at_end (doc_parser (). n_structural_indexes );
492
+ bool finished = structurals.at_end ();
474
493
if (!finished) { log_value (" (and has more)" ); }
475
494
return on_success (finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
476
495
}
@@ -487,11 +506,11 @@ struct streaming_structural_parser: structural_parser {
487
506
WARN_UNUSED error_code implementation::stage2 (const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
488
507
doc_parser.parsing_buf = buf;
489
508
doc_parser.next_structural = 0 ;
490
- stage2::structural_parser parser (len, doc_parser);
509
+ stage2::structural_parser parser (doc_parser);
491
510
error_code result = parser.start (len);
492
511
if (result) { return result; }
493
512
494
- if (parser.parse_root_value ()) {
513
+ if (parser.parse_root_value (len )) {
495
514
return parser.error ();
496
515
}
497
516
return parser.finish ();
@@ -504,11 +523,11 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
504
523
WARN_UNUSED error_code implementation::stage2 (const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
505
524
doc_parser.parsing_buf = buf;
506
525
doc_parser.next_structural = next_json;
507
- stage2::streaming_structural_parser parser (len, doc_parser);
526
+ stage2::streaming_structural_parser parser (doc_parser);
508
527
error_code result = parser.start (len);
509
528
if (result) { return result; }
510
529
511
- if (parser.parse_root_value ()) {
530
+ if (parser.parse_root_value (len )) {
512
531
return parser.error ();
513
532
}
514
533
next_json = doc_parser.next_structural ;
0 commit comments