Don't pass/store len in structural_iterator · JavaScriptExpert/simdjson@58e2542 · GitHub
[go: up one dir, main page]

Skip to content

Commit 58e2542

Browse files
committed
Don't pass/store len in structural_iterator
1 parent 53408c9 commit 58e2542

File tree

2 files changed

+56
-67
lines changed

2 files changed

+56
-67
lines changed

src/generic/stage2/structural_iterator.h

Lines changed: 5 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@ namespace stage2 {
22

33
class structural_iterator {
44
public:
5-
really_inline structural_iterator(parser &_doc_parser, size_t _len)
5+
really_inline structural_iterator(parser &_doc_parser)
66
: doc_parser{_doc_parser},
77
buf{_doc_parser.parsing_buf},
8-
len{_len},
98
structural_indexes{_doc_parser.structural_indexes.get()}
109
{}
1110
really_inline char advance_char() {
@@ -23,39 +22,11 @@ class structural_iterator {
2322
really_inline const uint8_t* current() {
2423
return &buf[idx];
2524
}
26-
really_inline size_t remaining_len() {
27-
return len - idx;
25+
really_inline bool past_end() {
26+
return doc_parser.next_structural+1 > doc_parser.n_structural_indexes;
2827
}
29-
template<typename F>
30-
really_inline bool with_space_terminated_copy(const F& f) {
31-
/**
32-
* We need to make a copy to make sure that the string is space terminated.
33-
* This is not about padding the input, which should already be padded up
34-
* to len + SIMDJSON_PADDING. However, we have no control at this stage
35-
* on how the padding was done. What if the input string was padded with nulls?
36-
* It is quite common for an input string to have an extra null character (C string).
37-
* We do not want to allow 9\0 (where \0 is the null character) inside a JSON
38-
* document, but the string "9\0" by itself is fine. So we make a copy and
39-
* pad the input with spaces when we know that there is just one input element.
40-
* This copy is relatively expensive, but it will almost never be called in
41-
* practice unless you are in the strange scenario where you have many JSON
42-
* documents made of single atoms.
43-
*/
44-
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
45-
if (copy == nullptr) {
46-
return true;
47-
}
48-
memcpy(copy, buf, len);
49-
memset(copy + len, ' ', SIMDJSON_PADDING);
50-
bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
51-
free(copy);
52-
return result;
53-
}
54-
really_inline bool past_end(uint32_t n_structural_indexes) {
55-
return doc_parser.next_structural+1 > n_structural_indexes;
56-
}
57-
really_inline bool at_end(uint32_t n_structural_indexes) {
58-
return doc_parser.next_structural+1 == n_structural_indexes;
28+
really_inline bool at_end() {
29+
return doc_parser.next_structural+1 == doc_parser.n_structural_indexes;
5930
}
6031
really_inline bool at_beginning() {
6132
return doc_parser.next_structural == 0;
@@ -66,7 +37,6 @@ class structural_iterator {
6637

6738
parser &doc_parser;
6839
const uint8_t* const buf;
69-
const size_t len;
7040
const uint32_t* const structural_indexes;
7141
size_t idx{0}; // location of the structural character in the input (buf)
7242
uint8_t c{0}; // used to track the (structural) character we are looking at

src/generic/stage2/structural_parser.h

Lines changed: 51 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,7 @@ struct structural_parser {
3434
structural_iterator structurals;
3535
uint32_t depth;
3636

37-
really_inline structural_parser(
38-
size_t len,
39-
parser &_doc_parser
40-
) : structurals(_doc_parser, len), depth{0} {}
41-
42-
really_inline structural_parser(
43-
parser &_doc_parser,
44-
uint32_t _depth
45-
) : structurals(_doc_parser, 0), depth{_depth} {}
37+
really_inline structural_parser(parser &_doc_parser, uint32_t _depth=0) : structurals(_doc_parser), depth{_depth} {}
4638

4739
really_inline parser &doc_parser() {
4840
return structurals.doc_parser;
@@ -168,21 +160,21 @@ struct structural_parser {
168160
return false;
169161
}
170162

171-
WARN_UNUSED really_inline bool parse_root_atom() {
163+
WARN_UNUSED really_inline bool parse_root_atom(size_t len) {
172164
switch (structurals.current_char()) {
173165
case 't':
174166
log_value("true");
175-
if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
167+
if (!atomparsing::is_valid_true_atom(structurals.current(), remaining_len(len))) { return true; }
176168
write_tape(0, internal::tape_type::TRUE_VALUE);
177169
return false;
178170
case 'f':
179171
log_value("false");
180-
if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
172+
if (!atomparsing::is_valid_false_atom(structurals.current(), remaining_len(len))) { return true; }
181173
write_tape(0, internal::tape_type::FALSE_VALUE);
182174
return false;
183175
case 'n':
184176
log_value("null");
185-
if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
177+
if (!atomparsing::is_valid_null_atom(structurals.current(), remaining_len(len))) { return true; }
186178
write_tape(0, internal::tape_type::NULL_VALUE);
187179
return false;
188180
default:
@@ -212,7 +204,7 @@ struct structural_parser {
212204
}
213205
}
214206

215-
WARN_UNUSED really_inline bool parse_root_value() {
207+
WARN_UNUSED really_inline bool parse_root_value(size_t len) {
216208
// Parse the root value of the document. This is similar to parse_value(), but atoms and numbers
217209
// in particular get special treatment because those parsers normally rely on being in an object
218210
// or array (and thus the buffer having at least whitespace ] } or , after them).
@@ -224,18 +216,16 @@ struct structural_parser {
224216
case '"':
225217
return parse_string();
226218
case 't': case 'f': case 'n':
227-
return parse_root_atom();
219+
return parse_root_atom(len);
228220
case '0': case '1': case '2': case '3': case '4':
229221
case '5': case '6': case '7': case '8': case '9':
230-
return
231-
structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
232-
return parse_number(&copy[idx], false);
233-
});
222+
return with_space_terminated_copy(len, [&](const uint8_t *copy, size_t idx) {
223+
return parse_number(&copy[idx], false);
224+
});
234225
case '-':
235-
return
236-
structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
237-
return parse_number(&copy[idx], true);
238-
});
226+
return with_space_terminated_copy(len, [&](const uint8_t *copy, size_t idx) {
227+
return parse_number(&copy[idx], true);
228+
});
239229
default:
240230
log_error("Document starts with a non-value character");
241231
return true;
@@ -333,7 +323,7 @@ struct structural_parser {
333323

334324
WARN_UNUSED really_inline error_code finish() {
335325
// the string might not be NULL terminated.
336-
if ( !structurals.at_end(doc_parser().n_structural_indexes) ) {
326+
if ( !structurals.at_end() ) {
337327
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
338328
return on_error(TAPE_ERROR);
339329
}
@@ -418,6 +408,36 @@ struct structural_parser {
418408
return structurals.advance_char();
419409
}
420410

411+
really_inline size_t remaining_len(size_t len) {
412+
return len - structurals.idx;
413+
}
414+
415+
template<typename F>
416+
really_inline bool with_space_terminated_copy(size_t len, const F& f) {
417+
/**
418+
* We need to make a copy to make sure that the string is space terminated.
419+
* This is not about padding the input, which should already be padded up
420+
* to len + SIMDJSON_PADDING. However, we have no control at this stage
421+
* on how the padding was done. What if the input string was padded with nulls?
422+
* It is quite common for an input string to have an extra null character (C string).
423+
* We do not want to allow 9\0 (where \0 is the null character) inside a JSON
424+
* document, but the string "9\0" by itself is fine. So we make a copy and
425+
* pad the input with spaces when we know that there is just one input element.
426+
* This copy is relatively expensive, but it will almost never be called in
427+
* practice unless you are in the strange scenario where you have many JSON
428+
* documents made of single atoms.
429+
*/
430+
char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
431+
if (copy == nullptr) {
432+
return true;
433+
}
434+
memcpy(copy, doc_parser().parsing_buf, len);
435+
memset(copy + len, ' ', SIMDJSON_PADDING);
436+
bool result = f(reinterpret_cast<const uint8_t*>(copy), structurals.idx);
437+
free(copy);
438+
return result;
439+
}
440+
421441
really_inline void log_value(const char *type) {
422442
logger::log_line(structurals, "", type, "");
423443
}
@@ -442,8 +462,7 @@ struct structural_parser {
442462
};
443463

444464
struct streaming_structural_parser: structural_parser {
445-
really_inline streaming_structural_parser(size_t len, parser &_doc_parser)
446-
: structural_parser(len, _doc_parser) {}
465+
really_inline streaming_structural_parser(parser &_doc_parser) : structural_parser(_doc_parser) {}
447466

448467
// override to add streaming
449468
WARN_UNUSED really_inline error_code start(UNUSED size_t len) {
@@ -461,7 +480,7 @@ struct streaming_structural_parser: structural_parser {
461480

462481
// override to add streaming
463482
WARN_UNUSED really_inline error_code finish() {
464-
if ( structurals.past_end(doc_parser().n_structural_indexes) ) {
483+
if ( structurals.past_end() ) {
465484
log_error("IMPOSSIBLE: past the end of the JSON!");
466485
return on_error(TAPE_ERROR);
467486
}
@@ -470,7 +489,7 @@ struct streaming_structural_parser: structural_parser {
470489
log_error("Unclosed objects or arrays!");
471490
return on_error(TAPE_ERROR);
472491
}
473-
bool finished = structurals.at_end(doc_parser().n_structural_indexes);
492+
bool finished = structurals.at_end();
474493
if (!finished) { log_value("(and has more)"); }
475494
return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
476495
}
@@ -487,11 +506,11 @@ struct streaming_structural_parser: structural_parser {
487506
WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
488507
doc_parser.parsing_buf = buf;
489508
doc_parser.next_structural = 0;
490-
stage2::structural_parser parser(len, doc_parser);
509+
stage2::structural_parser parser(doc_parser);
491510
error_code result = parser.start(len);
492511
if (result) { return result; }
493512

494-
if (parser.parse_root_value()) {
513+
if (parser.parse_root_value(len)) {
495514
return parser.error();
496515
}
497516
return parser.finish();
@@ -504,11 +523,11 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
504523
WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
505524
doc_parser.parsing_buf = buf;
506525
doc_parser.next_structural = next_json;
507-
stage2::streaming_structural_parser parser(len, doc_parser);
526+
stage2::streaming_structural_parser parser(doc_parser);
508527
error_code result = parser.start(len);
509528
if (result) { return result; }
510529

511-
if (parser.parse_root_value()) {
530+
if (parser.parse_root_value(len)) {
512531
return parser.error();
513532
}
514533
next_json = doc_parser.next_structural;

0 commit comments

Comments
 (0)
0